From b7af3a8c36426811762bf331e3938f9d67b7429e Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 2 Aug 2019 16:58:34 +0200 Subject: [SERVER] Make image->users atomic and get rid of some locking With this change it should be safe to read the users count of an image without locking first, assuming you already have a reference on the image or are otherwise sure it cannot be freed, i.e. in an active uplink. Updating users, or checking whether it's 0 in order to free the image should only be done while holding the imageListLock. --- src/server/globals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index b248800..73eb563 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -117,7 +117,7 @@ struct _dnbd3_image uint32_t masterCrc32; // CRC-32 of the crc-32 list int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al) int completenessEstimate; // Completeness estimate in percent - int users; // clients currently using this image + atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. 
Only unique in the context of this running instance of DNBD3-Server bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected uint16_t rid; // revision of image -- cgit v1.2.3-55-g7522 From 77499f086631d0f6eeb96a3e0391cf72eb40ff5e Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sat, 3 Aug 2019 16:35:02 +0200 Subject: [SERVER] Atomicize some global flags --- src/server/globals.h | 2 +- src/server/integrity.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 73eb563..7e5ff04 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -65,7 +65,7 @@ struct _dnbd3_connection dnbd3_host_t betterServer; // The better server uint8_t *recvBuffer; // Buffer for receiving payload uint32_t recvBufferLen; // Len of ^^ - volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop() + atomic_bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop() bool replicatedLastBlock; // bool telling if the last block has been replicated yet bool cycleDetected; // connection cycle between proxies detected for current remote server int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at diff --git a/src/server/integrity.c b/src/server/integrity.c index 8f17855..a66a364 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -29,7 +29,7 @@ static queue_entry checkQueue[CHECK_QUEUE_SIZE]; static pthread_mutex_t integrityQueueLock; static pthread_cond_t queueSignal; static int queueLen = -1; -static volatile bool bRunning = false; +static atomic_bool bRunning = false; static void* integrity_main(void *data); -- cgit v1.2.3-55-g7522 From 5dc776ac73be190daa2b2b8c3eb6042fdab4acda Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 6 Aug 2019 
14:06:27 +0200 Subject: [SERVER] uplink: Improve attaching to existing requests Allow attaching in ULR_PROCESSING state, leave lower slots empty to increase chances attaching to ULR_PROCESSING. --- src/server/globals.h | 12 ------- src/server/uplink.c | 97 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 35 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 7e5ff04..cd5ad7e 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -17,18 +17,6 @@ typedef struct _dnbd3_connection dnbd3_connection_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; -// Slot is free, can be used. -// Must only be set in uplink_handle_receive() or uplink_remove_client() -#define ULR_FREE 0 -// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. -// Must only be set in uplink_request() -#define ULR_NEW 1 -// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. -// Must only be set in uplink_mainloop() or uplink_request() -#define ULR_PENDING 2 -// Slot is being processed, do not consider for hop on. -// Must only be set in uplink_handle_receive() -#define ULR_PROCESSING 3 typedef struct { uint64_t handle; // Client defined handle to pass back in reply diff --git a/src/server/uplink.c b/src/server/uplink.c index f58b019..9f99fe4 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -21,6 +21,28 @@ #define REP_NONE ( (uint64_t)0xffffffffffffffff ) +// Status of request in queue + +// Slot is free, can be used. +// Must only be set in uplink_handle_receive() or uplink_remove_client() +#define ULR_FREE 0 +// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. 
+// Must only be set in uplink_request() +#define ULR_NEW 1 +// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. +// Must only be set in uplink_mainloop() or uplink_request() +#define ULR_PENDING 2 +// Slot is being processed, do not consider for hop on. +// Must only be set in uplink_handle_receive() +#define ULR_PROCESSING 3 + +static const char *const NAMES_ULR[4] = { + [ULR_FREE] = "ULR_FREE", + [ULR_NEW] = "ULR_NEW", + [ULR_PENDING] = "ULR_PENDING", + [ULR_PROCESSING] = "ULR_PROCESSING", +}; + static atomic_uint_fast64_t totalBytesReceived = 0; static void* uplink_mainloop(void *data); @@ -203,30 +225,37 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin int existingType = -1; // ULR_* type of existing request int i; int freeSlot = -1; + int firstUsedSlot = -1; bool requestLoop = false; const uint64_t end = start + length; mutex_lock( &uplink->queueLock ); mutex_unlock( &client->image->lock ); for (i = 0; i < uplink->queueLen; ++i) { - if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) { - freeSlot = i; + // find free slot to place this request into + if ( uplink->queue[i].status == ULR_FREE ) { + if ( freeSlot == -1 || existingType != ULR_PROCESSING ) { + freeSlot = i; + } continue; } - if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue; - if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) { - if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) { - requestLoop = true; - break; - } - if ( foundExisting == -1 || existingType == ULR_PENDING ) { - foundExisting = i; - existingType = uplink->queue[i].status; - if ( freeSlot != -1 ) break; - } + if ( firstUsedSlot == -1 ) { + firstUsedSlot = i; + } + // find existing request to attach to + if ( uplink->queue[i].from > start || uplink->queue[i].to < end ) + continue; // Range not suitable + // Detect potential proxy 
cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious + if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) { + requestLoop = true; + break; + } + if ( foundExisting == -1 || existingType == ULR_PROCESSING ) { + foundExisting = i; + existingType = uplink->queue[i].status; } } - if ( requestLoop ) { + if ( unlikely( requestLoop ) ) { mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); mutex_lock( &uplink->rttLock ); @@ -235,6 +264,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin signal_call( uplink->signal ); return false; } + if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { + freeSlot = -1; // Not attaching to existing request, make it use a higher slot + } if ( freeSlot == -1 ) { if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { mutex_unlock( &uplink->queueLock ); @@ -244,15 +276,17 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin freeSlot = uplink->queueLen++; } // Do not send request to uplink server if we have a matching pending request AND the request either has the - // status ULR_NEW OR we found a free slot with LOWER index than the one we attach to. Otherwise + // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise // explicitly send this request to the uplink server. The second condition mentioned here is to prevent // a race condition where the reply for the outstanding request already arrived and the uplink thread // is currently traversing the request queue. 
As it is processing the queue from highest to lowest index, it might // already have passed the index of the free slot we determined, but not reached the existing request we just found above. - if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request" + if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) { + foundExisting = -1; // -1 means "send request" + } #ifdef _DEBUG if ( foundExisting != -1 ) { - logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot ); + logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot ); logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n" "New %" PRIu64 "-%" PRIu64 " (%p)\n", uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client, @@ -265,7 +299,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin uplink->queue[freeSlot].handle = handle; uplink->queue[freeSlot].client = client; //int old = uplink->queue[freeSlot].status; - uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING); + uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW : + ( existingType == ULR_NEW ? 
ULR_PENDING : existingType ) ); uplink->queue[freeSlot].hopCount = hops; #ifdef _DEBUG timing_get( &uplink->queue[freeSlot].entered ); @@ -292,14 +327,25 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( !ret ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { + // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again + int state; mutex_lock( &uplink->queueLock ); - if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client && uplink->queue[freeSlot].status == ULR_NEW ) { - uplink->queue[freeSlot].status = ULR_PENDING; - logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { + state = uplink->queue[freeSlot].status; + if ( uplink->queue[freeSlot].status == ULR_NEW ) { + uplink->queue[freeSlot].status = ULR_PENDING; + } } else { - logadd( LOG_DEBUG2, "Weird queue update fail for direct uplink request" ); + state = -1; } mutex_unlock( &uplink->queueLock ); + if ( state == -1 ) { + logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. 
*shrug*" ); + } else if ( state == ULR_NEW ) { + logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + } else { + logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[state] ); + } return true; } // Fall through to waking up sender thread @@ -837,6 +883,11 @@ static void uplink_handleReceive(dnbd3_connection_t *link) } } // 2) Figure out which clients are interested in it + // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop + // below; this prevents uplink_request() from attaching to this request + // by populating a slot with index greater than the highest matching + // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW + // where it's fine if the index is greater) mutex_lock( &link->queueLock ); for (i = 0; i < link->queueLen; ++i) { dnbd3_queued_request_t * const req = &link->queue[i]; @@ -877,10 +928,10 @@ static void uplink_handleReceive(dnbd3_connection_t *link) bytesSent = (size_t)sent - sizeof outReply; } } - mutex_unlock( &client->sendMutex ); if ( bytesSent != 0 ) { client->bytesSent += bytesSent; } + mutex_unlock( &client->sendMutex ); mutex_lock( &link->queueLock ); } if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--; -- cgit v1.2.3-55-g7522 From 4e2e258dba3c9268e8d4fd061cbb9f291017ed2f Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 7 Aug 2019 14:39:44 +0200 Subject: [SERVER] Use more _Atomic --- src/server/globals.h | 6 +++--- src/server/net.c | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index cd5ad7e..86b8865 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -60,7 +60,7 @@ struct _dnbd3_connection // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; 
// Number of bytes received by the uplink since startup. - int queueLen; // length of queue + atomic_int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; }; @@ -107,7 +107,7 @@ struct _dnbd3_image int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server - bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected + atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected uint16_t rid; // revision of image pthread_mutex_t lock; }; @@ -116,7 +116,7 @@ struct _dnbd3_client { #define HOSTNAMELEN (48) atomic_uint_fast64_t bytesSent; // Byte counter for this client. 
- dnbd3_image_t *image; // Image in use by this client, or NULL during handshake + dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake int sock; bool isServer; // true if a server in proxy mode, false if real client dnbd3_host_t host; diff --git a/src/server/net.c b/src/server/net.c index c1fa6fa..92728c0 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -255,9 +255,8 @@ void* net_handleNewConnection(void *clientPtr) // No BGR mismatch, but don't lookup if image is unknown locally image = image_get( image_name, rid, true ); } - mutex_lock( &client->lock ); client->image = image; - mutex_unlock( &client->lock ); + atomic_thread_fence( memory_order_release ); if ( image == NULL ) { //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); } else if ( !image->working ) { -- cgit v1.2.3-55-g7522 From da0950ad342bae3b40a74bf82dba6c1f82e7eb57 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sun, 18 Aug 2019 21:31:56 +0200 Subject: [SERVER] uplink: More consistent type/variable naming * Change link to uplink everywhere * dnbd3_connection_t -> dnbd3_uplink_t --- src/server/altservers.c | 10 +- src/server/altservers.h | 4 +- src/server/globals.h | 12 +- src/server/uplink.c | 554 ++++++++++++++++++++++++------------------------ src/server/uplink.h | 2 +- 5 files changed, 294 insertions(+), 288 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 60c046c..1001981 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -14,7 +14,7 @@ #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) -static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; +static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; static dnbd3_signal_t * _Atomic runSignal = NULL; static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; @@ -121,7 +121,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate /** * ONLY called from the passed uplink's main thread */ -void altservers_findUplink(dnbd3_connection_t *uplink) +void altservers_findUplink(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; @@ -149,7 +149,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink) uplink->rttTestResult = RTT_INPROGRESS; for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { if ( pending[i] != NULL ) continue; - dnbd3_connection_t *null = NULL; + dnbd3_uplink_t *null = NULL; if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) { mutex_unlock( &uplink->rttLock ); atomic_thread_fence( memory_order_release ); @@ -167,7 +167,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink) * The given uplink is about to disappear, * wait until any pending RTT check is done. 
*/ -void altservers_removeUplink(dnbd3_connection_t *uplink) +void altservers_removeUplink(dnbd3_uplink_t *uplink) { assert( uplink != NULL ); assert( uplink->shutdown ); @@ -453,7 +453,7 @@ static void *altservers_main(void *data UNUSED) // Work your way through the queue atomic_thread_fence( memory_order_acquire ); for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) { - dnbd3_connection_t * const uplink = pending[itLink]; + dnbd3_uplink_t * const uplink = pending[itLink]; if ( uplink == NULL ) continue; // First, get 4 alt servers diff --git a/src/server/altservers.h b/src/server/altservers.h index 7b7b46d..e03b900 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -13,9 +13,9 @@ int altservers_load(); bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly); -void altservers_findUplink(dnbd3_connection_t *uplink); +void altservers_findUplink(dnbd3_uplink_t *uplink); -void altservers_removeUplink(dnbd3_connection_t *uplink); +void altservers_removeUplink(dnbd3_uplink_t *uplink); int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size); diff --git a/src/server/globals.h b/src/server/globals.h index 86b8865..0371e33 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -13,7 +13,7 @@ typedef struct timespec ticks; // ######### All structs/types used by the server ######## -typedef struct _dnbd3_connection dnbd3_connection_t; +typedef struct _dnbd3_uplink dnbd3_uplink_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; @@ -30,12 +30,18 @@ typedef struct uint8_t hopCount; // How many hops this request has already taken across proxies } dnbd3_queued_request_t; +typedef struct { + int fd; + int version; + dnbd3_host_t host; +} dnbd3_server_connection_t; + #define RTT_IDLE 0 // Not in progress #define RTT_INPROGRESS 1 // In progess, not finished #define RTT_DONTCHANGE 2 // Finished, but no better 
alternative found #define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd #define RTT_NOT_REACHABLE 4 // No uplink was reachable -struct _dnbd3_connection +struct _dnbd3_uplink { int fd; // socket fd to remote server int version; // remote server protocol version @@ -94,7 +100,7 @@ struct _dnbd3_image { char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) - dnbd3_connection_t *uplink; // pointer to a server connection + dnbd3_uplink_t *uplink; // pointer to a server connection uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk diff --git a/src/server/uplink.c b/src/server/uplink.c index 9570273..7d66b21 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -46,16 +46,16 @@ static const char *const NAMES_ULR[4] = { static atomic_uint_fast64_t totalBytesReceived = 0; static void* uplink_mainloop(void *data); -static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly); -static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex); -static void uplink_handleReceive(dnbd3_connection_t *link); +static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); +static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); +static void uplink_handleReceive(dnbd3_uplink_t *uplink); static int uplink_sendKeepalive(const int fd); -static void uplink_addCrc32(dnbd3_connection_t *uplink); -static void uplink_sendReplicationRequest(dnbd3_connection_t *link); -static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force); -static bool uplink_saveCacheMap(dnbd3_connection_t *link); -static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link); -static void 
uplink_connectionFailed(dnbd3_connection_t *link, bool findNew); +static void uplink_addCrc32(dnbd3_uplink_t *uplink); +static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); +static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); +static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); +static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); +static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); // ############ uplink connection handling @@ -76,7 +76,7 @@ uint64_t uplink_getTotalBytesReceived() bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version) { if ( !_isProxy || _shutdown ) return false; - dnbd3_connection_t *link = NULL; + dnbd3_uplink_t *uplink = NULL; assert( image != NULL ); mutex_lock( &image->lock ); if ( image->uplink != NULL && !image->uplink->shutdown ) { @@ -88,44 +88,44 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } - link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) ); - mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE ); - mutex_init( &link->rttLock, LOCK_UPLINK_RTT ); - mutex_init( &link->sendMutex, LOCK_UPLINK_SEND ); - link->image = image; - link->bytesReceived = 0; - link->idleTime = 0; - link->queueLen = 0; - mutex_lock( &link->sendMutex ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->cacheFd = -1; - link->signal = NULL; - link->replicationHandle = REP_NONE; - mutex_lock( &link->rttLock ); - link->cycleDetected = false; + uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE ); + mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT ); + mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); + uplink->image = image; + uplink->bytesReceived = 0; + uplink->idleTime = 0; + uplink->queueLen = 0; + mutex_lock( 
&uplink->sendMutex ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->cacheFd = -1; + uplink->signal = NULL; + uplink->replicationHandle = REP_NONE; + mutex_lock( &uplink->rttLock ); + uplink->cycleDetected = false; if ( sock >= 0 ) { - link->betterFd = sock; - link->betterServer = *host; - link->rttTestResult = RTT_DOCHANGE; - link->betterVersion = version; + uplink->betterFd = sock; + uplink->betterServer = *host; + uplink->rttTestResult = RTT_DOCHANGE; + uplink->betterVersion = version; } else { - link->betterFd = -1; - link->rttTestResult = RTT_IDLE; + uplink->betterFd = -1; + uplink->rttTestResult = RTT_IDLE; } - mutex_unlock( &link->rttLock ); - link->recvBufferLen = 0; - link->shutdown = false; - if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) { + mutex_unlock( &uplink->rttLock ); + uplink->recvBufferLen = 0; + uplink->shutdown = false; + if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) { logadd( LOG_ERROR, "Could not start thread for new uplink." 
); goto failure; } mutex_unlock( &image->lock ); return true; failure: ; - if ( link != NULL ) { - free( link ); - link = image->uplink = NULL; + if ( uplink != NULL ) { + free( uplink ); + uplink = image->uplink = NULL; } mutex_unlock( &image->lock ); return false; @@ -146,7 +146,7 @@ void uplink_shutdown(dnbd3_image_t *image) mutex_unlock( &image->lock ); return; } - dnbd3_connection_t * const uplink = image->uplink; + dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); if ( !uplink->shutdown ) { uplink->shutdown = true; @@ -170,7 +170,7 @@ void uplink_shutdown(dnbd3_image_t *image) * Remove given client from uplink request queue * Locks on: uplink.queueLock */ -void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client) +void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) { mutex_lock( &uplink->queueLock ); for (int i = uplink->queueLen - 1; i >= 0; --i) { @@ -203,7 +203,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); return false; } - dnbd3_connection_t * const uplink = client->image->uplink; + dnbd3_uplink_t * const uplink = client->image->uplink; if ( uplink->shutdown ) { mutex_unlock( &client->image->lock ); logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" ); @@ -370,7 +370,7 @@ static void* uplink_mainloop(void *data) #define EV_SOCKET (1) #define EV_COUNT (2) struct pollfd events[EV_COUNT]; - dnbd3_connection_t * const link = (dnbd3_connection_t*)data; + dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; int numSocks, i, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; uint32_t discoverFailCount = 0; @@ -381,31 +381,31 @@ static void* uplink_mainloop(void *data) timing_get( &nextAltCheck ); lastKeepalive = nextAltCheck; // - assert( link != NULL ); + assert( uplink != NULL ); setThreadName( "idle-uplink" ); blockNoncriticalSignals(); // Make sure file 
is open for writing - if ( !uplink_reopenCacheFd( link, false ) ) { + if ( !uplink_reopenCacheFd( uplink, false ) ) { // It might have failed - still offer proxy mode, we just can't cache - logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno ); + logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno ); } // - link->signal = signal_new(); - if ( link->signal == NULL ) { + uplink->signal = signal_new(); + if ( uplink->signal == NULL ) { logadd( LOG_WARNING, "error creating signal. Uplink unavailable." ); goto cleanup; } events[EV_SIGNAL].events = POLLIN; - events[EV_SIGNAL].fd = signal_getWaitFd( link->signal ); + events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal ); events[EV_SOCKET].fd = -1; - while ( !_shutdown && !link->shutdown ) { + while ( !_shutdown && !uplink->shutdown ) { // poll() - mutex_lock( &link->rttLock ); - waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1; + mutex_unlock( &uplink->rttLock ); if ( waitTime == 0 ) { // Nothing - } else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) { + } else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { waitTime = 1000; } else { declare_now; @@ -413,9 +413,9 @@ static void* uplink_mainloop(void *data) if ( waitTime < 100 ) waitTime = 100; if ( waitTime > 5000 ) waitTime = 5000; } - events[EV_SOCKET].fd = link->fd; + events[EV_SOCKET].fd = uplink->fd; numSocks = poll( events, EV_COUNT, waitTime ); - if ( _shutdown || link->shutdown ) goto cleanup; + if ( _shutdown || uplink->shutdown ) goto cleanup; if ( numSocks == -1 ) { // Error? 
if ( errno == EINTR ) continue; logadd( LOG_DEBUG1, "poll() error %d", (int)errno ); @@ -423,39 +423,39 @@ static void* uplink_mainloop(void *data) continue; } // Check if server switch is in order - mutex_lock( &link->rttLock ); - if ( link->rttTestResult != RTT_DOCHANGE ) { - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + if ( uplink->rttTestResult != RTT_DOCHANGE ) { + mutex_unlock( &uplink->rttLock ); } else { - link->rttTestResult = RTT_IDLE; + uplink->rttTestResult = RTT_IDLE; // The rttTest worker thread has finished our request. // And says it's better to switch to another server - const int fd = link->fd; - mutex_lock( &link->sendMutex ); - link->fd = link->betterFd; - mutex_unlock( &link->sendMutex ); - link->betterFd = -1; - link->currentServer = link->betterServer; - link->version = link->betterVersion; - link->cycleDetected = false; - mutex_unlock( &link->rttLock ); + const int fd = uplink->fd; + mutex_lock( &uplink->sendMutex ); + uplink->fd = uplink->betterFd; + mutex_unlock( &uplink->sendMutex ); + uplink->betterFd = -1; + uplink->currentServer = uplink->betterServer; + uplink->version = uplink->betterVersion; + uplink->cycleDetected = false; + mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); - link->replicationHandle = REP_NONE; - link->image->working = true; - link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received + uplink->replicationHandle = REP_NONE; + uplink->image->working = true; + uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { - logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 ); + if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { + logadd( LOG_DEBUG1, "(Uplink %s) Now connected to 
%s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } // If we don't have a crc32 list yet, see if the new server has one - if ( link->image->crc32 == NULL ) { - uplink_addCrc32( link ); + if ( uplink->image->crc32 == NULL ) { + uplink_addCrc32( uplink ); } // Re-send all pending requests - uplink_sendRequests( link, false ); - uplink_sendReplicationRequest( link ); + uplink_sendRequests( uplink, false ); + uplink_sendReplicationRequest( uplink ); events[EV_SOCKET].events = POLLIN | POLLRDHUP; timing_gets( &nextAltCheck, altCheckInterval ); // The rtt worker already did the handshake for our image, so there's nothing @@ -468,161 +468,161 @@ static void* uplink_mainloop(void *data) goto cleanup; } else if ( (events[EV_SIGNAL].revents & POLLIN) ) { // signal triggered -> pending requests - if ( signal_clear( link->signal ) == SIGNAL_ERROR ) { - logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name ); + if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) { + logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name ); } - if ( link->fd != -1 ) { + if ( uplink->fd != -1 ) { // Uplink seems fine, relay requests to it... 
- uplink_sendRequests( link, true ); + uplink_sendRequests( uplink, true ); } else { // No uplink; maybe it was shutdown since it was idle for too long - link->idleTime = 0; + uplink->idleTime = 0; } } // Uplink socket if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" ); setThreadName( "panic-uplink" ); } else if ( (events[EV_SOCKET].revents & POLLIN) ) { - uplink_handleReceive( link ); - if ( _shutdown || link->shutdown ) goto cleanup; + uplink_handleReceive( uplink ); + if ( _shutdown || uplink->shutdown ) goto cleanup; } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { lastKeepalive = now; - link->idleTime += timepassed; + uplink->idleTime += timepassed; unsavedSeconds += timepassed; - if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) { - // fsync/save every 4 minutes, or every 60 seconds if link is idle + if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) { + // fsync/save every 4 minutes, or every 60 seconds if uplink is idle unsavedSeconds = 0; - uplink_saveCacheMap( link ); + uplink_saveCacheMap( uplink ); } // Keep-alive - if ( link->fd != -1 && link->replicationHandle == REP_NONE ) { + if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( link->fd ) ) { + if ( uplink_sendKeepalive( uplink->fd ) ) { // Re-trigger periodically, in case it requires a minimum user count - uplink_sendReplicationRequest( link ); + uplink_sendReplicationRequest( uplink ); } else { - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" ); setThreadName( 
"panic-uplink" ); } } - // Don't keep link established if we're idle for too much - if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) { - mutex_lock( &link->sendMutex ); - close( link->fd ); - link->fd = events[EV_SOCKET].fd = -1; - mutex_unlock( &link->sendMutex ); - link->cycleDetected = false; - if ( link->recvBufferLen != 0 ) { - link->recvBufferLen = 0; - free( link->recvBuffer ); - link->recvBuffer = NULL; + // Don't keep uplink established if we're idle for too much + if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { + mutex_lock( &uplink->sendMutex ); + close( uplink->fd ); + uplink->fd = events[EV_SOCKET].fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->cycleDetected = false; + if ( uplink->recvBufferLen != 0 ) { + uplink->recvBufferLen = 0; + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; } - logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid ); + logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid ); setThreadName( "idle-uplink" ); } } // See if we should trigger an RTT measurement - mutex_lock( &link->rttLock ); - const int rttTestResult = link->rttTestResult; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + const int rttTestResult = uplink->rttTestResult; + mutex_unlock( &uplink->rttLock ); if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check - if ( image_isComplete( link->image ) ) { + if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete - logadd( LOG_INFO, "Replication of %s complete.", link->image->name ); + logadd( LOG_INFO, 
"Replication of %s complete.", uplink->image->name ); setThreadName( "finished-uplink" ); goto cleanup; - } else if ( !uplink_connectionShouldShutdown( link ) ) { + } else if ( !uplink_connectionShouldShutdown( uplink ) ) { // Not complete - do measurement - altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous) - if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = 0; + altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous) + if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = 0; } } altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX); timing_set( &nextAltCheck, &now, altCheckInterval ); } } else if ( rttTestResult == RTT_NOT_REACHABLE ) { - mutex_lock( &link->rttLock ); - link->rttTestResult = RTT_IDLE; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + uplink->rttTestResult = RTT_IDLE; + mutex_unlock( &uplink->rttLock ); discoverFailCount++; timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? 
altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG - if ( link->fd != -1 && !link->shutdown ) { + if ( uplink->fd != -1 && !uplink->shutdown ) { bool resend = false; ticks deadline; timing_set( &deadline, &now, -10 ); - mutex_lock( &link->queueLock ); - for (i = 0; i < link->queueLen; ++i) { - if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) { + mutex_lock( &uplink->queueLock ); + for (i = 0; i < uplink->queueLen; ++i) { + if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name, - link->queue[i].from, link->queue[i].to, link->queue[i].status ); - link->queue[i].entered = now; + "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name, + uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); + uplink->queue[i].entered = now; #ifdef _DEBUG_RESEND_STARVING - link->queue[i].status = ULR_NEW; + uplink->queue[i].status = ULR_NEW; resend = true; #endif - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "%s", buffer ); - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( resend ) - uplink_sendRequests( link, true ); + uplink_sendRequests( uplink, true ); } #endif } cleanup: ; - if ( !link->shutdown ) { - link->shutdown = true; - thread_detach( link->thread ); + if ( !uplink->shutdown ) { + uplink->shutdown = true; + thread_detach( uplink->thread ); } - altservers_removeUplink( link ); - uplink_saveCacheMap( link ); - mutex_lock( &link->image->lock ); - if ( link->image->uplink == link ) { - link->image->uplink = NULL; + altservers_removeUplink( uplink ); + 
uplink_saveCacheMap( uplink ); + mutex_lock( &uplink->image->lock ); + if ( uplink->image->uplink == uplink ) { + uplink->image->uplink = NULL; } - mutex_lock( &link->queueLock ); - const int fd = link->fd; - const dnbd3_signal_t* signal = link->signal; - mutex_lock( &link->sendMutex ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->signal = NULL; - // Do not access link->image after unlocking, since we set + mutex_lock( &uplink->queueLock ); + const int fd = uplink->fd; + const dnbd3_signal_t* signal = uplink->signal; + mutex_lock( &uplink->sendMutex ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->signal = NULL; + // Do not access uplink->image after unlocking, since we set // image->uplink to NULL. Acquire with image_lock first, // like done below when checking whether to re-init uplink - mutex_unlock( &link->image->lock ); - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->image->lock ); + mutex_unlock( &uplink->queueLock ); if ( fd != -1 ) close( fd ); if ( signal != NULL ) signal_close( signal ); // Wait for the RTT check to finish/fail if it's in progress - while ( link->rttTestResult == RTT_INPROGRESS ) + while ( uplink->rttTestResult == RTT_INPROGRESS ) usleep( 10000 ); - if ( link->betterFd != -1 ) { - close( link->betterFd ); + if ( uplink->betterFd != -1 ) { + close( uplink->betterFd ); } - mutex_destroy( &link->queueLock ); - mutex_destroy( &link->rttLock ); - mutex_destroy( &link->sendMutex ); - free( link->recvBuffer ); - link->recvBuffer = NULL; - if ( link->cacheFd != -1 ) { - close( link->cacheFd ); + mutex_destroy( &uplink->queueLock ); + mutex_destroy( &uplink->rttLock ); + mutex_destroy( &uplink->sendMutex ); + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; + if ( uplink->cacheFd != -1 ) { + close( uplink->cacheFd ); } - dnbd3_image_t *image = image_lock( link->image ); - free( link ); // !!! + dnbd3_image_t *image = image_lock( uplink->image ); + free( uplink ); // !!! 
if ( image != NULL ) { if ( !_shutdown && image->cache_map != NULL ) { // Ingegrity checker must have found something in the meantime @@ -633,37 +633,37 @@ static void* uplink_mainloop(void *data) return NULL ; } -static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly) +static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { // Scan for new requests int j; - mutex_lock( &link->queueLock ); - for (j = 0; j < link->queueLen; ++j) { - if ( link->queue[j].status != ULR_NEW && (newOnly || link->queue[j].status != ULR_PENDING) ) continue; - link->queue[j].status = ULR_PENDING; - uint8_t hops = link->queue[j].hopCount; - const uint64_t reqStart = link->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((link->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); + mutex_lock( &uplink->queueLock ); + for (j = 0; j < uplink->queueLen; ++j) { + if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue; + uplink->queue[j].status = ULR_PENDING; + uint8_t hops = uplink->queue[j].hopCount; + const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); /* logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)link, j, link->queue[j].status, link->queue[j].handle, link->queue[j].from, link->queue[j].to, reqStart, reqStart+reqSize ); + (void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); */ - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; - mutex_lock( &link->sendMutex ); - const bool ret = dnbd3_get_block( link->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( 
link->version, hops ) ); - mutex_unlock( &link->sendMutex ); + mutex_lock( &uplink->sendMutex ); + const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + mutex_unlock( &uplink->sendMutex ); if ( !ret ) { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &link->currentServer ); + altservers_serverFailed( &uplink->currentServer ); return; } - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); } /** @@ -676,13 +676,13 @@ static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly) * the code simpler. Worst case would be only one bit is zero, which means * 4kb are missing, but we will request 32kb. */ -static void uplink_sendReplicationRequest(dnbd3_connection_t *link) +static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( link == NULL || link->fd == -1 ) return; - if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication - if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE ) + if ( uplink == NULL || uplink->fd == -1 ) return; + if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication + if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) return; - dnbd3_image_t * const image = link->image; + dnbd3_image_t * const image = uplink->image; if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; mutex_lock( &image->lock ); if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { @@ -694,17 +694,17 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) const int lastBlockIndex = 
mapBytes - 1; int endByte; if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks - endByte = link->nextReplicationIndex + mapBytes; + endByte = uplink->nextReplicationIndex + mapBytes; } else { // Hashblock based: Only look for match in current hash block - endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; + endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; if ( endByte > mapBytes ) { endByte = mapBytes; } } int replicationIndex = -1; - for ( int j = link->nextReplicationIndex; j < endByte; ++j ) { + for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) { + if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { // Found incomplete one replicationIndex = i; break; @@ -713,31 +713,31 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) mutex_unlock( &image->lock ); if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte ); + replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); } if ( replicationIndex == -1 ) { // Replication might be complete, uplink_mainloop should take care.... 
- link->nextReplicationIndex = -1; + uplink->nextReplicationIndex = -1; return; } const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; - link->replicationHandle = offset; + uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); - mutex_lock( &link->sendMutex ); - bool sendOk = dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) ); - mutex_unlock( &link->sendMutex ); + mutex_lock( &uplink->sendMutex ); + bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) ); + mutex_unlock( &uplink->sendMutex ); if ( !sendOk ) { logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); return; } if ( replicationIndex == lastBlockIndex ) { - link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks } - link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter if ( _backgroundReplication == BGR_HASHBLOCK - && link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { // Just crossed a hash block boundary, look for new candidate starting at this very index - link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex ); + uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); } } @@ -746,18 +746,18 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) * of a hash block which is 
neither completely empty nor completely * replicated yet. Returns -1 if no match. */ -static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex) +static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; - mutex_lock( &link->image->lock ); - const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize ); - const uint8_t *cache_map = link->image->cache_map; + mutex_lock( &uplink->image->lock ); + const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); + const uint8_t *cache_map = uplink->image->cache_map; if ( cache_map != NULL ) { int j; const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); for (j = 0; j < mapBytes; ++j) { const int i = ( start + j ) % mapBytes; - const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock ); + const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); const bool isEmpty = cache_map[i] == 0; if ( !isEmpty && !isFull ) { // Neither full nor empty, replicate @@ -785,7 +785,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in retval = -1; } } - mutex_unlock( &link->image->lock ); + mutex_unlock( &uplink->image->lock ); return retval; } @@ -793,41 +793,41 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in * Receive data from uplink server and process/dispatch * Locks on: link.lock, images[].lock */ -static void uplink_handleReceive(dnbd3_connection_t *link) +static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; int ret, i; for (;;) { - ret = dnbd3_read_reply( link->fd, &inReply, false ); - if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue; + ret = dnbd3_read_reply( uplink->fd, &inReply, false ); + if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; if ( 
ret == REPLY_AGAIN ) break; if ( unlikely( ret == REPLY_CLOSED ) ) { - logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path ); + logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path ); goto error_cleanup; } if ( unlikely( ret == REPLY_WRONGMAGIC ) ) { - logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path ); + logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path ); goto error_cleanup; } if ( unlikely( ret != REPLY_OK ) ) { - logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path ); + logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path ); goto error_cleanup; } if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) { - logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path ); + logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path ); goto error_cleanup; } - if ( unlikely( link->recvBufferLen < inReply.size ) ) { - link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536); - link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen ); - if ( link->recvBuffer == NULL ) { + if ( unlikely( uplink->recvBufferLen < inReply.size ) ) { + uplink->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536); + uplink->recvBuffer = realloc( uplink->recvBuffer, uplink->recvBufferLen ); + if ( uplink->recvBuffer == NULL ) { logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" ); exit( 1 ); } } - if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) { - logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path ); + if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) 
) { + logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path ); goto error_cleanup; } // Payload read completely @@ -838,18 +838,18 @@ static void uplink_handleReceive(dnbd3_connection_t *link) const uint64_t start = inReply.handle; const uint64_t end = inReply.handle + inReply.size; totalBytesReceived += inReply.size; - link->bytesReceived += inReply.size; + uplink->bytesReceived += inReply.size; // 1) Write to cache file - if ( unlikely( link->cacheFd == -1 ) ) { - uplink_reopenCacheFd( link, false ); + if ( unlikely( uplink->cacheFd == -1 ) ) { + uplink_reopenCacheFd( uplink, false ); } - if ( likely( link->cacheFd != -1 ) ) { + if ( likely( uplink->cacheFd != -1 ) ) { int err = 0; bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid uint32_t done = 0; ret = 0; while ( done < inReply.size ) { - ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done ); + ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done ); if ( unlikely( ret == -1 ) ) { err = errno; if ( err == EINTR ) continue; @@ -860,26 +860,26 @@ static void uplink_handleReceive(dnbd3_connection_t *link) continue; // Success, retry write } if ( err == EBADF || err == EINVAL || err == EIO ) { - if ( !tryAgain || !uplink_reopenCacheFd( link, true ) ) + if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) ) break; tryAgain = false; continue; // Write handle to image successfully re-opened, try again } - logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err ); + logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err ); break; } if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) { - logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid ); + 
logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid ); break; } done += (uint32_t)ret; } if ( likely( done > 0 ) ) { - image_updateCachemap( link->image, start, start + done, true ); + image_updateCachemap( uplink->image, start, start + done, true ); } if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) { logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.", - link->image->name, (int)link->image->rid, err ); + uplink->image->name, (int)uplink->image->rid, err ); } } // 2) Figure out which clients are interested in it @@ -888,9 +888,9 @@ static void uplink_handleReceive(dnbd3_connection_t *link) // by populating a slot with index greater than the highest matching // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW // where it's fine if the index is greater) - mutex_lock( &link->queueLock ); - for (i = 0; i < link->queueLen; ++i) { - dnbd3_queued_request_t * const req = &link->queue[i]; + mutex_lock( &uplink->queueLock ); + for (i = 0; i < uplink->queueLen; ++i) { + dnbd3_queued_request_t * const req = &uplink->queue[i]; assert( req->status != ULR_PROCESSING ); if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue; assert( req->client != NULL ); @@ -903,8 +903,8 @@ static void uplink_handleReceive(dnbd3_connection_t *link) // from 0, you also need to change the "attach to existing request"-logic in uplink_request() outReply.magic = dnbd3_packet_magic; bool served = false; - for ( i = link->queueLen - 1; i >= 0; --i ) { - dnbd3_queued_request_t * const req = &link->queue[i]; + for ( i = uplink->queueLen - 1; i >= 0; --i ) { + dnbd3_queued_request_t * const req = &uplink->queue[i]; if ( req->status == ULR_PROCESSING ) { size_t bytesSent = 0; assert( req->from >= start && req->to <= end ); @@ -914,14 +914,14 @@ static void uplink_handleReceive(dnbd3_connection_t *link) outReply.size = (uint32_t)( 
req->to - req->from ); iov[0].iov_base = &outReply; iov[0].iov_len = sizeof outReply; - iov[1].iov_base = link->recvBuffer + (req->from - start); + iov[1].iov_base = uplink->recvBuffer + (req->from - start); iov[1].iov_len = outReply.size; fixup_reply( outReply ); req->status = ULR_FREE; req->client = NULL; served = true; mutex_lock( &client->sendMutex ); - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( client->sock != -1 ) { ssize_t sent = writev( client->sock, iov, 2 ); if ( sent > (ssize_t)sizeof outReply ) { @@ -932,66 +932,66 @@ static void uplink_handleReceive(dnbd3_connection_t *link) client->bytesSent += bytesSent; } mutex_unlock( &client->sendMutex ); - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } - if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--; + if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG - if ( !served && start != link->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end ); + if ( !served && start != uplink->replicationHandle ) { + logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name, start, end ); } #endif - if ( start == link->replicationHandle ) { + if ( start == uplink->replicationHandle ) { // Was our background replication - link->replicationHandle = REP_NONE; + uplink->replicationHandle = REP_NONE; // Try to remove from fs cache if no client was interested in this data - if ( !served && link->cacheFd != -1 ) { - posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + if ( !served && uplink->cacheFd != -1 ) { + posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); } } if ( served ) { // Was some client -- reset idle counter - link->idleTime = 0; + 
uplink->idleTime = 0; // Re-enable replication if disabled - if ( link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; + if ( uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; } } } - if ( link->replicationHandle == REP_NONE ) { - mutex_lock( &link->queueLock ); - const bool rep = ( link->queueLen == 0 ); - mutex_unlock( &link->queueLock ); - if ( rep ) uplink_sendReplicationRequest( link ); + if ( uplink->replicationHandle == REP_NONE ) { + mutex_lock( &uplink->queueLock ); + const bool rep = ( uplink->queueLen == 0 ); + mutex_unlock( &uplink->queueLock ); + if ( rep ) uplink_sendReplicationRequest( uplink ); } return; // Error handling from failed receive or message parsing error_cleanup: ; - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); } -static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew) +static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { - if ( link->fd == -1 ) + if ( uplink->fd == -1 ) return; - altservers_serverFailed( &link->currentServer ); - mutex_lock( &link->sendMutex ); - close( link->fd ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->replicationHandle = REP_NONE; - if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = 0; + altservers_serverFailed( &uplink->currentServer ); + mutex_lock( &uplink->sendMutex ); + close( uplink->fd ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->replicationHandle = REP_NONE; + if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = 0; } if ( !findNew ) return; - mutex_lock( &link->rttLock ); - bool bail = link->rttTestResult == RTT_INPROGRESS || link->betterFd != -1; - mutex_unlock( &link->rttLock ); + 
mutex_lock( &uplink->rttLock ); + bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1; + mutex_unlock( &uplink->rttLock ); if ( bail ) return; - altservers_findUplink( link ); + altservers_findUplink( uplink ); } /** @@ -1008,7 +1008,7 @@ static int uplink_sendKeepalive(const int fd) return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); } -static void uplink_addCrc32(dnbd3_connection_t *uplink) +static void uplink_addCrc32(dnbd3_uplink_t *uplink) { dnbd3_image_t *image = uplink->image; if ( image == NULL || image->virtualFilesize == 0 ) return; @@ -1051,14 +1051,14 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink) * it will be closed first. Otherwise, nothing will happen and true will be returned * immediately. */ -static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force) +static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) { - if ( link->cacheFd != -1 ) { + if ( uplink->cacheFd != -1 ) { if ( !force ) return true; - close( link->cacheFd ); + close( uplink->cacheFd ); } - link->cacheFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 ); - return link->cacheFd != -1; + uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 ); + return uplink->cacheFd != -1; } /** @@ -1066,13 +1066,13 @@ static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force) * Return true on success. * Locks on: imageListLock, image.lock */ -static bool uplink_saveCacheMap(dnbd3_connection_t *link) +static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) { - dnbd3_image_t *image = link->image; + dnbd3_image_t *image = uplink->image; assert( image != NULL ); - if ( link->cacheFd != -1 ) { - if ( fsync( link->cacheFd ) == -1 ) { + if ( uplink->cacheFd != -1 ) { + if ( fsync( uplink->cacheFd ) == -1 ) { // A failing fsync means we have no guarantee that any data // since the last fsync (or open if none) has been saved. 
Apart // from keeping the cache_map from the last successful fsync @@ -1134,9 +1134,9 @@ static bool uplink_saveCacheMap(dnbd3_connection_t *link) return true; } -static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link) +static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { - return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT - && ( _backgroundReplication != BGR_FULL || _bgrMinClients > link->image->users ) ); + return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT + && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) ); } diff --git a/src/server/uplink.h b/src/server/uplink.h index 2b41dfc..4fd41b0 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -10,7 +10,7 @@ uint64_t uplink_getTotalBytesReceived(); bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version); -void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client); +void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); -- cgit v1.2.3-55-g7522 From 1d2295131020688b5a688286ce8c53d6bb7abdb8 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sun, 18 Aug 2019 21:59:26 +0200 Subject: [SERVER] Add struct representing active connection to uplink server --- src/server/altservers.c | 30 +++++++++---------- src/server/globals.h | 14 ++++----- src/server/image.c | 2 +- src/server/integrity.c | 2 +- src/server/uplink.c | 78 ++++++++++++++++++++++++------------------------- 5 files changed, 60 insertions(+), 66 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 1001981..fbe10a8 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -125,14 +125,14 @@ void altservers_findUplink(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; - if ( uplink->fd != -1 && numAltServers <= 1 ) + if ( 
uplink->current.fd != -1 && numAltServers <= 1 ) return; int i; // if betterFd != -1 it means the uplink is supposed to switch to another // server. As this function here is called by the uplink thread, it can // never be that the uplink is supposed to switch, but instead calls // this function. - assert( uplink->betterFd == -1 ); + assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress // XXX As this function is only ever called by the image's uplink thread, @@ -457,9 +457,9 @@ static void *altservers_main(void *data UNUSED) if ( uplink == NULL ) continue; // First, get 4 alt servers - numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 ); + numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 ); // If we're already connected and only got one server anyways, there isn't much to do - if ( numAlts <= 1 && uplink->fd != -1 ) { + if ( numAlts <= 1 && uplink->current.fd != -1 ) { uplink->rttTestResult = RTT_DONTCHANGE; continue; } @@ -475,15 +475,15 @@ static void *altservers_main(void *data UNUSED) } LOG( LOG_DEBUG2, "[%d] Running alt check", itLink ); assert( uplink->rttTestResult == RTT_INPROGRESS ); - if ( uplink->fd != -1 ) { + if ( uplink->current.fd != -1 ) { // Add current server if not already in list found = false; for (itAlt = 0; itAlt < numAlts; ++itAlt) { - if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue; + if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue; found = true; break; } - if ( !found ) servers[numAlts++] = uplink->currentServer; + if ( !found ) servers[numAlts++] = uplink->current.host; } // Test them all int bestSock = -1; @@ -537,7 +537,7 @@ static void *altservers_main(void *data UNUSED) // Measurement done - everything fine so far mutex_lock( &altServersLock ); mutex_lock( &uplink->rttLock ); - const bool isCurrent = 
isSameAddressPort( &servers[itAlt], &uplink->currentServer ); + const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host ); // Penaltize rtt if this was a cycle; this will treat this server with lower priority // in the near future too, so we prevent alternating between two servers that are both // part of a cycle and have the lowest latency. @@ -547,9 +547,9 @@ static void *altservers_main(void *data UNUSED) unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); mutex_unlock( &altServersLock ); // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time - if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; + if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; mutex_unlock( &uplink->rttLock ); - if ( uplink->fd != -1 && isCurrent ) { + if ( uplink->current.fd != -1 && isCurrent ) { // Was measuring current server currentRtt = avg; close( sock ); @@ -574,18 +574,18 @@ static void *altservers_main(void *data UNUSED) close( sock ); } // Done testing all servers. 
See if we should switch - if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { + if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { // yep - if ( currentRtt > 10000000 || uplink->fd == -1 ) { + if ( currentRtt > 10000000 || uplink->current.fd == -1 ) { LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); } else { LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); } sock_setTimeout( bestSock, _uplinkTimeout ); mutex_lock( &uplink->rttLock ); - uplink->betterFd = bestSock; - uplink->betterServer = servers[bestIndex]; - uplink->betterVersion = bestProtocolVersion; + uplink->better.fd = bestSock; + uplink->better.host = servers[bestIndex]; + uplink->better.version = bestProtocolVersion; uplink->rttTestResult = RTT_DOCHANGE; mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); diff --git a/src/server/globals.h b/src/server/globals.h index 0371e33..659e5a2 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -31,9 +31,9 @@ typedef struct } dnbd3_queued_request_t; typedef struct { - int fd; - int version; - dnbd3_host_t host; + int fd; // Socket fd for this connection + int version; // Protocol version of remote server + dnbd3_host_t host; // IP/Port of remote server } dnbd3_server_connection_t; #define RTT_IDLE 0 // Not in progress @@ -43,20 +43,16 @@ typedef struct { #define RTT_NOT_REACHABLE 4 // No uplink was reachable struct _dnbd3_uplink { - int fd; // socket fd to remote server - int version; // remote server protocol version + dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected + dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none dnbd3_signal_t* signal; // used to wake up the process pthread_t thread; // thread holding the connection pthread_mutex_t sendMutex; // For 
locking socket while sending pthread_mutex_t queueLock; // lock for synchronization on request queue etc. dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer - dnbd3_host_t currentServer; // Current server we're connected to pthread_mutex_t rttLock; // When accessing rttTestResult, betterFd or betterServer int rttTestResult; // RTT_* int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD! - int betterVersion; // protocol version of better server - int betterFd; // Active connection to better server, ready to use - dnbd3_host_t betterServer; // The better server uint8_t *recvBuffer; // Buffer for receiving payload uint32_t recvBufferLen; // Len of ^^ atomic_bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop() diff --git a/src/server/image.c b/src/server/image.c index 4a65ed3..d250715 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1508,7 +1508,7 @@ json_t* image_getListAsJson() uplinkName[0] = '\0'; } else { bytesReceived = image->uplink->bytesReceived; - if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) { + if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } } diff --git a/src/server/integrity.c b/src/server/integrity.c index c52d17b..3d1ac9b 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -240,7 +240,7 @@ static void* integrity_main(void * data UNUSED) if ( !foundCorrupted ) { mutex_lock( &image->lock ); if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper? 
- image->working = image->uplink->fd != -1 && image->readFd != -1; + image->working = image->uplink->current.fd != -1 && image->readFd != -1; } mutex_unlock( &image->lock ); } diff --git a/src/server/uplink.c b/src/server/uplink.c index 7d66b21..e21e28c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -97,7 +97,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->idleTime = 0; uplink->queueLen = 0; mutex_lock( &uplink->sendMutex ); - uplink->fd = -1; + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cacheFd = -1; uplink->signal = NULL; @@ -105,12 +105,12 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_lock( &uplink->rttLock ); uplink->cycleDetected = false; if ( sock >= 0 ) { - uplink->betterFd = sock; - uplink->betterServer = *host; + uplink->better.fd = sock; + uplink->better.host = *host; uplink->rttTestResult = RTT_DOCHANGE; - uplink->betterVersion = version; + uplink->better.version = version; } else { - uplink->betterFd = -1; + uplink->better.fd = -1; uplink->rttTestResult = RTT_IDLE; } mutex_unlock( &uplink->rttLock ); @@ -211,7 +211,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) { + if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) { mutex_unlock( &client->image->lock ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." 
); mutex_lock( &uplink->rttLock ); @@ -315,14 +315,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); } else { - if ( uplink->fd == -1 ) { + if ( uplink->current.fd == -1 ) { mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); if ( hops < 200 ) ++hops; - const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); if ( !ret ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); @@ -405,7 +405,7 @@ static void* uplink_mainloop(void *data) mutex_unlock( &uplink->rttLock ); if ( waitTime == 0 ) { // Nothing - } else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { + } else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { waitTime = 1000; } else { declare_now; @@ -413,7 +413,7 @@ static void* uplink_mainloop(void *data) if ( waitTime < 100 ) waitTime = 100; if ( waitTime > 5000 ) waitTime = 5000; } - events[EV_SOCKET].fd = uplink->fd; + events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); if ( _shutdown || uplink->shutdown ) goto cleanup; if ( numSocks == -1 ) { // Error? @@ -430,13 +430,11 @@ static void* uplink_mainloop(void *data) uplink->rttTestResult = RTT_IDLE; // The rttTest worker thread has finished our request. 
// And says it's better to switch to another server - const int fd = uplink->fd; + const int fd = uplink->current.fd; mutex_lock( &uplink->sendMutex ); - uplink->fd = uplink->betterFd; + uplink->current = uplink->better; mutex_unlock( &uplink->sendMutex ); - uplink->betterFd = -1; - uplink->currentServer = uplink->betterServer; - uplink->version = uplink->betterVersion; + uplink->better.fd = -1; uplink->cycleDetected = false; mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; @@ -445,7 +443,7 @@ static void* uplink_mainloop(void *data) uplink->image->working = true; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { + if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) { logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } @@ -471,7 +469,7 @@ static void* uplink_mainloop(void *data) if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name ); } - if ( uplink->fd != -1 ) { + if ( uplink->current.fd != -1 ) { // Uplink seems fine, relay requests to it... 
uplink_sendRequests( uplink, true ); } else { // No uplink; maybe it was shutdown since it was idle for too long @@ -499,9 +497,9 @@ static void* uplink_mainloop(void *data) uplink_saveCacheMap( uplink ); } // Keep-alive - if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) { + if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( uplink->fd ) ) { + if ( uplink_sendKeepalive( uplink->current.fd ) ) { // Re-trigger periodically, in case it requires a minimum user count uplink_sendReplicationRequest( uplink ); } else { @@ -511,10 +509,10 @@ static void* uplink_mainloop(void *data) } } // Don't keep uplink established if we're idle for too much - if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { + if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { mutex_lock( &uplink->sendMutex ); - close( uplink->fd ); - uplink->fd = events[EV_SOCKET].fd = -1; + close( uplink->current.fd ); + uplink->current.fd = events[EV_SOCKET].fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; if ( uplink->recvBufferLen != 0 ) { @@ -531,7 +529,7 @@ static void* uplink_mainloop(void *data) const int rttTestResult = uplink->rttTestResult; mutex_unlock( &uplink->rttLock ); if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete @@ -556,7 +554,7 @@ static void* uplink_mainloop(void *data) timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? 
altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG - if ( uplink->fd != -1 && !uplink->shutdown ) { + if ( uplink->current.fd != -1 && !uplink->shutdown ) { bool resend = false; ticks deadline; timing_set( &deadline, &now, -10 ); @@ -594,10 +592,10 @@ static void* uplink_mainloop(void *data) uplink->image->uplink = NULL; } mutex_lock( &uplink->queueLock ); - const int fd = uplink->fd; + const int fd = uplink->current.fd; const dnbd3_signal_t* signal = uplink->signal; mutex_lock( &uplink->sendMutex ); - uplink->fd = -1; + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->signal = NULL; // Do not access uplink->image after unlocking, since we set @@ -610,8 +608,8 @@ static void* uplink_mainloop(void *data) // Wait for the RTT check to finish/fail if it's in progress while ( uplink->rttTestResult == RTT_INPROGRESS ) usleep( 10000 ); - if ( uplink->betterFd != -1 ) { - close( uplink->betterFd ); + if ( uplink->better.fd != -1 ) { + close( uplink->better.fd ); } mutex_destroy( &uplink->queueLock ); mutex_destroy( &uplink->rttLock ); @@ -651,14 +649,14 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; mutex_lock( &uplink->sendMutex ); - const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); if ( !ret ) { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. 
logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &uplink->currentServer ); + altservers_serverFailed( &uplink->current.host ); return; } mutex_lock( &uplink->queueLock ); @@ -678,7 +676,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) */ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( uplink == NULL || uplink->fd == -1 ) return; + if ( uplink == NULL || uplink->current.fd == -1 ) return; if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) return; @@ -724,7 +722,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) ); + bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); mutex_unlock( &uplink->sendMutex ); if ( !sendOk ) { logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); @@ -798,7 +796,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) dnbd3_reply_t inReply, outReply; int ret, i; for (;;) { - ret = dnbd3_read_reply( uplink->fd, &inReply, false ); + ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; if ( ret == REPLY_AGAIN ) break; if ( unlikely( ret == REPLY_CLOSED ) ) { @@ -826,7 +824,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) exit( 1 ); } } - if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) { + if ( unlikely( 
(uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) { logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path ); goto error_cleanup; } @@ -973,12 +971,12 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { - if ( uplink->fd == -1 ) + if ( uplink->current.fd == -1 ) return; - altservers_serverFailed( &uplink->currentServer ); + altservers_serverFailed( &uplink->current.host ); mutex_lock( &uplink->sendMutex ); - close( uplink->fd ); - uplink->fd = -1; + close( uplink->current.fd ); + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->replicationHandle = REP_NONE; if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { @@ -987,7 +985,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) if ( !findNew ) return; mutex_lock( &uplink->rttLock ); - bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1; + bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->better.fd != -1; mutex_unlock( &uplink->rttLock ); if ( bail ) return; @@ -1016,7 +1014,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) uint32_t masterCrc; uint32_t *buffer = malloc( bytes ); mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes ); + bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes ); mutex_unlock( &uplink->sendMutex ); if ( !sendOk || bytes == 0 ) { free( buffer ); -- cgit v1.2.3-55-g7522 From 5fb4ef278be86fb6bda487f65ec4855d830bf4e5 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 22 Aug 2019 16:14:27 +0200 Subject: [SERVER] Get rid of alt-servers thread, per-uplink rtt history Alt-Server checks are now run using the threadpool, so we don't need a queue and dedicated thread anymore. 
The rtt history is now kept per uplink, so many uplinks won't overwhelm the history, making its time window very short. Also the fail counter is now split up; a global one for when the server actually isn't reachable, a local (per-uplink) one for when the server is reachable but doesn't serve the requested image. --- src/server/altservers.c | 738 ++++++++++++++++++++++-------------------------- src/server/altservers.h | 16 +- src/server/globals.h | 41 ++- src/server/image.c | 6 +- src/server/net.c | 16 +- src/server/server.c | 8 +- src/server/uplink.c | 117 ++++---- src/server/uplink.h | 2 + src/serverconfig.h | 10 +- 9 files changed, 469 insertions(+), 485 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index fbe10a8..493ed9e 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -1,5 +1,6 @@ #include "altservers.h" #include "locks.h" +#include "threadpool.h" #include "helper.h" #include "image.h" #include "fileutil.h" @@ -14,46 +15,22 @@ #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) -static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; -static dnbd3_signal_t * _Atomic runSignal = NULL; - static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; static atomic_int numAltServers = 0; static pthread_mutex_t altServersLock; +static ticks nextCloseUnusedFd; // TODO: Move away -static pthread_t altThread; - -static void *altservers_main(void *data); -static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt); +static void *altservers_runCheck(void *data); +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current); +static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink); +static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt); +static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server); void altservers_init() { srand( (unsigned int)time( NULL ) ); - // Init spinlock + // Init lock mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST ); - // Init signal - runSignal = signal_new(); - if ( runSignal == NULL ) { - logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." 
); - exit( EXIT_FAILURE ); - } - memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) ); - if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) { - logadd( LOG_ERROR, "Could not start altservers connector thread" ); - exit( EXIT_FAILURE ); - } - // Init waiting links queue -- this is currently a global static array so - // it will already be zero, but in case we refactor later do it explicitly - for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - pending[i] = NULL; - } -} - -void altservers_shutdown() -{ - if ( runSignal == NULL ) return; - signal_call( runSignal ); // Wake altservers thread up - thread_join( altThread, NULL ); } static void addalt(int argc, char **argv, void *data) @@ -121,7 +98,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate /** * ONLY called from the passed uplink's main thread */ -void altservers_findUplink(dnbd3_uplink_t *uplink) +void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; @@ -135,67 +112,11 @@ void altservers_findUplink(dnbd3_uplink_t *uplink) assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress - // XXX As this function is only ever called by the image's uplink thread, - // it cannot happen that the uplink ends up in this list concurrently mutex_lock( &uplink->rttLock ); - if ( uplink->rttTestResult == RTT_INPROGRESS ) { - for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] != uplink ) continue; - // Yep, measuring right now - return; - } - } - // Find free slot for measurement - uplink->rttTestResult = RTT_INPROGRESS; - for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] != NULL ) continue; - dnbd3_uplink_t *null = NULL; - if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) { - mutex_unlock( &uplink->rttLock ); - atomic_thread_fence( 
memory_order_release ); - signal_call( runSignal ); // Wake altservers thread up - return; - } + if ( uplink->rttTestResult != RTT_INPROGRESS ) { + threadpool_run( &altservers_runCheck, uplink ); } - // End of loop - no free slot - uplink->rttTestResult = RTT_NOT_REACHABLE; mutex_unlock( &uplink->rttLock ); - logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." ); -} - -/** - * The given uplink is about to disappear, - * wait until any pending RTT check is done. - */ -void altservers_removeUplink(dnbd3_uplink_t *uplink) -{ - assert( uplink != NULL ); - assert( uplink->shutdown ); - int i; - for ( i = 1 ;; ++i ) { - atomic_thread_fence( memory_order_acquire ); - if ( runSignal == NULL ) { - // Thread is already done, remove manually - uplink->rttTestResult = RTT_NOT_REACHABLE; - break; - } - // Thread still running, wait until test is done - bool found = false; - for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] == uplink ) { - found = true; - break; - } - } - if ( !found ) // No more test running - break; - usleep( 10000 ); // 10ms - signal_call( runSignal ); // Wake altservers thread up - if ( i % 500 == 0 ) { - logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink ); - } - } - logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i ); } /** @@ -209,90 +130,124 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0; int i, j; int count = 0; - int scores[size]; - int score; - mutex_lock( &altServersLock ); + uint16_t scores[SERVER_MAX_ALTS] = { 0 }; if ( size > numAltServers ) size = numAltServers; - for (i = 0; i < numAltServers; ++i) { - if ( altServers[i].host.type == 0 ) continue; // Slot is empty - if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers + mutex_lock( 
&altServersLock ); + for ( i = 0; i < numAltServers; ++i ) { + if ( altServers[i].host.type == 0 || altServers[i].isPrivate ) + continue; // Slot is empty or uplink is for replication only if ( host->type == altServers[i].host.type ) { - score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails; + scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host ); } else { - score = -( altServers[i].numFails + 128 ); // Wrong address family + scores[i] = 1; // Wrong address family } - if ( count == 0 ) { - // Trivial - this is the first entry - output[0].host = altServers[i].host; - output[0].failures = 0; - scores[0] = score; - count++; - } else { - // Other entries already exist, insert in proper position - for (j = 0; j < size; ++j) { - if ( j < count && score <= scores[j] ) continue; - if ( j > count ) break; // Should never happen but just in case... - if ( j < count && j + 1 < size ) { - // Check if we're in the middle and need to move other entries... - memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) ); - memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) ); - } - if ( count < size ) { - count++; - } - output[j].host = altServers[i].host; - output[j].failures = 0; - scores[j] = score; - break; + } + while ( count < size ) { + i = -1; + for ( j = 0; j < numAltServers; ++j ) { + if ( scores[j] == 0 ) + continue; + if ( i == -1 || scores[j] > scores[i] ) { + i = j; } } + if ( i == -1 ) + break; + output[count].host = altServers[i].host; + output[count].failures = 0; + count++; } mutex_unlock( &altServersLock ); return count; } +bool altservers_toString(int server, char *buffer, size_t len) +{ + return host_to_string( &altServers[server].host, buffer, len ); +} + +static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now ) +{ + dnbd3_alt_local_t *local = ( uplink == NULL ? 
NULL : &uplink->altData[server] ); + dnbd3_alt_server_t *global = &altServers[server]; + if ( global->isClientOnly || ( !global->isPrivate && _proxyPrivateOnly ) ) + return false; + // Blocked locally (image not found on server...) + if ( local != NULL && local->blocked ) { + if ( --local->fails > 0 ) + return false; + local->blocked = false; + } + if ( global->blocked ) { + if ( timing_diff( &global->lastFail, now ) < SERVER_GLOBAL_DUP_TIME ) + return false; + global->lastFail = *now; + if ( --global->fails > 0 ) + return false; + global->blocked = false; + } + // Not blocked, depend on both fail counters + int fails = ( local == NULL ? 0 : local->fails ) + global->fails; + return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN; +} + +int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) +{ + int idx[size]; + int num = altservers_getListForUplink( NULL, idx, size, -1 ); + for ( int i = 0; i < num; ++i ) { + servers[i] = altServers[i].host; + } + return num; +} + /** * Get alt servers. If there are more alt servers than * requested, random servers will be picked. 
* This function is suited for finding uplink servers as * it includes private servers and ignores any "client only" servers + * @param current index of server for current connection, or -1 in panic mode */ -int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency) +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current) { - if ( size <= 0 ) return 0; - int count = 0, i; - ticks now; - timing_get( &now ); + if ( size <= 0 ) + return 0; + int count = 0; + declare_now; mutex_lock( &altServersLock ); - // Flip first server in list with a random one every time this is called - if ( numAltServers > 1 ) { - const dnbd3_alt_server_t tmp = altServers[0]; - do { - i = rand() % numAltServers; - } while ( i == 0 ); - altServers[0] = altServers[i]; - altServers[i] = tmp; - } - // We iterate over the list twice. First run adds servers with 0 failures only, - // second one also considers those that failed (not too many times) - if ( size > numAltServers ) size = numAltServers; - for (i = 0; i < numAltServers * 2; ++i) { - dnbd3_alt_server_t *srv = &altServers[i % numAltServers]; - if ( srv->host.type == 0 ) continue; // Slot is empty - if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore! - if ( srv->isClientOnly ) continue; - bool first = ( i < numAltServers ); - if ( first ) { - if ( srv->numFails > 0 ) continue; - } else { - if ( srv->numFails == 0 ) continue; // Already added in first iteration - if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row - && timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore! 
- if ( !emergency ) srv->numFails--; + // If we don't have enough servers to randomize, take a shortcut + if ( numAltServers <= size ) { + for ( int i = 0; i < numAltServers; ++i ) { + if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) { + servers[count++] = i; + } + } + } else { + // Plenty of alt servers; randomize + uint8_t state[SERVER_MAX_ALTS] = { 0 }; + if ( current != -1 ) { // Make sure we also test the current server + servers[count++] = current; + state[current] = 2; + } + for ( int tr = size * 10; tr > 0 && count < size; --tr ) { + int idx = rand() % numAltServers; + if ( state[idx] != 0 ) + continue; + if ( isUsableForUplink( uplink, idx, &now ) ) { + servers[count++] = idx; + state[idx] = 2; // Used + } else { + state[idx] = 1; // Potential + } + } + // If panic mode, consider others too + for ( int tr = size * 10; current == -1 && tr > 0 && count < size; --tr ) { + int idx = rand() % numAltServers; + if ( state[idx] == 2 ) + continue; + servers[count++] = idx; + state[idx] = 2; // Used } - // server seems ok, include in output and decrease its fail counter - output[count++] = srv->host; - if ( count >= size ) break; } mutex_unlock( &altServersLock ); return count; @@ -320,7 +275,7 @@ json_t* altservers_toJson() "rtt", rtts, "isPrivate", (int)src[i].isPrivate, "isClientOnly", (int)src[i].isClientOnly, - "numFails", src[i].numFails + "numFails", src[i].fails ); json_array_append_new( list, server ); } @@ -329,32 +284,27 @@ json_t* altservers_toJson() /** * Update rtt history of given server - returns the new average for that server. - * XXX HOLD altServersLock WHEN CALLING THIS! 
*/ -static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt) +static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt) { - unsigned int avg = rtt; - int i; - for (i = 0; i < numAltServers; ++i) { - if ( !isSameAddressPort( host, &altServers[i].host ) ) continue; - altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt; -#if SERVER_RTT_PROBES == 5 - avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2] - + altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES; -#else -#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES - avg = 0; - for (int j = 0; j < SERVER_RTT_PROBES; ++j) { - avg += altServers[i].rtt[j]; + uint32_t avg = 0, j; + dnbd3_alt_local_t *local = &uplink->altData[index]; + mutex_lock( &altServersLock ); + if ( likely( local->initDone ) ) { + local->rtt[++local->rttIndex % SERVER_RTT_PROBES] = rtt; + for ( j = 0; j < SERVER_RTT_PROBES; ++j ) { + avg += local->rtt[j]; } avg /= SERVER_RTT_PROBES; -#endif - // If we got a new rtt value, server must be working - if ( altServers[i].numFails > 0 ) { - altServers[i].numFails--; + } else { // First rtt measurement -- copy to every slot + for ( j = 0; j < SERVER_RTT_PROBES; ++j ) { + local->rtt[j] = rtt; } - break; + avg = rtt; + local->initDone = true; } + altServers[index].rtt[++altServers[index].rttIndex % SERVER_RTT_PROBES] = avg; + mutex_unlock( &altServersLock ); return avg; } @@ -383,40 +333,33 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2) * track of how often servers fail, and consider them disabled for some time if they * fail too many times. 
*/ -void altservers_serverFailed(const dnbd3_host_t * const host) +void altservers_serverFailed(int server) { - int i; - int foundIndex = -1, lastOk = -1; - ticks now; - timing_get( &now ); + declare_now; mutex_lock( &altServersLock ); - for (i = 0; i < numAltServers; ++i) { - if ( foundIndex == -1 ) { - // Looking for the failed server in list - if ( isSameAddressPort( host, &altServers[i].host ) ) { - foundIndex = i; - } - } else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) { - lastOk = i; + if ( timing_diff( &altServers[server].lastFail, &now ) > SERVER_GLOBAL_DUP_TIME ) { + altServers[server].lastFail = now; + if ( altServers[server].fails++ >= SERVER_BAD_UPLINK_MAX ) { + altServers[server].blocked = true; } } - // Do only increase counter if last fail was not too recent. This is - // to prevent the counter from increasing rapidly if many images use the - // same uplink. If there's a network hickup, all uplinks will call this - // function and would increase the counter too quickly, disabling the server. - if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) { - altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE; - altServers[foundIndex].lastFail = now; - if ( lastOk != -1 ) { - // Make sure non-working servers are put at the end of the list, so they're less likely - // to get picked when testing servers for uplink connections. - const dnbd3_alt_server_t tmp = altServers[foundIndex]; - altServers[foundIndex] = altServers[lastOk]; - altServers[lastOk] = tmp; - } + mutex_unlock( &altServersLock ); +} + +/** + * Called from RTT checker if connecting to a server succeeded but + * subsequently selecting the given image failed. Handle this within + * the uplink and don't increase the global fail counter. 
+ */ +static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server) +{ + mutex_lock( &altServersLock ); + if ( uplink->altData[server].fails++ >= SERVER_BAD_UPLINK_MAX ) { + uplink->altData[server].blocked = true; } mutex_unlock( &altServersLock ); } + /** * Mainloop of this module. It will wait for requests by uplinks to find a * suitable uplink server for them. If found, it will tell the uplink about @@ -425,206 +368,213 @@ void altservers_serverFailed(const dnbd3_host_t * const host) * will update quite quickly. Needs to be improved some time, ie. by only * updating the rtt if the last update was at least X seconds ago. */ -static void *altservers_main(void *data UNUSED) +static void *altservers_runCheck(void *data) +{ + dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; + + assert( uplink != NULL ); + setThreadName( "altserver-check" ); + altservers_findUplinkInternal( uplink ); + // Save cache maps of all images if applicable + // TODO: Has nothing to do with alt servers really, maybe move somewhere else? 
+ declare_now; + if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) { + timing_gets( &nextCloseUnusedFd, 900 ); + image_closeUnusedFd(); + } + return NULL; +} + +void altservers_findUplink(dnbd3_uplink_t *uplink) +{ + altservers_findUplinkInternal( uplink ); + while ( uplink->rttTestResult == RTT_INPROGRESS ) { + usleep( 5000 ); + } +} + +int altservers_hostToIndex(dnbd3_host_t *host) +{ + for ( int i = 0; i < numAltServers; ++i ) { + if ( isSameAddressPort( host, &altServers[i].host ) ) + return i; + } + return -1; +} + +const dnbd3_host_t* altservers_indexToHost(int server) +{ + return &altServers[server].host; +} + +// XXX Sync call above must block until async worker has finished XXX +static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) { const int ALTS = 4; - int ret, itLink, itAlt, numAlts; - bool found; - char buffer[DNBD3_BLOCK_SIZE ]; - dnbd3_reply_t reply; - dnbd3_host_t servers[ALTS + 1]; - serialized_buffer_t serialized; + int ret, itAlt, numAlts, current; + bool panic; + int servers[ALTS + 1]; struct timespec start, end; - ticks nextCloseUnusedFd; - setThreadName( "altserver-check" ); - blockNoncriticalSignals(); - timing_gets( &nextCloseUnusedFd, 900 ); - // LOOP - while ( !_shutdown ) { - // Wait 5 seconds max. - ret = signal_wait( runSignal, 5000 ); - if ( _shutdown ) goto cleanup; - if ( ret == SIGNAL_ERROR ) { - if ( errno == EAGAIN || errno == EINTR ) continue; - logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! 
Things will break!", errno ); - usleep( 100000 ); + if ( _shutdown ) + return; + mutex_lock( &uplink->rttLock ); + // Maybe we already have a result, or check is currently running + if ( uplink->better.fd != -1 || uplink->rttTestResult == RTT_INPROGRESS ) { + mutex_unlock( &uplink->rttLock ); + return; + } + assert( uplink->rttTestResult != RTT_DOCHANGE ); + uplink->rttTestResult = RTT_INPROGRESS; + panic = ( uplink->current.fd == -1 ); + current = uplink->current.index; // Current server index (or last one in panic mode) + mutex_unlock( &uplink->rttLock ); + // First, get 4 alt servers + numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current ); + // If we're already connected and only got one server anyways, there isn't much to do + if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) { + uplink->rttTestResult = RTT_DONTCHANGE; + return; + } + dnbd3_image_t * const image = image_lock( uplink->image ); + if ( image == NULL ) { // Check again after locking + uplink->rttTestResult = RTT_NOT_REACHABLE; + logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); + return; + } + LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid ); + assert( uplink->rttTestResult == RTT_INPROGRESS ); + // Test them all + dnbd3_server_connection_t best = { .fd = -1 }; + unsigned long bestRtt = RTT_UNREACHABLE; + unsigned long currentRtt = RTT_UNREACHABLE; + for (itAlt = 0; itAlt < numAlts; ++itAlt) { + int server = servers[itAlt]; + // Connect + clock_gettime( BEST_CLOCK_SOURCE, &start ); + int sock = sock_connect( &altServers[server].host, 750, 1000 ); + if ( sock == -1 ) { // Connection failed means global error + altservers_serverFailed( server ); + continue; } - // Work your way through the queue - atomic_thread_fence( memory_order_acquire ); - for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) { - dnbd3_uplink_t * const uplink = pending[itLink]; - if ( uplink == NULL ) - continue; - // First, 
get 4 alt servers - numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 ); - // If we're already connected and only got one server anyways, there isn't much to do - if ( numAlts <= 1 && uplink->current.fd != -1 ) { - uplink->rttTestResult = RTT_DONTCHANGE; - continue; - } - dnbd3_image_t * const image = image_lock( uplink->image ); - if ( image == NULL ) { // Check again after locking - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_NOT_REACHABLE; - assert( pending[itLink] == uplink ); - pending[itLink] = NULL; - mutex_unlock( &uplink->rttLock ); - logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); - continue; - } - LOG( LOG_DEBUG2, "[%d] Running alt check", itLink ); - assert( uplink->rttTestResult == RTT_INPROGRESS ); - if ( uplink->current.fd != -1 ) { - // Add current server if not already in list - found = false; - for (itAlt = 0; itAlt < numAlts; ++itAlt) { - if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue; - found = true; - break; - } - if ( !found ) servers[numAlts++] = uplink->current.host; - } - // Test them all - int bestSock = -1; - int bestIndex = -1; - int bestProtocolVersion = -1; - unsigned long bestRtt = RTT_UNREACHABLE; - unsigned long currentRtt = RTT_UNREACHABLE; - for (itAlt = 0; itAlt < numAlts; ++itAlt) { - usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...) 
- // Connect - clock_gettime( BEST_CLOCK_SOURCE, &start ); - int sock = sock_connect( &servers[itAlt], 750, 1000 ); - if ( sock < 0 ) continue; - // Select image ++++++++++++++++++++++++++++++ - if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) { - goto server_failed; - } - // See if selecting the image succeeded ++++++++++++++++++++++++++++++ - uint16_t protocolVersion, rid; - uint64_t imageSize; - char *name; - if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) { - goto server_image_not_available; - } - if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed; - if ( name == NULL || strcmp( name, image->name ) != 0 ) { - ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name ); - } - if ( rid != image->rid ) { - ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid ); - } - if ( imageSize != image->virtualFilesize ) { - ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize ); - } - // Request first block (NOT random!) 
++++++++++++++++++++++++++++++ - if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) { - LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink ); - } - // See if requesting the block succeeded ++++++++++++++++++++++ - if ( !dnbd3_get_reply( sock, &reply ) ) { - LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink ); - } - // check reply header - if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) { - ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size ); - } - if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) { - ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink ); - } - clock_gettime( BEST_CLOCK_SOURCE, &end ); - // Measurement done - everything fine so far - mutex_lock( &altServersLock ); - mutex_lock( &uplink->rttLock ); - const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host ); - // Penaltize rtt if this was a cycle; this will treat this server with lower priority - // in the near future too, so we prevent alternating between two servers that are both - // part of a cycle and have the lowest latency. - const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000 - + (end.tv_nsec - start.tv_nsec) / 1000 - + ( (isCurrent && uplink->cycleDetected) ? 
1000000 : 0 )); // µs - unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); - mutex_unlock( &altServersLock ); - // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time - if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; - mutex_unlock( &uplink->rttLock ); - if ( uplink->current.fd != -1 && isCurrent ) { - // Was measuring current server - currentRtt = avg; - close( sock ); - } else if ( avg < bestRtt ) { - // Was another server, update "best" - if ( bestSock != -1 ) close( bestSock ); - bestSock = sock; - bestRtt = avg; - bestIndex = itAlt; - bestProtocolVersion = protocolVersion; - } else { - // Was too slow, ignore - close( sock ); - } - // We're done, call continue - continue; - // Jump here if anything went wrong - // This will cleanup and continue - server_failed: ; - altservers_serverFailed( &servers[itAlt] ); - server_image_not_available: ; - close( sock ); - } - // Done testing all servers. 
See if we should switch - if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { - // yep - if ( currentRtt > 10000000 || uplink->current.fd == -1 ) { - LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); - } else { - LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); - } - sock_setTimeout( bestSock, _uplinkTimeout ); - mutex_lock( &uplink->rttLock ); - uplink->better.fd = bestSock; - uplink->better.host = servers[bestIndex]; - uplink->better.version = bestProtocolVersion; - uplink->rttTestResult = RTT_DOCHANGE; - mutex_unlock( &uplink->rttLock ); - signal_call( uplink->signal ); - } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) { - // No server was reachable - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_NOT_REACHABLE; - mutex_unlock( &uplink->rttLock ); - } else { - // nope - if ( bestSock != -1 ) close( bestSock ); - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_DONTCHANGE; - uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away - mutex_unlock( &uplink->rttLock ); - if ( !image->working ) { - image->working = true; - LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink ); - } - } - image_release( image ); - // end of loop over all pending uplinks - assert( pending[itLink] == uplink ); - pending[itLink] = NULL; - atomic_thread_fence( memory_order_release ); + // Select image ++++++++++++++++++++++++++++++ + if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) { + goto image_failed; } - // Save cache maps of all images if applicable - declare_now; - // TODO: Has nothing to do with alt servers really, maybe move somewhere else? 
- if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) { - timing_gets( &nextCloseUnusedFd, 900 ); - image_closeUnusedFd(); + // See if selecting the image succeeded ++++++++++++++++++++++++++++++ + uint16_t protocolVersion, rid; + uint64_t imageSize; + char *name; + serialized_buffer_t serialized; + if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) { + goto image_failed; } + if ( protocolVersion < MIN_SUPPORTED_SERVER ) { // Server version unsupported; global fail + goto server_failed; + } + if ( name == NULL || strcmp( name, image->name ) != 0 ) { + ERROR_GOTO( image_failed, "[RTT] Server offers image '%s' instead of '%s'", name, image->name ); + } + if ( rid != image->rid ) { + ERROR_GOTO( image_failed, "[RTT] Server provides rid %d instead of %d", (int)rid, (int)image->rid ); + } + if ( imageSize != image->virtualFilesize ) { + ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize ); + } + // Request first block (NOT random!) 
++++++++++++++++++++++++++++++ + if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) { + LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server ); + } + // See if requesting the block succeeded ++++++++++++++++++++++ + dnbd3_reply_t reply; + if ( !dnbd3_get_reply( sock, &reply ) ) { + LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server ); + } + // check reply header + if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) { + // Sanity check failed; count this as global error (malicious/broken server) + ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size ); + } + // flush payload to include this into measurement + char buffer[DNBD3_BLOCK_SIZE]; + if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) { + ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server ); + } + clock_gettime( BEST_CLOCK_SOURCE, &end ); + // Measurement done - everything fine so far + mutex_lock( &uplink->rttLock ); + const bool isCurrent = ( uplink->current.index == server ); + mutex_unlock( &uplink->rttLock ); + // Penaltize rtt if this was a cycle; this will treat this server with lower priority + // in the near future too, so we prevent alternating between two servers that are both + // part of a cycle and have the lowest latency. 
+ uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_nsec - start.tv_nsec) / 1000); // µs + uint32_t avg = altservers_updateRtt( uplink, server, rtt ); + // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time + if ( ( uplink->cycleDetected || panic ) && isCurrent ) { + avg = (avg * 2) + 50000; + } + if ( !panic && isCurrent ) { + // Was measuring current server + currentRtt = avg; + close( sock ); + } else if ( avg < bestRtt ) { + // Was another server, update "best" + if ( best.fd != -1 ) { + close( best.fd ); + } + best.fd = sock; + bestRtt = avg; + best.index = server; + best.version = protocolVersion; + } else { + // Was too slow, ignore + close( sock ); + } + // We're done, call continue + continue; + // Jump here if anything went wrong + // This will cleanup and continue +image_failed: + altservers_imageFailed( uplink, server ); + goto failed; +server_failed: + altservers_serverFailed( server ); +failed: + close( sock ); } - cleanup: ; - if ( runSignal != NULL ) { - signal_close( runSignal ); + // Done testing all servers. 
See if we should switch + if ( best.fd != -1 && (panic || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { + // yep + if ( currentRtt > 10000000 || panic ) { + LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); + } else { + LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); + } + sock_setTimeout( best.fd, _uplinkTimeout ); + mutex_lock( &uplink->rttLock ); + uplink->better = best; + uplink->rttTestResult = RTT_DOCHANGE; + mutex_unlock( &uplink->rttLock ); + signal_call( uplink->signal ); + } else if ( best.fd == -1 && currentRtt == RTT_UNREACHABLE ) { + // No server was reachable, including current + uplink->rttTestResult = RTT_NOT_REACHABLE; + } else { + // nope + if ( best.fd != -1 ) { + close( best.fd ); + } + if ( !image->working || uplink->cycleDetected ) { + image->working = true; + LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid ); + } + uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away + mutex_lock( &uplink->rttLock ); + uplink->rttTestResult = RTT_DONTCHANGE; + mutex_unlock( &uplink->rttLock ); } - runSignal = NULL; - return NULL ; + image_release( image ); } diff --git a/src/server/altservers.h b/src/server/altservers.h index e03b900..8e2b964 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -7,23 +7,27 @@ struct json_t; void altservers_init(); -void altservers_shutdown(); - int altservers_load(); bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly); -void altservers_findUplink(dnbd3_uplink_t *uplink); +void altservers_findUplinkAsync(dnbd3_uplink_t *uplink); -void altservers_removeUplink(dnbd3_uplink_t *uplink); +void altservers_findUplink(dnbd3_uplink_t *uplink); int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size); -int altservers_getListForUplink(dnbd3_host_t 
*output, int size, int emergency); +int altservers_getHostListForReplication(dnbd3_host_t *servers, int size); + +bool altservers_toString(int server, char *buffer, size_t len); int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2); -void altservers_serverFailed(const dnbd3_host_t * const host); +void altservers_serverFailed(int server); + +int altservers_hostToIndex(dnbd3_host_t *host); + +const dnbd3_host_t* altservers_indexToHost(int server); struct json_t* altservers_toJson(); diff --git a/src/server/globals.h b/src/server/globals.h index 659e5a2..4d97c6b 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -30,10 +30,31 @@ typedef struct uint8_t hopCount; // How many hops this request has already taken across proxies } dnbd3_queued_request_t; +typedef struct +{ + int fails; // Hard fail: Connection failed + int rttIndex; + uint32_t rtt[SERVER_RTT_PROBES]; + bool isPrivate, isClientOnly; + bool blocked; // If true count down fails until 0 to enable again + ticks lastFail; // Last hard fail + dnbd3_host_t host; + char comment[COMMENT_LENGTH]; +} dnbd3_alt_server_t; + +typedef struct +{ + int fails; // Soft fail: Image not found + int rttIndex; + uint32_t rtt[SERVER_RTT_PROBES]; + bool blocked; // True if server is to be ignored and fails should be counted down + bool initDone; +} dnbd3_alt_local_t; + typedef struct { - int fd; // Socket fd for this connection - int version; // Protocol version of remote server - dnbd3_host_t host; // IP/Port of remote server + int fd; // Socket fd for this connection + int version; // Protocol version of remote server + int index; // Entry in uplinks list } dnbd3_server_connection_t; #define RTT_IDLE 0 // Not in progress @@ -51,7 +72,7 @@ struct _dnbd3_uplink pthread_mutex_t queueLock; // lock for synchronization on request queue etc. 
dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer pthread_mutex_t rttLock; // When accessing rttTestResult, betterFd or betterServer - int rttTestResult; // RTT_* + atomic_int rttTestResult; // RTT_* int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD! uint8_t *recvBuffer; // Buffer for receiving payload uint32_t recvBufferLen; // Len of ^^ @@ -65,19 +86,9 @@ struct _dnbd3_uplink atomic_int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; + dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; }; -typedef struct -{ - char comment[COMMENT_LENGTH]; - dnbd3_host_t host; - unsigned int rtt[SERVER_RTT_PROBES]; - unsigned int rttIndex; - bool isPrivate, isClientOnly; - ticks lastFail; - int numFails; -} dnbd3_alt_server_t; - typedef struct { uint8_t host[16]; diff --git a/src/server/image.c b/src/server/image.c index d250715..1a6e0f8 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1178,7 +1178,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, dnbd3_host_t servers[REP_NUM_SRV]; int uplinkSock = -1; dnbd3_host_t uplinkServer; - const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false ); + const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV ); uint16_t remoteProtocolVersion; uint16_t remoteRid = revision; uint64_t remoteImageSize; @@ -1491,7 +1491,7 @@ json_t* image_getListAsJson() json_t *imagesJson = json_array(); json_t *jsonImage; int i; - char uplinkName[100] = { 0 }; + char uplinkName[100]; uint64_t bytesReceived; int completeness, idleTime; declare_now; @@ -1508,7 +1508,7 @@ json_t* image_getListAsJson() uplinkName[0] = '\0'; } else { bytesReceived = image->uplink->bytesReceived; - if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, 
uplinkName, sizeof(uplinkName) ) ) { + if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } } diff --git a/src/server/net.c b/src/server/net.c index 7f3c1ce..4976eea 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -669,11 +669,19 @@ static void removeFromList(dnbd3_client_t *client) { int i; mutex_lock( &_clients_lock ); - for ( i = _num_clients - 1; i >= 0; --i ) { - if ( _clients[i] == client ) { - _clients[i] = NULL; + if ( _num_clients != 0 ) { + for ( i = _num_clients - 1; i >= 0; --i ) { + if ( _clients[i] == client ) { + _clients[i] = NULL; + break; + } + } + if ( i != 0 && i + 1 == _num_clients ) { + do { + i--; + } while ( _clients[i] == NULL && i > 0 ); + _num_clients = i + 1; } - if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients; } mutex_unlock( &_clients_lock ); } diff --git a/src/server/server.c b/src/server/server.c index 838aec2..640048a 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -121,9 +121,6 @@ void dnbd3_cleanup() // Disable threadpool threadpool_close(); - // Terminate the altserver checking thread - altservers_shutdown(); - // Terminate all uplinks image_killUplinks(); @@ -198,6 +195,11 @@ int main(int argc, char *argv[]) case LONGOPT_CRC4: return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE; case LONGOPT_ASSERT: + printf( "Testing use after free:\n" ); + volatile char * volatile test = malloc( 10 ); + test[0] = 1; + free( test ); + test[1] = 2; printf( "Testing a failing assertion:\n" ); assert( 4 == 5 ); printf( "Assertion 4 == 5 seems to hold. 
;-)\n" ); diff --git a/src/server/uplink.c b/src/server/uplink.c index e21e28c..6c85580 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -96,17 +96,18 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->bytesReceived = 0; uplink->idleTime = 0; uplink->queueLen = 0; - mutex_lock( &uplink->sendMutex ); - uplink->current.fd = -1; - mutex_unlock( &uplink->sendMutex ); uplink->cacheFd = -1; uplink->signal = NULL; uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); + mutex_lock( &uplink->sendMutex ); + uplink->current.fd = -1; + mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; - if ( sock >= 0 ) { + if ( sock != -1 ) { uplink->better.fd = sock; - uplink->better.host = *host; + int index = altservers_hostToIndex( host ); + uplink->better.index = index == -1 ? 0 : index; // Prevent invalid array access uplink->rttTestResult = RTT_DOCHANGE; uplink->better.version = version; } else { @@ -116,7 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_unlock( &uplink->rttLock ); uplink->recvBufferLen = 0; uplink->shutdown = false; - if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)link ) ) { + if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) { logadd( LOG_ERROR, "Could not start thread for new uplink." ); goto failure; } @@ -148,8 +149,8 @@ void uplink_shutdown(dnbd3_image_t *image) } dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); - if ( !uplink->shutdown ) { - uplink->shutdown = true; + bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { signal_call( uplink->signal ); thread = uplink->thread; join = true; @@ -211,13 +212,11 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. 
If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) { - mutex_unlock( &client->image->lock ); - logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); - mutex_lock( &uplink->rttLock ); + if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; - mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); + mutex_unlock( &client->image->lock ); + logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); return false; } @@ -256,12 +255,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } } if ( unlikely( requestLoop ) ) { - mutex_unlock( &uplink->queueLock ); - logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - mutex_lock( &uplink->rttLock ); uplink->cycleDetected = true; - mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); + mutex_unlock( &uplink->queueLock ); + logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. 
Incoming hop-count is %" PRIu8 ".", hops ); return false; } if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { @@ -311,6 +308,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( foundExisting != -1 ) return true; // Attached to pending request, do nothing + usleep( 10000 ); + // See if we can fire away the request if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); @@ -342,7 +341,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( state == -1 ) { logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" ); } else if ( state == ULR_NEW ) { - logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + //logadd( LOG_DEBUG2, "Direct uplink request" ); } else { logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); } @@ -352,10 +351,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } } - if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed - if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { - logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); - } + if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { + logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } return true; } @@ -443,7 +440,7 @@ static void* uplink_mainloop(void *data) uplink->image->working = true; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) { + if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) { logadd( LOG_DEBUG1, "(Uplink %s) Now connected 
to %s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } @@ -525,9 +522,7 @@ static void* uplink_mainloop(void *data) } } // See if we should trigger an RTT measurement - mutex_lock( &uplink->rttLock ); - const int rttTestResult = uplink->rttTestResult; - mutex_unlock( &uplink->rttLock ); + int rttTestResult = uplink->rttTestResult; if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check @@ -538,7 +533,7 @@ static void* uplink_mainloop(void *data) goto cleanup; } else if ( !uplink_connectionShouldShutdown( uplink ) ) { // Not complete - do measurement - altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous) + altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous) if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = 0; } @@ -547,11 +542,9 @@ static void* uplink_mainloop(void *data) timing_set( &nextAltCheck, &now, altCheckInterval ); } } else if ( rttTestResult == RTT_NOT_REACHABLE ) { - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_IDLE; - mutex_unlock( &uplink->rttLock ); + atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ); discoverFailCount++; - timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); + timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG if ( uplink->current.fd != -1 && !uplink->shutdown ) { @@ -581,36 +574,38 @@ static void* uplink_mainloop(void *data) #endif } cleanup: ; - if ( !uplink->shutdown ) { - uplink->shutdown = true; + // Detach depends on whether someone is joining this thread... 
+ bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { thread_detach( uplink->thread ); } - altservers_removeUplink( uplink ); uplink_saveCacheMap( uplink ); - mutex_lock( &uplink->image->lock ); - if ( uplink->image->uplink == uplink ) { - uplink->image->uplink = NULL; + dnbd3_image_t *image = uplink->image; + mutex_lock( &image->lock ); + // in the list anymore, but we want to prevent it from being freed in either case + if ( image->uplink == uplink ) { + image->uplink = NULL; } + mutex_unlock( &image->lock ); // Do NOT use image without locking it mutex_lock( &uplink->queueLock ); - const int fd = uplink->current.fd; - const dnbd3_signal_t* signal = uplink->signal; - mutex_lock( &uplink->sendMutex ); - uplink->current.fd = -1; - mutex_unlock( &uplink->sendMutex ); - uplink->signal = NULL; - // Do not access uplink->image after unlocking, since we set - // image->uplink to NULL. Acquire with image_lock first, - // like done below when checking whether to re-init uplink - mutex_unlock( &uplink->image->lock ); - mutex_unlock( &uplink->queueLock ); - if ( fd != -1 ) close( fd ); - if ( signal != NULL ) signal_close( signal ); - // Wait for the RTT check to finish/fail if it's in progress - while ( uplink->rttTestResult == RTT_INPROGRESS ) + // Wait for active RTT measurement to finish + while ( uplink->rttTestResult == RTT_INPROGRESS ) { usleep( 10000 ); + } + signal_close( uplink->signal ); + mutex_lock( &uplink->rttLock ); + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + close( uplink->current.fd ); + uplink->current.fd = -1; + } if ( uplink->better.fd != -1 ) { close( uplink->better.fd ); + uplink->better.fd = -1; } + mutex_unlock( &uplink->sendMutex ); + mutex_unlock( &uplink->rttLock ); + mutex_unlock( &uplink->queueLock ); mutex_destroy( &uplink->queueLock ); mutex_destroy( &uplink->rttLock ); mutex_destroy( &uplink->sendMutex ); @@ -619,9 +614,9 @@ static void* uplink_mainloop(void *data) if ( 
uplink->cacheFd != -1 ) { close( uplink->cacheFd ); } - dnbd3_image_t *image = image_lock( uplink->image ); free( uplink ); // !!! - if ( image != NULL ) { + if ( image_lock( image ) != NULL ) { + // Image is still in list... if ( !_shutdown && image->cache_map != NULL ) { // Ingegrity checker must have found something in the meantime uplink_init( image, -1, NULL, 0 ); @@ -656,7 +651,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) // the thread will re-send this request as soon as the connection // is reestablished. logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &uplink->current.host ); + altservers_serverFailed( uplink->current.index ); return; } mutex_lock( &uplink->queueLock ); @@ -973,7 +968,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { if ( uplink->current.fd == -1 ) return; - altservers_serverFailed( &uplink->current.host ); + altservers_serverFailed( uplink->current.index ); mutex_lock( &uplink->sendMutex ); close( uplink->current.fd ); uplink->current.fd = -1; @@ -1138,3 +1133,13 @@ static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) ); } +bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) +{ + int current; + mutex_lock( &uplink->rttLock ); + current = uplink->current.fd == -1 ? 
-1 : uplink->current.index; + mutex_unlock( &uplink->rttLock ); + if ( current == -1 ) + return false; + return altservers_toString( current, buffer, len ); +} diff --git a/src/server/uplink.h b/src/server/uplink.h index 4fd41b0..acc8e11 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -16,4 +16,6 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin void uplink_shutdown(dnbd3_image_t *image); +bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len); + #endif /* UPLINK_H_ */ diff --git a/src/serverconfig.h b/src/serverconfig.h index 0cbb320..239f0a2 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -6,10 +6,12 @@ // +++++ Performance/memory related #define SERVER_MAX_CLIENTS 4000 #define SERVER_MAX_IMAGES 5000 -#define SERVER_MAX_ALTS 100 +#define SERVER_MAX_ALTS 50 // +++++ Uplink handling (proxy mode) -#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value -#define SERVER_BAD_UPLINK_THRES 40 // Thresold for numFails at which we ignore a server for the time span below +#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again +#define SERVER_BAD_UPLINK_MIN 10 // Thresold for fails at which we start ignoring the server occasionally +#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times +#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored #define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients @@ -33,7 +35,7 @@ #define SERVER_RTT_PROBES 5 // How many probes to average over #define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes #define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes 
-#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad +#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while #define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times #define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes -- cgit v1.2.3-55-g7522 From 69f5bf408b9587a6e2008fba2224c2d506f1a895 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 27 Aug 2019 16:13:07 +0200 Subject: [SERVER] Use reference counting for uplink First step towards less locking for proxy mode --- src/server/altservers.c | 13 ++- src/server/globals.h | 4 +- src/server/image.c | 39 ++++----- src/server/integrity.c | 17 ++-- src/server/net.c | 48 +++++++---- src/server/net.h | 2 + src/server/reference.c | 33 ++++++++ src/server/reference.h | 54 ++++++++++++ src/server/reftypes.h | 25 ++++++ src/server/uplink.c | 214 ++++++++++++++++++++++++++++-------------------- src/server/uplink.h | 2 +- 11 files changed, 311 insertions(+), 140 deletions(-) create mode 100644 src/server/reference.c create mode 100644 src/server/reference.h create mode 100644 src/server/reftypes.h (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 493ed9e..7d7fdbe 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -7,6 +7,8 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../serverconfig.h" +#include "reference.h" + #include #include #include @@ -104,7 +106,6 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) return; if ( uplink->current.fd != -1 && numAltServers <= 1 ) return; - int i; // if betterFd != -1 it means the uplink is supposed to switch to another // server. 
As this function here is called by the uplink thread, it can // never be that the uplink is supposed to switch, but instead calls @@ -112,11 +113,14 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress - mutex_lock( &uplink->rttLock ); if ( uplink->rttTestResult != RTT_INPROGRESS ) { - threadpool_run( &altservers_runCheck, uplink ); + dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref ); + if ( current == uplink ) { + threadpool_run( &altservers_runCheck, uplink ); + } else if ( current != NULL ) { + ref_put( &current->reference ); + } } - mutex_unlock( &uplink->rttLock ); } /** @@ -375,6 +379,7 @@ static void *altservers_runCheck(void *data) assert( uplink != NULL ); setThreadName( "altserver-check" ); altservers_findUplinkInternal( uplink ); + ref_put( &uplink->reference ); // Acquired in findUplinkAsync // Save cache maps of all images if applicable // TODO: Has nothing to do with alt servers really, maybe move somewhere else? 
declare_now; diff --git a/src/server/globals.h b/src/server/globals.h index 4d97c6b..5dd205a 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -8,6 +8,7 @@ #include #include #include +#include "reftypes.h" typedef struct timespec ticks; @@ -64,6 +65,7 @@ typedef struct { #define RTT_NOT_REACHABLE 4 // No uplink was reachable struct _dnbd3_uplink { + ref reference; dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none dnbd3_signal_t* signal; // used to wake up the process @@ -107,7 +109,7 @@ struct _dnbd3_image { char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) - dnbd3_uplink_t *uplink; // pointer to a server connection + weakref uplinkref; // pointer to a server connection uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk diff --git a/src/server/image.c b/src/server/image.c index 1a6e0f8..5b58347 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "reference.h" #include #include @@ -375,9 +376,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) // Check if image is incomplete, handle if ( candidate->cache_map != NULL ) { - if ( candidate->uplink == NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } + uplink_init( candidate, -1, NULL, -1 ); } return candidate; // We did all we can, hopefully it's working @@ -484,17 +483,7 @@ void image_killUplinks() mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == NULL ) continue; - mutex_lock( &_images[i]->lock ); - if ( 
_images[i]->uplink != NULL ) { - mutex_lock( &_images[i]->uplink->queueLock ); - if ( !_images[i]->uplink->shutdown ) { - thread_detach( _images[i]->uplink->thread ); - _images[i]->uplink->shutdown = true; - } - mutex_unlock( &_images[i]->uplink->queueLock ); - signal_call( _images[i]->uplink->signal ); - } - mutex_unlock( &_images[i]->lock ); + uplink_shutdown( _images[i] ); } mutex_unlock( &imageListLock ); } @@ -588,11 +577,15 @@ bool image_tryFreeAll() static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); + assert( image->users == 0 ); if ( !_shutdown ) { logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); } - // - uplink_shutdown( image ); + // uplink_shutdown might return false to tell us + // that the shutdown is in progress. Bail out since + // this will get called again when the uplink is done. + if ( !uplink_shutdown( image ) ) + return NULL; mutex_lock( &image->lock ); free( image->cache_map ); free( image->crc32 ); @@ -860,7 +853,7 @@ static bool image_load(char *base, char *path, int withUplink) image->cache_map = cache_map; image->crc32 = crc32list; image->masterCrc32 = masterCrc; - image->uplink = NULL; + image->uplinkref = NULL; image->realFilesize = realFilesize; image->virtualFilesize = virtualFilesize; image->rid = (uint16_t)revision; @@ -1503,16 +1496,18 @@ json_t* image_getListAsJson() mutex_lock( &image->lock ); idleTime = (int)timing_diff( &image->atime, &now ); completeness = image_getCompletenessEstimate( image ); - if ( image->uplink == NULL ) { + mutex_unlock( &image->lock ); + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL ) { bytesReceived = 0; uplinkName[0] = '\0'; } else { - bytesReceived = image->uplink->bytesReceived; - if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) { + bytesReceived = uplink->bytesReceived; + if ( !uplink_getHostString( uplink, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } + ref_put( 
&uplink->reference ); } - mutex_unlock( &image->lock ); jsonImage = json_pack( "{sisssisisisisI}", "id", image->id, // id, name, rid never change, so access them without locking @@ -1734,7 +1729,7 @@ void image_closeUnusedFd() if ( image == NULL ) continue; mutex_lock( &image->lock ); - if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) { + if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) { snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid ); fd = image->readFd; image->readFd = -1; diff --git a/src/server/integrity.c b/src/server/integrity.c index 3d1ac9b..f358c46 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -4,6 +4,7 @@ #include "locks.h" #include "image.h" #include "uplink.h" +#include "reference.h" #include #include @@ -238,11 +239,13 @@ static void* integrity_main(void * data UNUSED) if ( i + 1 == queueLen ) queueLen--; // Mark as working again if applicable if ( !foundCorrupted ) { - mutex_lock( &image->lock ); - if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper? - image->working = image->uplink->current.fd != -1 && image->readFd != -1; + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper? + mutex_lock( &image->lock ); + image->working = uplink->current.fd != -1 && image->readFd != -1; + mutex_unlock( &image->lock ); + ref_put( &uplink->reference ); } - mutex_unlock( &image->lock ); } } else { // Still more blocks to go... 
@@ -255,12 +258,8 @@ static void* integrity_main(void * data UNUSED) // Something was fishy, make sure uplink exists mutex_lock( &image->lock ); image->working = false; - bool restart = image->uplink == NULL || image->uplink->shutdown; mutex_unlock( &image->lock ); - if ( restart ) { - uplink_shutdown( image ); - uplink_init( image, -1, NULL, -1 ); - } + uplink_init( image, -1, NULL, -1 ); } // Release :-) image_release( image ); diff --git a/src/server/net.c b/src/server/net.c index 4976eea..e0b516e 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -24,6 +24,7 @@ #include "locks.h" #include "rpc.h" #include "altservers.h" +#include "reference.h" #include "../shared/sockhelper.h" #include "../shared/timing.h" @@ -229,7 +230,7 @@ void* net_handleNewConnection(void *clientPtr) rid = serializer_get_uint16( &payload ); const uint8_t flags = serializer_get_uint8( &payload ); client->isServer = ( flags & FLAGS8_SERVER ); - if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) { + if ( unlikely( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) ) { if ( client_version < MIN_SUPPORTED_CLIENT ) { logadd( LOG_DEBUG1, "Client %s too old", client->hostName ); } else { @@ -257,22 +258,25 @@ void* net_handleNewConnection(void *clientPtr) } client->image = image; atomic_thread_fence( memory_order_release ); - if ( image == NULL ) { + if ( unlikely( image == NULL ) ) { //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); - } else if ( !image->working ) { + } else if ( unlikely( !image->working ) ) { logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n", client->hostName, image_name, (int)rid ); } else { - bool penalty; // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; if ( image->cache_map != NULL ) { - mutex_lock( &image->lock ); - if ( image->uplink == NULL || 
image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { bOk = ( rand() % 4 ) == 1; } - penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1; - mutex_unlock( &image->lock ); + bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 ); + if ( uplink == NULL ) { + uplink_init( image, -1, NULL, 0 ); + } else { + ref_put( &uplink->reference ); + } if ( penalty ) { // Wait 100ms if local caching is not working so this usleep( 100000 ); // server gets a penalty and is less likely to be selected } @@ -300,7 +304,7 @@ void* net_handleNewConnection(void *clientPtr) } } - if ( bOk ) { + if ( likely( bOk ) ) { // add artificial delay if applicable if ( client->isServer && _serverPenalty != 0 ) { usleep( _serverPenalty ); @@ -315,7 +319,7 @@ void* net_handleNewConnection(void *clientPtr) case CMD_GET_BLOCK:; const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking reply.handle = request.handle; - if ( offset >= image->virtualFilesize ) { + if ( unlikely( offset >= image->virtualFilesize ) ) { // Sanity check logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName ); reply.size = 0; @@ -323,7 +327,7 @@ void* net_handleNewConnection(void *clientPtr) send_reply( client->sock, &reply, NULL ); break; } - if ( offset + request.size > image->virtualFilesize ) { + if ( unlikely( offset + request.size > image->virtualFilesize ) ) { // Sanity check logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName ); reply.size = 0; @@ -398,7 +402,7 @@ void* net_handleNewConnection(void *clientPtr) reply.size = request.size; fixup_reply( reply ); - const bool lock = image->uplink != NULL; + const bool lock = image->uplinkref != NULL; if ( lock ) mutex_lock( 
&client->sendMutex ); // Send reply header if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) { @@ -696,9 +700,11 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client) { mutex_lock( &client->lock ); if ( client->image != NULL ) { - mutex_lock( &client->image->lock ); - if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client ); - mutex_unlock( &client->image->lock ); + dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref ); + if ( uplink != NULL ) { + uplink_removeClient( uplink, client ); + ref_put( &uplink->reference ); + } } mutex_lock( &client->sendMutex ); if ( client->sock != -1 ) { @@ -740,3 +746,15 @@ static bool addToList(dnbd3_client_t *client) return true; } +void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle) +{ + dnbd3_reply_t reply; + reply.magic = dnbd3_packet_magic; + reply.cmd = cmd; + reply.handle = handle; + reply.size = 0; + mutex_lock( &client->sendMutex ); + send_reply( client->sock, &reply, NULL ); + mutex_unlock( &client->sendMutex ); +} + diff --git a/src/server/net.h b/src/server/net.h index 6813b49..7719aef 100644 --- a/src/server/net.h +++ b/src/server/net.h @@ -37,4 +37,6 @@ void net_disconnectAll(); void net_waitForAllDisconnected(); +void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle); + #endif /* NET_H_ */ diff --git a/src/server/reference.c b/src/server/reference.c new file mode 100644 index 0000000..468e00b --- /dev/null +++ b/src/server/reference.c @@ -0,0 +1,33 @@ +#ifndef unlikely +#define unlikely(x) (x) +#endif +#include "reference.h" +#include +#include + +void ref_init( ref *reference, void ( *freefun )( ref * ), long count ) +{ + reference->count = count; + reference->free = freefun; +} + +_Noreturn void _ref_error( const char *message ) +{ + fprintf( stderr, "Reference counter overflow\n" ); + abort(); +} + +void ref_setref( weakref *weakref, ref *ref ) 
+{ + union _aligned_ref_ *new_weakref = 0; + if ( ref ) { + ( new_weakref = aligned_ref( ref->_aligned_ref ) )->ref = ref; + ref->count += sizeof( union _aligned_ref_ ) + 1; + } + char *old_weakref = (char *)atomic_exchange( weakref, new_weakref ); + if ( !old_weakref ) + return; + struct _ref_ *old_ref = aligned_ref( old_weakref )->ref; + old_ref->count += old_weakref - (char *)aligned_ref( old_weakref ) - sizeof( union _aligned_ref_ ); + ref_put( old_ref ); +} diff --git a/src/server/reference.h b/src/server/reference.h new file mode 100644 index 0000000..0bc081a --- /dev/null +++ b/src/server/reference.h @@ -0,0 +1,54 @@ +#ifndef _REFERENCE_H_ +#define _REFERENCE_H_ + +#include "reftypes.h" +#include +#include + +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - (char *)&(((type *)NULL)->member))) + +void ref_init( ref *reference, void ( *freefun )( ref * ), long count ); + +void ref_setref( weakref *weakref, ref *ref ); + +_Noreturn void _ref_error( const char *message ); + +static inline ref *ref_get( weakref *weakref ) +{ + char *old_weakref = (char *)*weakref; + do { + if ( old_weakref == NULL ) + return NULL; + if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) { + old_weakref = (char *)*weakref; + continue; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) ); + struct _ref_ *ref = aligned_ref( old_weakref )->ref; + if ( unlikely( ++ref->count == -1 ) ) { + _ref_error( "Reference counter overflow. Aborting.\n" ); + } + char *cur_weakref = ( char * )*weakref; + do { + if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) { + ref->count--; + break; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) ); + return ref; +} + +static inline void ref_put( ref *ref ) +{ + if ( --ref->count == 0 ) { + ref->free( ref ); + } +} + +#define ref_get_uplink(wr) ({ \ + ref* ref = ref_get( wr ); \ + ref == NULL ? 
NULL : container_of(ref, dnbd3_uplink_t, reference); \ +}) + +#endif diff --git a/src/server/reftypes.h b/src/server/reftypes.h new file mode 100644 index 0000000..45c0c20 --- /dev/null +++ b/src/server/reftypes.h @@ -0,0 +1,25 @@ +#ifndef _REFTYPES_H_ +#define _REFTYPES_H_ + +#include + +_Static_assert( sizeof( void * ) == sizeof( _Atomic( void * ) ), "Atomic pointer bad" ); + +typedef _Atomic( void * ) weakref; + +#define aligned_ref(ptr) \ + ((union _aligned_ref_ *)((ptr) - (uintptr_t)(ptr) % sizeof(union _aligned_ref_))) + +union _aligned_ref_ { + struct _ref_ *ref; + void *_padding[( 32 - 1 ) / sizeof( void * ) + 1]; +}; + +typedef struct _ref_ { + _Atomic long count; + void ( *free )( struct _ref_ * ); + char _padding[sizeof( union _aligned_ref_ )]; + char _aligned_ref[sizeof( union _aligned_ref_ )]; +} ref; + +#endif diff --git a/src/server/uplink.c b/src/server/uplink.c index abfebf0..7a39887 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -3,10 +3,12 @@ #include "locks.h" #include "image.h" #include "altservers.h" +#include "net.h" #include "../shared/sockhelper.h" #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "reference.h" #include #include @@ -45,6 +47,8 @@ static const char *const NAMES_ULR[4] = { static atomic_uint_fast64_t totalBytesReceived = 0; +static void cancelAllRequests(dnbd3_uplink_t *uplink); +static void uplink_free(ref *ref); static void* uplink_mainloop(void *data); static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); @@ -76,19 +80,24 @@ uint64_t uplink_getTotalBytesReceived() bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version) { if ( !_isProxy || _shutdown ) return false; - dnbd3_uplink_t *uplink = NULL; assert( image != NULL ); mutex_lock( &image->lock ); - if ( image->uplink != NULL && !image->uplink->shutdown ) { + 
dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { mutex_unlock( &image->lock ); - if ( sock >= 0 ) close( sock ); + if ( sock != -1 ) { + close( sock ); + } + ref_put( &uplink->reference ); return true; // There's already an uplink, so should we consider this success or failure? } if ( image->cache_map == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } - uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + // Start with one reference for the uplink thread. We'll return it when the thread finishes + ref_init( &uplink->reference, uplink_free, 1 ); mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE ); mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT ); mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); @@ -121,12 +130,13 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_ERROR, "Could not start thread for new uplink." ); goto failure; } + ref_setref( &image->uplinkref, &uplink->reference ); mutex_unlock( &image->lock ); return true; failure: ; if ( uplink != NULL ) { free( uplink ); - uplink = image->uplink = NULL; + uplink = NULL; } mutex_unlock( &image->lock ); return false; @@ -137,34 +147,83 @@ failure: ; * Calling it multiple times, even concurrently, will * not break anything. 
*/ -void uplink_shutdown(dnbd3_image_t *image) +bool uplink_shutdown(dnbd3_image_t *image) { - bool join = false; - pthread_t thread; assert( image != NULL ); mutex_lock( &image->lock ); - if ( image->uplink == NULL ) { + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL ) { mutex_unlock( &image->lock ); - return; + return true; } - dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { + image->users++; // Prevent free while uplink shuts down signal_call( uplink->signal ); - thread = uplink->thread; - join = true; + } else { + logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid ); } + cancelAllRequests( uplink ); + ref_setref( &image->uplinkref, NULL ); + ref_put( &uplink->reference ); mutex_unlock( &uplink->queueLock ); - bool wait = image->uplink != NULL; + bool retval = ( exp && image->users == 0 ); mutex_unlock( &image->lock ); - if ( join ) thread_join( thread, NULL ); - while ( wait ) { - usleep( 5000 ); - mutex_lock( &image->lock ); - wait = image->uplink != NULL && image->uplink->shutdown; - mutex_unlock( &image->lock ); + return exp; +} + +/** + * Cancel all requests of this uplink. 
+ * HOLD QUEUE LOCK WHILE CALLING + */ +static void cancelAllRequests(dnbd3_uplink_t *uplink) +{ + for ( int i = 0; i < uplink->queueLen; ++i ) { + if ( uplink->queue[i].status != ULR_FREE ) { + net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle ); + uplink->queue[i].status = ULR_FREE; + } + } + uplink->queueLen = 0; +} + +static void uplink_free(ref *ref) +{ + dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); + logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid ); + assert( uplink->queueLen == 0 ); + signal_close( uplink->signal ); + if ( uplink->current.fd != -1 ) { + close( uplink->current.fd ); + uplink->current.fd = -1; + } + if ( uplink->better.fd != -1 ) { + close( uplink->better.fd ); + uplink->better.fd = -1; + } + mutex_destroy( &uplink->queueLock ); + mutex_destroy( &uplink->rttLock ); + mutex_destroy( &uplink->sendMutex ); + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; + if ( uplink->cacheFd != -1 ) { + close( uplink->cacheFd ); } + // TODO Requeue any requests + dnbd3_image_t *image = image_lock( uplink->image ); + if ( image != NULL ) { + // != NULL means image is still in list... + if ( !_shutdown && image->cache_map != NULL ) { + // Ingegrity checker must have found something in the meantime + uplink_init( image, -1, NULL, 0 ); + } + image_release( image ); + } + // Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code + // of the uplink thread, depending on who set the uplink->shutdown flag. + image_release( image ); + free( uplink ); // !!! 
} /** @@ -193,31 +252,28 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) */ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) { - if ( client == NULL || client->image == NULL ) return false; + if ( client == NULL || client->image == NULL ) + return false; if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - mutex_lock( &client->image->lock ); - if ( client->image->uplink == NULL ) { - mutex_unlock( &client->image->lock ); + dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref ); + if ( uplink == NULL ) { logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); return false; } - dnbd3_uplink_t * const uplink = client->image->uplink; if ( uplink->shutdown ) { - mutex_unlock( &client->image->lock ); logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" ); - return false; + goto fail_ref; } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); - mutex_unlock( &client->image->lock ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." 
); - return false; + goto fail_ref; } int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise @@ -229,7 +285,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin const uint64_t end = start + length; mutex_lock( &uplink->queueLock ); - mutex_unlock( &client->image->lock ); + if ( uplink->shutdown ) { // Check again after locking to prevent lost requests + goto fail_lock; + } for (i = 0; i < uplink->queueLen; ++i) { // find free slot to place this request into if ( uplink->queue[i].status == ULR_FREE ) { @@ -257,18 +315,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( unlikely( requestLoop ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); - mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - return false; + goto fail_lock; } if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { freeSlot = -1; // Not attaching to existing request, make it use a higher slot } if ( freeSlot == -1 ) { if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { - mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." 
); - return false; + goto fail_lock; } freeSlot = uplink->queueLen++; } @@ -305,16 +361,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin #endif mutex_unlock( &uplink->queueLock ); - if ( foundExisting != -1 ) + if ( foundExisting != -1 ) { + ref_put( &uplink->reference ); return true; // Attached to pending request, do nothing - - usleep( 10000 ); + } // See if we can fire away the request - if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { + if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); } else { - if ( uplink->current.fd == -1 ) { + if ( unlikely( uplink->current.fd == -1 ) ) { mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { @@ -323,13 +379,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( hops < 200 ) ++hops; const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); - if ( !ret ) { + if ( unlikely( !ret ) ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again int state; mutex_lock( &uplink->queueLock ); - if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { + if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { state = uplink->queue[freeSlot].status; if ( uplink->queue[freeSlot].status == ULR_NEW ) { uplink->queue[freeSlot].status = ULR_PENDING; @@ -345,6 +401,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } else { logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", 
NAMES_ULR[uplink->queue[freeSlot].status] ); } + ref_put( &uplink->reference ); return true; } // Fall through to waking up sender thread @@ -354,7 +411,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } + ref_put( &uplink->reference ); return true; +fail_lock: + mutex_unlock( &uplink->queueLock ); +fail_ref: + ref_put( &uplink->reference ); + return false; } /** @@ -381,6 +444,7 @@ static void* uplink_mainloop(void *data) // assert( uplink != NULL ); setThreadName( "idle-uplink" ); + thread_detach( uplink->thread ); blockNoncriticalSignals(); // Make sure file is open for writing if ( !uplink_reopenCacheFd( uplink, false ) ) { @@ -553,7 +617,7 @@ static void* uplink_mainloop(void *data) for (i = 0; i < uplink->queueLen; ++i) { if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name, + "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name, uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); uplink->queue[i].entered = now; #ifdef _DEBUG_RESEND_STARVING @@ -572,55 +636,26 @@ static void* uplink_mainloop(void *data) #endif } cleanup: ; - // Detach depends on whether someone is joining this thread... 
- bool exp = false; - if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { - thread_detach( uplink->thread ); - } uplink_saveCacheMap( uplink ); dnbd3_image_t *image = uplink->image; mutex_lock( &image->lock ); - // in the list anymore, but we want to prevent it from being freed in either case - if ( image->uplink == uplink ) { - image->uplink = NULL; - } - mutex_unlock( &image->lock ); // Do NOT use image without locking it - mutex_lock( &uplink->queueLock ); - // Wait for active RTT measurement to finish - while ( uplink->rttTestResult == RTT_INPROGRESS ) { - usleep( 10000 ); - } - signal_close( uplink->signal ); - mutex_lock( &uplink->rttLock ); - mutex_lock( &uplink->sendMutex ); - if ( uplink->current.fd != -1 ) { - close( uplink->current.fd ); - uplink->current.fd = -1; - } - if ( uplink->better.fd != -1 ) { - close( uplink->better.fd ); - uplink->better.fd = -1; + bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { + image->users++; // We set the flag - hold onto image } - mutex_unlock( &uplink->sendMutex ); - mutex_unlock( &uplink->rttLock ); - mutex_unlock( &uplink->queueLock ); - mutex_destroy( &uplink->queueLock ); - mutex_destroy( &uplink->rttLock ); - mutex_destroy( &uplink->sendMutex ); - free( uplink->recvBuffer ); - uplink->recvBuffer = NULL; - if ( uplink->cacheFd != -1 ) { - close( uplink->cacheFd ); + dnbd3_uplink_t *current = ref_get_uplink( &image->uplinkref ); + if ( current == uplink ) { // Set NULL if it's still us... + mutex_lock( &uplink->queueLock ); + cancelAllRequests( uplink ); + mutex_unlock( &uplink->queueLock ); + ref_setref( &image->uplinkref, NULL ); } - free( uplink ); // !!! - if ( image_lock( image ) != NULL ) { - // Image is still in list... 
- if ( !_shutdown && image->cache_map != NULL ) { - // Ingegrity checker must have found something in the meantime - uplink_init( image, -1, NULL, 0 ); - } - image_release( image ); + if ( current != NULL ) { // Decrease ref in any case + ref_put( &current->reference ); } + mutex_unlock( &image->lock ); + // Finally as the thread is done, decrease our own ref that we initialized with + ref_put( &uplink->reference ); return NULL ; } @@ -637,7 +672,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); /* logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); + (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); */ mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; @@ -782,7 +817,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int /** * Receive data from uplink server and process/dispatch - * Locks on: link.lock, images[].lock + * Locks on: uplink.lock, images[].lock */ static void uplink_handleReceive(dnbd3_uplink_t *uplink) { @@ -924,13 +959,16 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } mutex_unlock( &client->sendMutex ); mutex_lock( &uplink->queueLock ); + if ( i > uplink->queueLen ) { + uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests + } } if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG if ( !served && start != uplink->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name,
start, end ); + logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end ); } #endif if ( start == uplink->replicationHandle ) { diff --git a/src/server/uplink.h b/src/server/uplink.h index acc8e11..49ff0b4 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -14,7 +14,7 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); -void uplink_shutdown(dnbd3_image_t *image); +bool uplink_shutdown(dnbd3_image_t *image); bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len); -- cgit v1.2.3-55-g7522 From ac1bf45ebdd630fbc9ad2c1fa3c0ea99f5206799 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 28 Aug 2019 13:07:13 +0200 Subject: [SERVER] Make signal handling more POSIX According to POSIX, a signal sent to a PID can be delivered to an arbitrary thread of that process that does not have the signal blocked. This seems to never happen on Linux, but would mess things up since the code expected the main signal handler to only be executed by the main thread. This should now be fixed by examining the source PID of the signal as well as the ID of the thread currently running the signal handler. If we notice the signal wasn't sent by our own PID and the handler is not currently run by the main thread, we re-send the signal to the main thread. Otherwise, if the signal was sent by our own PID but the handler is not run in the main thread, do nothing. This way we can use pthread_kill() to wake up threads that might be stuck in a blocking syscall when it's time to shut down.
--- src/server/globals.h | 1 + src/server/image.c | 10 ++-------- src/server/integrity.c | 17 +++++++++++++---- src/server/net.c | 11 ++++++----- src/server/rpc.c | 13 ++++++++----- src/server/server.c | 22 +++++++++++++++++----- src/server/threadpool.c | 28 ++++++++++++++++++++++------ src/server/threadpool.h | 5 +++++ 8 files changed, 74 insertions(+), 33 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 5dd205a..f940666 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -138,6 +138,7 @@ struct _dnbd3_client char hostName[HOSTNAMELEN]; // inet_ntop version of host pthread_mutex_t sendMutex; // Held while writing to sock if image is incomplete (since uplink uses socket too) pthread_mutex_t lock; + pthread_t thread; }; // ####################################################### diff --git a/src/server/image.c b/src/server/image.c index de93cd4..248c12c 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -562,9 +562,7 @@ bool image_tryFreeAll() if ( _images[i] != NULL && _images[i]->users == 0 ) { dnbd3_image_t *image = _images[i]; _images[i] = NULL; - mutex_unlock( &imageListLock ); image = image_free( image ); - mutex_lock( &imageListLock ); } if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--; } @@ -574,15 +572,13 @@ bool image_tryFreeAll() /** * Free image. DOES NOT check if it's in use. - * Indirectly locks on imageListLock, image.lock, uplink.queueLock + * (Indirectly) locks on image.lock, uplink.queueLock */ static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); assert( image->users == 0 ); - if ( !_shutdown ) { - logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); - } + logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid ); // uplink_shutdown might return false to tell us // that the shutdown is in progress. 
Bail out since // this will get called again when the uplink is done. @@ -600,8 +596,6 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) mutex_unlock( &image->lock ); if ( image->readFd != -1 ) close( image->readFd ); mutex_destroy( &image->lock ); - // - memset( image, 0, sizeof(*image) ); free( image ); return NULL ; } diff --git a/src/server/integrity.c b/src/server/integrity.c index f358c46..e7ebeb2 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -184,13 +184,20 @@ static void* integrity_main(void * data UNUSED) mutex_unlock( &image->lock ); } #if defined(linux) || defined(__linux) - if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) { + while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) #else - if ( fsync( fd ) == -1 ) { + while ( fsync( fd ) == -1 ) #endif - logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path ); + { + if ( _shutdown ) + break; + if ( errno == EINTR ) + continue; + logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, errno ); exit( 1 ); } + if ( _shutdown ) + break; // Use direct I/O only if read length is multiple of 4096 to be on the safe side int tfd; if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) { @@ -266,7 +273,9 @@ static void* integrity_main(void * data UNUSED) } } mutex_unlock( &integrityQueueLock ); - if ( buffer != NULL ) free( buffer ); + if ( buffer != NULL ) { + free( buffer ); + } bRunning = false; return NULL; } diff --git a/src/server/net.c b/src/server/net.c index e0b516e..9c855e4 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -44,6 +44,7 @@ #include #include #include +#include static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS]; static int _num_clients = 0; @@ -153,6 +154,7 @@ void* net_handleNewConnection(void *clientPtr) { dnbd3_client_t * const client = 
(dnbd3_client_t *)clientPtr; dnbd3_request_t request; + client->thread = pthread_self(); // Await data from client. Since this is a fresh connection, we expect data right away sock_setTimeout( client->sock, _clientTimeout ); @@ -631,11 +633,10 @@ void net_disconnectAll() int i; mutex_lock( &_clients_lock ); for (i = 0; i < _num_clients; ++i) { - if ( _clients[i] == NULL ) continue; - dnbd3_client_t * const client = _clients[i]; - mutex_lock( &client->lock ); - if ( client->sock >= 0 ) shutdown( client->sock, SHUT_RDWR ); - mutex_unlock( &client->lock ); + if ( _clients[i] == NULL ) + continue; + shutdown( _clients[i]->sock, SHUT_RDWR ); + pthread_kill( _clients[i]->thread, SIGINT ); } mutex_unlock( &_clients_lock ); } diff --git a/src/server/rpc.c b/src/server/rpc.c index 261c6c0..662263e 100644 --- a/src/server/rpc.c +++ b/src/server/rpc.c @@ -137,13 +137,13 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int bool hasName = false; bool ok; int keepAlive = HTTP_KEEPALIVE; - do { + while ( !_shutdown ) { // Read request from client struct phr_header headers[100]; size_t numHeaders, prevLen = 0, consumed; struct string method, path; int minorVersion; - do { + while ( !_shutdown ) { // Parse before calling recv, there might be a complete pipelined request in the buffer already // If the request is incomplete, we allow exactly one additional recv() to complete it. 
// This should suffice for real world scenarios as I don't know of any @@ -188,7 +188,9 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE ); goto func_return; } - } while ( true ); + } // Loop while request header incomplete + if ( _shutdown ) + break; if ( keepAlive == HTTP_KEEPALIVE ) { // Only keep the connection alive (and indicate so) if the client seems to support this if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) { @@ -213,7 +215,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int } else { ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive ); } - if ( !ok ) break; + if ( !ok ) + break; } // hoff might be beyond end if the client sent another request (burst) const ssize_t extra = hoff - consumed; @@ -225,7 +228,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int hasName = true; setThreadName( "HTTP" ); } - } while (true); + } // Loop while more requests func_return:; do { const int curCount = --status.count; diff --git a/src/server/server.c b/src/server/server.c index 1cdd2ab..0dddea7 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #define LONGOPT_CRC4 1000 #define LONGOPT_ASSERT 1001 @@ -60,6 +62,7 @@ static _Atomic(job_t *) newJob; static bool hasTimerThread = false; static pthread_t timerThread; +static pid_t mainPid; static pthread_t mainThread; #define DEFAULT_TIMER_TIMEOUT (60) @@ -138,7 +141,7 @@ _Noreturn static void dnbd3_cleanup() logadd( LOG_INFO, "Cleanup..." 
); if ( hasTimerThread ) { - pthread_kill( timerThread, SIGHUP ); + pthread_kill( timerThread, SIGINT ); thread_join( timerThread, NULL ); } @@ -162,6 +165,8 @@ _Noreturn static void dnbd3_cleanup() // Wait for clients to disconnect net_waitForAllDisconnected(); + threadpool_waitEmpty(); + // Clean up images retries = 5; while ( !image_tryFreeAll() && --retries > 0 ) { @@ -204,6 +209,7 @@ int main(int argc, char *argv[]) { 0, 0, 0, 0 } }; + mainPid = getpid(); mainThread = pthread_self(); opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); @@ -509,10 +515,16 @@ static void dnbd3_handleSignal(int signum) static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED) { - if ( !pthread_equal( pthread_self(), mainThread ) ) - return; - memcpy( &lastSignal, info, sizeof(siginfo_t) ); - dnbd3_handleSignal( signum ); + if ( info->si_pid != mainPid ) { // Source is not this process + memcpy( &lastSignal, info, sizeof(siginfo_t) ); // Copy signal info + if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) { + pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread + } + } + if ( pthread_equal( pthread_self(), mainThread ) ) { + // Signal received by main thread -- handle + dnbd3_handleSignal( signum ); + } } uint32_t dnbd3_serverUptime() diff --git a/src/server/threadpool.c b/src/server/threadpool.c index 3947677..0b46fd6 100644 --- a/src/server/threadpool.c +++ b/src/server/threadpool.c @@ -15,6 +15,7 @@ static void *threadpool_worker(void *entryPtr); static pthread_attr_t threadAttrs; static atomic_int maxIdleThreads = -1; static _Atomic(entry_t *) *pool = NULL; +static atomic_int activeThreads = 0; bool threadpool_init(int maxIdle) { @@ -34,10 +35,9 @@ bool threadpool_init(int maxIdle) void threadpool_close() { - _shutdown = true; - int max = maxIdleThreads; - maxIdleThreads = -1; - if ( max <= 0 ) return; + int max = atomic_exchange( &maxIdleThreads, -1 ); + if ( max <= 0 ) + return; 
for ( int i = 0; i < max; ++i ) { entry_t *cur = pool[i]; if ( cur != NULL && atomic_compare_exchange_strong( &pool[i], &cur, NULL ) ) { @@ -46,9 +46,23 @@ void threadpool_close() } } +void threadpool_waitEmpty() +{ + if ( activeThreads == 0 ) + return; + do { + sleep( 1 ); + logadd( LOG_INFO, "Threadpool: %d threads still active", (int)activeThreads ); + } while ( activeThreads != 0 ); +} + bool threadpool_run(void *(*startRoutine)(void *), void *arg) { - if ( startRoutine == NULL ) { + if ( unlikely( _shutdown ) ) { + logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" ); + return false; + } + if ( unlikely( startRoutine == NULL ) ) { logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" ); return false; // Or bail out!? } @@ -60,7 +74,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg) break; } } - if ( entry == NULL ) { + if ( unlikely( entry == NULL ) ) { entry = malloc( sizeof(entry_t) ); if ( entry == NULL ) { logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" ); @@ -78,6 +92,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg) free( entry ); return false; } + activeThreads++; } entry->startRoutine = startRoutine; entry->arg = arg; @@ -130,6 +145,7 @@ keep_going:; } signal_close( entry->signal ); free( entry ); + activeThreads--; return NULL; } diff --git a/src/server/threadpool.h b/src/server/threadpool.h index 15dd151..ee0b3aa 100644 --- a/src/server/threadpool.h +++ b/src/server/threadpool.h @@ -17,6 +17,11 @@ bool threadpool_init(int maxIdleThreadCount); */ void threadpool_close(); +/** + * Block until all threads spawned have exited + */ +void threadpool_waitEmpty(); + /** * Run a thread using the thread pool. 
* @param startRoutine function to run in new thread -- cgit v1.2.3-55-g7522 From 88695877f085af475a6ca8a01c2fbb08eb5b15da Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 29 Aug 2019 14:49:18 +0200 Subject: [SERVER] Use weakref for cache maps Gets rid of a bunch of locking, especially the hot path in net.c where clients are requesting data. Many clients using the same incomplete image previously created a bottleneck here. --- src/server/globals.h | 10 ++- src/server/image.c | 208 +++++++++++++++++++++++++++++++------------------ src/server/image.h | 2 +- src/server/integrity.c | 10 ++- src/server/net.c | 81 +++++++++---------- src/server/reference.h | 5 ++ src/server/uplink.c | 64 +++++++-------- 7 files changed, 220 insertions(+), 160 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index f940666..221af78 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -99,6 +99,12 @@ typedef struct int permissions; } dnbd3_access_rule_t; +typedef struct +{ + ref reference; + atomic_uint_least8_t map[]; +} dnbd3_cache_map_t; + /** * Image struct. An image path could be something like * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1 @@ -110,7 +116,7 @@ struct _dnbd3_image char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) weakref uplinkref; // pointer to a server connection - uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete + weakref ref_cacheMap; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time @@ -119,7 +125,7 @@ struct _dnbd3_image uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list int readFd; // used to read the image.
Used from multiple threads, so use atomic operations (pread et al) - int completenessEstimate; // Completeness estimate in percent + atomic_int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected diff --git a/src/server/image.c b/src/server/image.c index 4eab1d2..1972f48 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -51,10 +51,18 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); static bool image_ensureDiskSpace(uint64_t size, bool force); -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map); static void* closeUnusedFds(void*); +static void allocCacheMap(dnbd3_image_t *image, bool complete); + +static void cmfree(ref *ref) +{ + dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference); + logadd( LOG_DEBUG2, "Freeing a cache map" ); + free( cache ); +} // ########################################## @@ -70,7 +78,6 @@ void 
image_serverStartup() /** * Update cache-map of given image for the given byte range * start (inclusive) - end (exclusive) - * Locks on: images[].lock */ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) { @@ -91,33 +98,55 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co if ( start >= end ) return; bool setNewBlocks = false; - uint64_t pos = start; - mutex_lock( &image->lock ); - if ( image->cache_map == NULL ) { + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { // Image seems already complete if ( set ) { // This makes no sense - mutex_unlock( &image->lock ); - logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path ); return; } // Recreate a cache map, set it to all 1 initially as we assume the image was complete - const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - image->cache_map = malloc( byteSize ); - memset( image->cache_map, 0xff, byteSize ); - } - while ( pos < end ) { - const size_t map_y = (int)( pos >> 15 ); - const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 - const int bit_mask = 1 << map_x; - if ( set ) { - if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; - image->cache_map[map_y] |= (uint8_t)bit_mask; - } else { - image->cache_map[map_y] &= (uint8_t)~bit_mask; + allocCacheMap( image, true ); + cache = ref_get_cachemap( image ); + if ( cache == NULL ) { + logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! 
%s", image->path ); + return; } - pos += DNBD3_BLOCK_SIZE; } + // Set/unset + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + // First byte + uint8_t fb = 0, lb = 0; + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + fb |= bit_mask; + } + // Last byte + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + lb |= bit_mask; + } + if ( set ) { + uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed ); + uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed ); + setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] ); + } else { + atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed ); + atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed ); + } + const uint8_t nval = set ? 
0xff : 0; + // Everything in between + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) { + setNewBlocks = true; + } + } + atomic_thread_fence( memory_order_release ); if ( setNewBlocks && image->crc32 != NULL ) { // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks // for checking, even though this might lead to checking some hash block again, if it was @@ -125,19 +154,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co // First set start and end to borders of hash blocks start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); - pos = start; - while ( pos < end ) { - if ( image->cache_map == NULL ) break; + for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) { const int block = (int)( pos / HASH_BLOCK_SIZE ); - if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { - mutex_unlock( &image->lock ); + if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) { integrity_check( image, block ); - mutex_lock( &image->lock ); } - pos += HASH_BLOCK_SIZE; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); } /** @@ -149,20 +173,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co bool image_isComplete(dnbd3_image_t *image) { assert( image != NULL ); - mutex_lock( &image->lock ); if ( image->virtualFilesize == 0 ) { - mutex_unlock( &image->lock ); return false; } - if ( image->cache_map == NULL ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { return true; } bool complete = true; int j; const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); for (j = 0; j < map_len_bytes - 1; ++j) { - if ( image->cache_map[j] != 0xFF ) { + if ( cache->map[j] != 
0xFF ) { complete = false; break; } @@ -177,18 +199,27 @@ bool image_isComplete(dnbd3_image_t *image) for (j = 0; j < blocks_in_last_byte; ++j) last_byte |= (uint8_t)(1 << j); } - complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte); } - if ( !complete ) { - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); + if ( !complete ) return false; + mutex_lock( &image->lock ); + // Lock and make sure current cache map is still the one we saw complete + dnbd3_cache_map_t *current = ref_get_cachemap( image ); + if ( current == cache ) { + // Set cache map NULL as it's complete + ref_setref( &image->ref_cacheMap, NULL ); + } + if ( current != NULL ) { + ref_put( &current->reference ); } - char mapfile[PATHLEN] = ""; - free( image->cache_map ); - image->cache_map = NULL; - snprintf( mapfile, PATHLEN, "%s.map", image->path ); mutex_unlock( &image->lock ); - unlink( mapfile ); + if ( current == cache ) { // Successfully set cache map to NULL above + char mapfile[PATHLEN] = ""; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + unlink( mapfile ); + } return true; } @@ -350,19 +381,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->rid = candidate->rid; img->users = 1; img->working = false; + img->ref_cacheMap = NULL; mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); img->crc32 = malloc( mb ); memcpy( img->crc32, candidate->crc32, mb ); } - mutex_lock( &candidate->lock ); - if ( candidate->cache_map != NULL ) { - const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); - img->cache_map = malloc( mb ); - memcpy( img->cache_map, candidate->cache_map, mb ); + dnbd3_cache_map_t *cache = ref_get_cachemap( candidate ); + if ( cache != NULL ) { + ref_setref( &img->ref_cacheMap, &cache->reference ); + ref_put(
&cache->reference ); } - mutex_unlock( &candidate->lock ); if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; @@ -377,7 +407,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) } // Check if image is incomplete, handle - if ( candidate->cache_map != NULL ) { + if ( candidate->ref_cacheMap != NULL ) { uplink_init( candidate, -1, NULL, -1 ); } @@ -585,11 +615,10 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) if ( !uplink_shutdown( image ) ) return NULL; mutex_lock( &image->lock ); - free( image->cache_map ); + ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); free( image->path ); free( image->name ); - image->cache_map = NULL; image->crc32 = NULL; image->path = NULL; image->name = NULL; @@ -600,7 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) return NULL ; } -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) { if ( cacheMap == NULL ) return true; const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; @@ -707,7 +736,7 @@ static bool image_load(char *base, char *path, int withUplink) { int revision = -1; struct stat st; - uint8_t *cache_map = NULL; + dnbd3_cache_map_t *cache = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; int fdImage = -1; @@ -790,7 +819,7 @@ static bool image_load(char *base, char *path, int withUplink) } // 1. 
Allocate memory for the cache map if the image is incomplete - cache_map = image_loadCacheMap( path, virtualFilesize ); + cache = image_loadCacheMap( path, virtualFilesize ); // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) @@ -802,7 +831,7 @@ static bool image_load(char *base, char *path, int withUplink) // Check CRC32 if ( crc32list != NULL ) { - if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { + if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) { logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path ); doFullCheck = true; } @@ -826,7 +855,7 @@ static bool image_load(char *base, char *path, int withUplink) crc32list = NULL; function_return = true; goto load_error; // Keep existing - } else if ( existing->cache_map != NULL && cache_map == NULL ) { + } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); function_return = true; @@ -846,7 +875,8 @@ static bool image_load(char *base, char *path, int withUplink) dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); image->path = strdup( path ); image->name = strdup( imgName ); - image->cache_map = cache_map; + image->ref_cacheMap = NULL; + ref_setref( &image->ref_cacheMap, &cache->reference ); image->crc32 = crc32list; image->masterCrc32 = masterCrc; image->uplinkref = NULL; @@ -855,7 +885,7 @@ static bool image_load(char *base, char *path, int withUplink) image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = (image->cache_map == NULL ); + image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); @@ -870,16 +900,16 @@ static bool 
image_load(char *base, char *path, int withUplink) timing_gets( &image->atime, offset ); // Prevent freeing in cleanup - cache_map = NULL; + cache = NULL; crc32list = NULL; // Get rid of cache map if image is complete - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image_isComplete( image ); } // Image is definitely incomplete, initialize uplink worker - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image->working = false; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); @@ -910,21 +940,22 @@ static bool image_load(char *base, char *path, int withUplink) load_error: ; if ( existing != NULL ) existing = image_release( existing ); if ( crc32list != NULL ) free( crc32list ); - if ( cache_map != NULL ) free( cache_map ); + if ( cache != NULL ) free( cache ); if ( fdImage != -1 ) close( fdImage ); return function_return; } -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) { - uint8_t *retval = NULL; + dnbd3_cache_map_t *retval = NULL; char mapFile[strlen( imagePath ) + 10 + 1]; sprintf( mapFile, "%s.map", imagePath ); int fdMap = open( mapFile, O_RDONLY ); - if ( fdMap >= 0 ) { + if ( fdMap != -1 ) { const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); - retval = calloc( 1, map_size ); - const ssize_t rd = read( fdMap, retval, map_size ); + retval = calloc( 1, sizeof(*retval) + map_size ); + ref_init( &retval->reference, cmfree, 0 ); + const ssize_t rd = read( fdMap, retval->map, map_size ); if ( map_size != rd ) { logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); // Could not read complete map, that means the rest of the image file will be considered incomplete @@ -985,7 +1016,7 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static bool 
image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map) { // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. This is very sloppy and is merely supposed to detect @@ -1529,30 +1560,37 @@ json_t* image_getListAsJson() /** * Get completeness of an image in percent. Only estimated, not exact. * Returns: 0-100 - * DOES NOT LOCK, so make sure to do so before calling */ int image_getCompletenessEstimate(dnbd3_image_t * const image) { assert( image != NULL ); - if ( image->cache_map == NULL ) return image->working ? 100 : 0; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return image->working ? 100 : 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( unlikely( len == 0 ) ) { + ref_put( &cache->reference ); + return 0; + } declare_now; if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { // Since this operation is relatively expensive, we cache the result for a while + ref_put( &cache->reference ); return image->completenessEstimate; } int i; int percent = 0; - const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - if ( len == 0 ) return 0; for ( i = 0; i < len; ++i ) { - if ( image->cache_map[i] == 0xff ) { + const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + if ( v == 0xff ) { percent += 100; - } else if ( image->cache_map[i] != 0 ) { + } else if ( v != 0 ) { percent += 50; } } + ref_put( &cache->reference ); image->completenessEstimate = percent / len; - timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 ); return image->completenessEstimate; } @@ -1744,3 +1782,21 @@ static 
void* closeUnusedFds(void* nix UNUSED) } return NULL; } + +static void allocCacheMap(dnbd3_image_t *image, bool complete) +{ + const uint8_t val = complete ? 0xff : 0; + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize ); + ref_init( &cache->reference, cmfree, 0 ); + memset( cache->map, val, byteSize ); + mutex_lock( &image->lock ); + if ( image->ref_cacheMap != NULL ) { + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + free( cache ); + } else { + ref_setref( &image->ref_cacheMap, &cache->reference ); + } + mutex_unlock( &image->lock ); +} + diff --git a/src/server/image.h b/src/server/image.h index 4668eff..cd87f03 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -9,7 +9,7 @@ void image_serverStartup(); bool image_isComplete(dnbd3_image_t *image); -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set); diff --git a/src/server/integrity.c b/src/server/integrity.c index 1fcb558..a9fbae6 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -181,10 +181,12 @@ static void* integrity_main(void * data UNUSED) const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); bool complete = true; if ( qCount == CHECK_ALL ) { - // When checking full image, skip incomplete blocks, otherwise assume block is complete - mutex_lock( &image->lock ); - complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize ); - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + // When checking full image, skip incomplete blocks, 
otherwise assume block is complete + complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); + ref_put( &cache->reference ); + } } #if defined(linux) || defined(__linux) while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) diff --git a/src/server/net.c b/src/server/net.c index 9c855e4..12bcdad 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -246,7 +246,7 @@ void* net_handleNewConnection(void *clientPtr) // We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does... // Reject, as this would basically force this proxy to do BGR too. image = image_get( image_name, rid, true ); - if ( image != NULL && image->cache_map != NULL ) { + if ( image != NULL && image->ref_cacheMap != NULL ) { // Only exception is if the image is complete locally image = image_release( image ); } @@ -268,7 +268,7 @@ void* net_handleNewConnection(void *clientPtr) } else { // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { bOk = ( rand() % 4 ) == 1; @@ -338,57 +338,52 @@ void* net_handleNewConnection(void *clientPtr) break; } - if ( request.size != 0 && image->cache_map != NULL ) { + dnbd3_cache_map_t *cache; + if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) { // This is a proxyed image, check if we need to relay the request... 
start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); bool isCached = true; - mutex_lock( &image->lock ); - // Check again as we only aquired the lock just now - if ( image->cache_map != NULL ) { - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - uint64_t pos; - // Middle - quick checking - if ( isCached ) { - pos = firstByteInMap + 1; - while ( pos < lastByteInMap ) { - if ( image->cache_map[pos] != 0xff ) { - isCached = false; - break; - } - ++pos; + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + uint8_t b; + atomic_thread_fence( memory_order_acquire ); + // Middle - quick checking + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + isCached = false; + break; } } - // First byte - if ( isCached ) { - pos = start; - do { - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; - } while ( firstByteInMap == (pos >> 15) && pos < end ); + } + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; + } } - // Last byte - only check if request spans multiple bytes in cache map - if ( isCached && firstByteInMap != lastByteInMap ) { - pos = lastByteInMap << 15; - while ( pos < end ) { - assert( lastByteInMap == (pos >> 15) ); - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t 
bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; + } + // Last byte - only check if request spans multiple bytes in cache map + if ( isCached && firstByteInMap != lastByteInMap ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + assert( lastByteInMap == (pos >> 15) ); + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; } } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( !isCached ) { if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", diff --git a/src/server/reference.h b/src/server/reference.h index 8883eb1..2a80955 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -51,4 +51,9 @@ static inline void ref_put( ref *ref ) ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \ }) +#define ref_get_cachemap(image) ({ \ + ref* ref = ref_get( &(image)->ref_cacheMap ); \ + ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \ +}) + #endif diff --git a/src/server/uplink.c b/src/server/uplink.c index d77be9c..0a6bd11 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -91,7 +91,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version ref_put( &uplink->reference ); return true; // There's already an uplink, so should we consider this success or failure? 
} - if ( image->cache_map == NULL ) { + if ( image->ref_cacheMap == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } @@ -170,7 +170,7 @@ bool uplink_shutdown(dnbd3_image_t *image) mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); mutex_unlock( &image->lock ); - return exp; + return retval; } /** @@ -214,7 +214,7 @@ static void uplink_free(ref *ref) dnbd3_image_t *image = image_lock( uplink->image ); if ( image != NULL ) { // != NULL means image is still in list... - if ( !_shutdown && image->cache_map != NULL ) { + if ( !_shutdown && image->ref_cacheMap != NULL ) { // Ingegrity checker must have found something in the meantime uplink_init( image, -1, NULL, 0 ); } @@ -707,13 +707,14 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( uplink == NULL || uplink->current.fd == -1 ) return; if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return; + return; // Already a replication request on the wire, or no more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; - mutex_lock( &image->lock ); - if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { - // No cache map (=image complete), or replication pending, or not enough users, do nothing - mutex_unlock( &image->lock ); + if ( image->users < _bgrMinClients ) return; // Not enough active users + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL || image->users < _bgrMinClients ) { + // No cache map (=image complete) + ref_put( &cache->reference ); return; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); @@ -727,16 +728,18 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) endByte = 
mapBytes; } } + atomic_thread_fence( memory_order_acquire ); int replicationIndex = -1; for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { // Found incomplete one replicationIndex = i; break; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); @@ -768,23 +771,24 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) } /** - * find next index into cache_map that corresponds to the beginning + * find next index into cache map that corresponds to the beginning * of a hash block which is neither completely empty nor completely * replicated yet. Returns -1 if no match. 
*/ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; - mutex_lock( &uplink->image->lock ); - const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); - const uint8_t *cache_map = uplink->image->cache_map; - if ( cache_map != NULL ) { - int j; + dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image ); + if ( cache != NULL ) { + const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); + atomic_thread_fence( memory_order_acquire ); + int j; for (j = 0; j < mapBytes; ++j) { const int i = ( start + j ) % mapBytes; - const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); - const bool isEmpty = cache_map[i] == 0; + const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); + const bool isEmpty = b == 0; if ( !isEmpty && !isFull ) { // Neither full nor empty, replicate if ( retval == -1 ) { @@ -811,7 +815,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int retval = -1; } } - mutex_unlock( &uplink->image->lock ); + ref_put( &cache->reference ); return retval; } @@ -1107,7 +1111,7 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) if ( fsync( uplink->cacheFd ) == -1 ) { // A failing fsync means we have no guarantee that any data // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache_map from the last successful fsync + // from keeping the cache map from the last successful fsync // around and restoring it there isn't much we can do to recover // a consistent state. Bail out. 
logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); @@ -1116,21 +1120,13 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } } - if ( image->cache_map == NULL ) return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - mutex_lock( &image->lock ); - // Lock and get a copy of the cache map, as it could be freed by another thread that is just about to - // figure out that this image's cache copy is complete - if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) return true; - } + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - uint8_t *map = malloc( size ); - memcpy( map, image->cache_map, size ); // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image, // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O - mutex_unlock( &image->lock ); assert( image->path != NULL ); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); @@ -1139,14 +1135,14 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); if ( fd == -1 ) { const int err = errno; - free( map ); + ref_put( &cache->reference ); logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); return false; } size_t done = 0; while ( done < size ) { - const ssize_t ret = write( fd, map, size - done ); + const ssize_t ret = write( fd, cache->map + done, size - done ); if ( ret == -1 ) { if ( errno == EINTR ) continue; logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); @@ -1158,11 +1154,11 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } done += 
(size_t)ret; } + ref_put( &cache->reference ); if ( fsync( fd ) == -1 ) { logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); } close( fd ); - free( map ); return true; } -- cgit v1.2.3-55-g7522 From 543877c7fc17c0a881d6a85c76dfc17f8def7dff Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 4 Sep 2019 20:06:11 +0200 Subject: [SERVER] Support limiting alt-servers to specific namespace Not really namespace but simple string matching for the image path. Path is matched from start with no support for glob or regex, so usually you want to have a trailing '/' to limit to certain directories. --- src/server/altservers.c | 51 +++++++++++++++++++++++++++++++++++-------------- src/server/altservers.h | 4 ++-- src/server/globals.h | 8 ++++++++ src/server/image.c | 2 +- src/server/net.c | 2 +- 5 files changed, 49 insertions(+), 18 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 952af4f..943345c 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -23,7 +23,7 @@ static atomic_int numAltServers = 0; static pthread_mutex_t altServersLock; static void *altservers_runCheck(void *data); -static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current); +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current); static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink); static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt); static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server); @@ -86,6 +86,13 @@ static int addAltFromIni(void *countptr, const char* section, const char* key, c } } else if ( strcmp( key, "comment" ) == 0 ) { snprintf( altServers[index].comment, COMMENT_LENGTH, "%s", value ); + } else if ( strcmp( key, "namespace" ) == 0 ) { + dnbd3_ns_t *elem = malloc( sizeof(*elem) ); + elem->name = strdup( 
value ); + elem->len = strlen( value ); + do { + elem->next = altServers[index].nameSpaces; + } while ( !atomic_compare_exchange_weak( &altServers[index].nameSpaces, &elem->next, elem ) ); } else { logadd( LOG_DEBUG1, "Unknown key in alt-servers section: '%s'", key ); } @@ -139,6 +146,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate altServers[freeSlot].host = *host; altServers[freeSlot].isPrivate = isPrivate; altServers[freeSlot].isClientOnly = isClientOnly; + altServers[freeSlot].nameSpaces = NULL; if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment ); mutex_unlock( &altServersLock ); *index = freeSlot; @@ -171,15 +179,28 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) } } +static bool isImageAllowed(dnbd3_alt_server_t *alt, const char *image) +{ + if ( alt->nameSpaces == NULL ) + return true; + for ( dnbd3_ns_t *it = alt->nameSpaces; it != NULL; it = it->next ) { + if ( strncmp( it->name, image, it->len ) == 0 ) + return true; + } + return false; +} + /** * Get known (working) alt servers, ordered by network closeness * (by finding the smallest possible subnet) * Private servers are excluded, so this is what you want to call to * get a list of servers you can tell a client about */ -int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size) +int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size) { - if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0; + dnbd3_host_t *host = &client->host; + if ( host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) + return 0; int i, j; int count = 0; uint16_t scores[SERVER_MAX_ALTS] = { 0 }; @@ -188,11 +209,9 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output for ( i = 0; i < numAltServers; ++i ) { if ( altServers[i].host.type == 0 || altServers[i].isPrivate ) continue; 
// Slot is empty or uplink is for replication only - if ( host->type == altServers[i].host.type ) { - scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) ); - } else { - scores[i] = 1; // Wrong address family - } + if ( !isImageAllowed( &altServers[i], client->image->name ) ) + continue; + scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) ); } while ( count < size ) { i = -1; @@ -244,10 +263,10 @@ static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now ) return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN; } -int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) +int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size) { int idx[size]; - int num = altservers_getListForUplink( NULL, idx, size, -1 ); + int num = altservers_getListForUplink( NULL, image, idx, size, -1 ); for ( int i = 0; i < num; ++i ) { servers[i] = altServers[i].host; } @@ -261,7 +280,7 @@ int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) * it includes private servers and ignores any "client only" servers * @param current index of server for current connection, or -1 in panic mode */ -static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current) +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current) { if ( size <= 0 ) return 0; @@ -272,7 +291,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int if ( numAltServers <= size ) { for ( int i = 0; i < numAltServers; ++i ) { if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) { - servers[count++] = i; + if ( isImageAllowed( &altServers[i], image ) ) { + servers[count++] = i; + } } } } else { @@ -286,7 +307,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int int idx = rand() 
% numAltServers; if ( state[idx] != 0 ) continue; - if ( isUsableForUplink( uplink, idx, &now ) ) { + if ( !isImageAllowed( &altServers[idx], image ) ) { + state[idx] = 2; // Mark as used without adding, so it will be ignored in panic loop + } else if ( isUsableForUplink( uplink, idx, &now ) ) { servers[count++] = idx; state[idx] = 2; // Used } else { @@ -469,7 +492,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) current = uplink->current.index; // Current server index (or last one in panic mode) mutex_unlock( &uplink->rttLock ); // First, get 4 alt servers - numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current ); + numAlts = altservers_getListForUplink( uplink, uplink->image->name, servers, ALTS, panic ? -1 : current ); // If we're already connected and only got one server anyways, there isn't much to do if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) { uplink->rttTestResult = RTT_DONTCHANGE; diff --git a/src/server/altservers.h b/src/server/altservers.h index 1e1f119..8e29aaa 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -15,9 +15,9 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink); void altservers_findUplink(dnbd3_uplink_t *uplink); -int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size); +int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size); -int altservers_getHostListForReplication(dnbd3_host_t *servers, int size); +int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size); bool altservers_toString(int server, char *buffer, size_t len); diff --git a/src/server/globals.h b/src/server/globals.h index 221af78..ebdc1c7 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -31,6 +31,13 @@ typedef struct uint8_t hopCount; // How many hops this request has already taken across proxies } dnbd3_queued_request_t; +typedef struct _ns +{ + struct _ns *next; + 
char *name; + size_t len; +} dnbd3_ns_t; + typedef struct { int fails; // Hard fail: Connection failed @@ -41,6 +48,7 @@ typedef struct ticks lastFail; // Last hard fail dnbd3_host_t host; char comment[COMMENT_LENGTH]; + _Atomic(dnbd3_ns_t *) nameSpaces; // Linked list of name spaces } dnbd3_alt_server_t; typedef struct diff --git a/src/server/image.c b/src/server/image.c index bdb910d..86e6b87 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1204,7 +1204,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, dnbd3_host_t servers[REP_NUM_SRV]; int uplinkSock = -1; dnbd3_host_t uplinkServer; - const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV ); + const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV ); uint16_t remoteProtocolVersion; uint16_t remoteRid = revision; uint64_t remoteImageSize; diff --git a/src/server/net.c b/src/server/net.c index 00c9a8d..aba4e7d 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -477,7 +477,7 @@ void* net_handleNewConnection(void *clientPtr) case CMD_GET_SERVERS: // Build list of known working alt servers - num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS ); + num = altservers_getListForClient( client, server_list, NUMBER_SERVERS ); reply.cmd = CMD_GET_SERVERS; reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) ); mutex_lock( &client->sendMutex ); -- cgit v1.2.3-55-g7522 From bf665f59411840c60b6e3c9ac33f28a818233c0a Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 5 Sep 2019 18:15:52 +0200 Subject: [SERVER] Introduce autoFreeDiskSpaceDelay This setting allows you to control the formerly hard-coded timeout of 10 hours before a proxy would start deleting old images in order to free up space for new images. Setting it to -1 entirely disables automatic deletion, in case you have an external process for freeing up disk space. 
--- conf/server.conf | 11 +++++++++++ src/server/globals.c | 12 ++++++++++++ src/server/globals.h | 7 +++++++ src/server/image.c | 14 ++++++++------ 4 files changed, 38 insertions(+), 6 deletions(-) (limited to 'src/server/globals.h') diff --git a/conf/server.conf b/conf/server.conf index a101f34..a15092f 100644 --- a/conf/server.conf +++ b/conf/server.conf @@ -31,6 +31,17 @@ vmdkLegacyMode=false ; Don't set the server flag when connecting to alt-servers ; Intended for if the proxy is used for on-client caching pretendClient=false +; When running in proxy mode and running out of space, automatically delete oldest image(s) to make +; the newly replicated image fit. In sparse mode, this will make sure at least 2GB of free space are +; available when replicating a new image. During normal operation, it will free at least 256MB whenever +; an attempt to write more data to cache fails. In non-sparse mode, whenever a new image is replicated, +; as much space as is required to store the entire image will be made available. +; However, after startup the proxy will refuse to delete any images for the time span given below, to be +; able to gather up to date usage information for the images available. If unitless, the value is +; interpreted in seconds. Valid suffixes are m, h, d. +; Setting this to -1 disables deletion of images. If the cache partition is full, no more images will +; be replicated unless you manually free up more disk space. 
+autoFreeDiskSpaceDelay=10h [limits] maxClients=2000 diff --git a/src/server/globals.c b/src/server/globals.c index 46c1030..f8c3f66 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -28,6 +28,7 @@ atomic_bool _closeUnusedFd = false; atomic_bool _vmdkLegacyMode = false; // Not really needed anymore since we have '+' and '-' in alt-servers atomic_bool _proxyPrivateOnly = false; +atomic_int _autoFreeDiskSpaceDelay = 3600 * 10; // [limits] atomic_int _maxClients = SERVER_MAX_CLIENTS; atomic_int _maxImages = SERVER_MAX_IMAGES; @@ -83,6 +84,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key SAVE_TO_VAR_UINT( limits, maxPayload ); SAVE_TO_VAR_UINT64( limits, maxReplicationSize ); SAVE_TO_VAR_BOOL( dnbd3, pretendClient ); + SAVE_TO_VAR_INT( dnbd3, autoFreeDiskSpaceDelay ); if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) { if ( strcmp( value, "hashblock" ) == 0 ) { _backgroundReplication = BGR_HASHBLOCK; @@ -229,6 +231,15 @@ static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optnam while ( *end == ' ' ) end++; if ( *end == '\0' ) { exp = 0; + } else if ( *end == 'm' ) { + exp = 1; + base = 60; + } else if ( *end == 'h' ) { + exp = 1; + base = 3600; + } else if ( *end == 'd' ) { + exp = 1; + base = 24 * 3600; } else { char *pos = strchr( units, *end > 'Z' ? 
(*end - 32) : *end ); if ( pos == NULL ) { @@ -318,6 +329,7 @@ size_t globals_dumpConfig(char *buffer, size_t size) PBOOL(vmdkLegacyMode); PBOOL(proxyPrivateOnly); PBOOL(pretendClient); + PINT(autoFreeDiskSpaceDelay); P_ARG("[limits]\n"); PINT(maxClients); PINT(maxImages); diff --git a/src/server/globals.h b/src/server/globals.h index ebdc1c7..58b2c9d 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -290,6 +290,13 @@ extern atomic_uint_fast64_t _maxReplicationSize; */ extern atomic_bool _pretendClient; +/** + * Minimum uptime in seconds before proxy starts deleting old + * images if running out of space. -1 disables automatic deletion. + * Only relevant in proxy mode. + */ +extern atomic_int _autoFreeDiskSpaceDelay; + /** * Load the server configuration. */ diff --git a/src/server/image.c b/src/server/image.c index 86e6b87..9fcb866 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1696,14 +1696,16 @@ static bool image_ensureDiskSpace(uint64_t size, bool force) for ( int maxtries = 0; maxtries < 20; ++maxtries ) { uint64_t available; if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) { - const int e = errno; - logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e ); + logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... 
;-)\n", errno ); return true; } - if ( available > size ) return true; - if ( !force && dnbd3_serverUptime() < 10 * 3600 ) { - logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)), - (int)(size / (1024 * 1024)) ); + if ( available > size ) + return true; // Yay + if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) + return false; // If not in proxy mode at all, or explicitly disabled, never delete anything + if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) { + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 ); return false; } logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)), -- cgit v1.2.3-55-g7522 From 701e5a967fd6bc97644f39e6fea3714f49a90291 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Sep 2019 17:32:58 +0200 Subject: [SERVER] rpc: Add cachemap feature --- src/server/globals.h | 2 +- src/server/image.c | 16 ++++++++++++++++ src/server/image.h | 2 ++ src/server/rpc.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 58b2c9d..df8c595 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -110,7 +110,7 @@ typedef struct typedef struct { ref reference; - atomic_uint_least8_t map[]; + _Atomic uint8_t map[]; } dnbd3_cache_map_t; /** diff --git a/src/server/image.c b/src/server/image.c index 9fcb866..5fa06d8 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -274,6 +274,22 @@ bool image_ensureOpen(dnbd3_image_t *image) return image->readFd != -1; } +dnbd3_image_t* image_byId(int imgId) +{ + int i; + mutex_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = 
_images[i]; + if ( image != NULL && image->id == imgId ) { + image->users++; + mutex_unlock( &imageListLock ); + return image; + } + } + mutex_unlock( &imageListLock ); + return NULL; +} + /** * Get an image by name+rid. This function increases a reference counter, * so you HAVE TO CALL image_release for every image_get() call at some diff --git a/src/server/image.h b/src/server/image.h index cd87f03..449e31f 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -17,6 +17,8 @@ void image_markComplete(dnbd3_image_t *image); bool image_ensureOpen(dnbd3_image_t *image); +dnbd3_image_t* image_byId(int imgId); + dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking); bool image_reopenCacheFd(dnbd3_image_t *image, const bool force); diff --git a/src/server/rpc.c b/src/server/rpc.c index 662263e..548c80f 100644 --- a/src/server/rpc.c +++ b/src/server/rpc.c @@ -9,6 +9,7 @@ #include "fileutil.h" #include "picohttpparser/picohttpparser.h" #include "urldecode.h" +#include "reference.h" #include #include @@ -43,7 +44,9 @@ _Static_assert( sizeof("test") == 5 && sizeof("test2") == 6, "Stringsize messup DEFSTR(STR_CONNECTION, "connection") DEFSTR(STR_CLOSE, "close") DEFSTR(STR_QUERY, "/query") +DEFSTR(STR_CACHEMAP, "/cachemap") DEFSTR(STR_Q, "q") +DEFSTR(STR_ID, "id") static inline bool equals(struct string *s1,struct string *s2) { @@ -81,6 +84,7 @@ static struct { } status; static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive); +static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive); static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive); static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc); static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value); @@ -212,6 +216,8 @@ void 
rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int // Don't care if GET or POST if ( equals( &file, &STR_QUERY ) ) { ok = handleStatus( sock, permissions, getv, getc, keepAlive ); + } else if ( equals( &file, &STR_CACHEMAP ) ) { + ok = handleCacheMap( sock, permissions, getv, getc, keepAlive ); } else { ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive ); } @@ -342,6 +348,44 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t return ok; } +static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive) +{ + if ( !(permissions & ACL_IMAGE_LIST) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive ); + } + int imgId = -1; + static const char one = 0xff; + for (size_t i = 0; i < fields_num; ++i) { + if ( equals( &fields[i].name, &STR_ID ) ) { + char *broken; + imgId = strtol( fields[i].value.s, &broken, 10 ); + if ( broken != fields[i].value.s ) + break; + imgId = -1; + } + } + if ( imgId == -1 ) + return sendReply( sock, "400 Bad Request", "text/plain", "Missing parameter 'id'", -1, keepAlive ); + dnbd3_image_t *image = image_byId( imgId ); + if ( image == NULL ) + return sendReply( sock, "404 Not found", "text/plain", "Image not found", -1, keepAlive ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + image_release( image ); + int len; + const char *map; + if ( cache == NULL ) { + map = &one; + len = 1; + } else { + _Static_assert( sizeof(const char) == sizeof(_Atomic uint8_t), "Atomic assumption exploded" ); + map = (const char*)cache->map; + len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + } + bool ok = sendReply( sock, "200 OK", "application/octet-stream", map, len, keepAlive ); + ref_put( &cache->reference ); + return ok; +} + static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive) { if ( plen == 
-1 ) plen = strlen( payload ); -- cgit v1.2.3-55-g7522 From dd0880b8ee67f9a69802a2a3ef26cd5df6881129 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 24 Feb 2020 14:13:01 +0100 Subject: [SERVER] Introduce ignoreAllocErrors If enabled, a failed fallocate will not abort image replication, but retry with sparse mode. --- src/server/globals.c | 3 +++ src/server/globals.h | 6 ++++++ src/server/image.c | 9 +++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.c b/src/server/globals.c index f8c3f66..2e87400 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -21,6 +21,7 @@ atomic_int _backgroundReplication = BGR_FULL; atomic_int _bgrMinClients = 0; atomic_bool _lookupMissingForProxy = true; atomic_bool _sparseFiles = false; +atomic_bool _ignoreAllocErrors = false; atomic_bool _removeMissingImages = true; atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK; atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT; @@ -75,6 +76,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key SAVE_TO_VAR_INT( dnbd3, bgrMinClients ); SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy ); SAVE_TO_VAR_BOOL( dnbd3, sparseFiles ); + SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors ); SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages ); SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd ); SAVE_TO_VAR_UINT( dnbd3, serverPenalty ); @@ -322,6 +324,7 @@ size_t globals_dumpConfig(char *buffer, size_t size) PINT(bgrMinClients); PBOOL(lookupMissingForProxy); PBOOL(sparseFiles); + PBOOL(ignoreAllocErrors); PBOOL(removeMissingImages); PINT(uplinkTimeout); PINT(clientTimeout); diff --git a/src/server/globals.h b/src/server/globals.h index df8c595..b1336dc 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -254,6 +254,12 @@ extern atomic_bool _lookupMissingForProxy; */ extern atomic_bool _sparseFiles; +/** + * If true, don't abort image replication if preallocating + * the image fails, but retry with 
sparse file. + */ +extern atomic_bool _ignoreAllocErrors; + /** * Port to listen on (default: #define PORT (5003)) */ diff --git a/src/server/image.c b/src/server/image.c index 16dae45..6017e59 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1111,14 +1111,19 @@ bool image_create(char *image, int revision, uint64_t size) logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err ); } // Now write image + bool fallback = false; if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) { logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk" " space without actually writing all zeroes to the block device." ); logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect" " divine performance during replication." ); - goto failure_cleanup; - } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) { + if ( !_ignoreAllocErrors ) { + goto failure_cleanup; + } + fallback = true; + } + if ( ( _sparseFiles || fallback ) && !file_setSize( fdImage, size ) ) { logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." ); goto failure_cleanup; -- cgit v1.2.3-55-g7522 From 26c1ad7af0f5749c5343a5823b9c8cece885ce84 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 3 Mar 2020 12:21:01 +0100 Subject: [SERVER] Remove "working" flag, introduce fine-grained flags Tracking the "working" state of images using one boolean is insufficient regarding the different ways in which providing an image can fail. Introduce separate flags for different conditions, like "file not readable", "file not writable", "no uplink server available", "file content has changed". 
--- src/server/altservers.c | 4 - src/server/globals.h | 7 +- src/server/image.c | 193 +++++++++++++++++++++++++----------------------- src/server/integrity.c | 20 +---- src/server/net.c | 17 +++-- src/server/uplink.c | 114 ++++++++++++++++++---------- 6 files changed, 197 insertions(+), 158 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 3fdbe0d..a6ad235 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -628,10 +628,6 @@ failed: if ( best.fd != -1 ) { close( best.fd ); } - if ( !image->working || uplink->cycleDetected ) { - image->working = true; - LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid ); - } uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away mutex_lock( &uplink->rttLock ); uplink->rttTestResult = RTT_DONTCHANGE; diff --git a/src/server/globals.h b/src/server/globals.h index b1336dc..31fbce5 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -136,7 +136,12 @@ struct _dnbd3_image atomic_int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. 
Only unique in the context of this running instance of DNBD3-Server - atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected + struct { + atomic_bool uplink; // No uplink connected + atomic_bool write; // Error writing to file + atomic_bool read; // Error reading from file + atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back + } problem; uint16_t rid; // revision of image pthread_mutex_t lock; }; diff --git a/src/server/image.c b/src/server/image.c index 6017e59..1ce1574 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -53,7 +53,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force); static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static void image_checkRandomBlocks(dnbd3_image_t *image, const int count); +static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); static void allocCacheMap(dnbd3_image_t *image, bool complete); @@ -239,35 +239,76 @@ bool image_isComplete(dnbd3_image_t *image) */ bool image_ensureOpen(dnbd3_image_t *image) { - if ( image->readFd != -1 ) return image; - int newFd = open( image->path, O_RDONLY ); + bool sizeChanged = false; + if ( image->readFd != -1 && !image->problem.changed ) + return true; + int newFd = image->readFd == -1 ? 
open( image->path, O_RDONLY ) : dup( image->readFd ); if ( newFd == -1 ) { - logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + image->problem.read = true; + } } else { - // Check size + // Check size + read access + char buffer[100]; const off_t flen = lseek( newFd, 0, SEEK_END ); if ( flen == -1 ) { - logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + image->problem.read = true; + } close( newFd ); newFd = -1; } else if ( (uint64_t)flen != image->realFilesize ) { - logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen ); + if ( !image->problem.changed ) { + logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, + image->realFilesize, (uint64_t)flen ); + } + sizeChanged = true; + } else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) { + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)", + (int)sizeof(buffer), image->path, errno ); + image->problem.read = true; + } close( newFd ); newFd = -1; } } if ( newFd == -1 ) { - mutex_lock( &image->lock ); - image->working = false; - mutex_unlock( &image->lock ); + if ( sizeChanged ) { + image->problem.changed = true; + } return false; } + + // Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32, + // but only if the size we just got above is correct. 
+ if ( image->problem.changed && !sizeChanged ) { + if ( image->crc32 == NULL ) { + // Cannot verify further, hope for the best + image->problem.changed = false; + logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", + image->name, (int)image->rid ); + } else if ( image_checkRandomBlocks( image, 1, newFd ) ) { + // This should have checked the first block (if complete) -> All is well again + image->problem.changed = false; + logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", + image->name, (int)image->rid ); + } + } else { + image->problem.changed = sizeChanged; + } + mutex_lock( &image->lock ); if ( image->readFd == -1 ) { image->readFd = newFd; + image->problem.read = false; mutex_unlock( &image->lock ); } else { - // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed + // There was a race while opening the file (happens cause not locked cause blocking), + // we lost the race so close new fd and proceed. + // *OR* we dup()'ed above for cheating when the image changed before. mutex_unlock( &image->lock ); close( newFd ); } @@ -296,7 +337,7 @@ dnbd3_image_t* image_byId(int imgId) * point... * Locks on: imageListLock, _images[].lock */ -dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) +dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen) { int i; const char *removingText = _removeMissingImages ? 
", removing from list" : ""; @@ -326,84 +367,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) candidate->users++; mutex_unlock( &imageListLock ); - // Found, see if it works - // TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list - // TODO: But remember size-changed images forever - if ( candidate->working || checkIfWorking ) { - // Is marked working, but might not have an fd open - if ( !image_ensureOpen( candidate ) ) { - mutex_lock( &candidate->lock ); - timing_get( &candidate->lastWorkCheck ); - mutex_unlock( &candidate->lock ); - if ( _removeMissingImages ) { - candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller - } - return candidate; - } - } - - if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state - - // ...not working... - - // Don't re-check too often - mutex_lock( &candidate->lock ); - bool check; - declare_now; - check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS; - if ( check ) { - candidate->lastWorkCheck = now; - } - mutex_unlock( &candidate->lock ); - if ( !check ) { + if ( !ensureFdOpen ) // Don't want to re-check return candidate; - } - // reaching this point means: - // 1) We should check if the image is working, it might or might not be in working state right now - // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.) 
- // 3) We made sure not to re-check this image too often - - // Common for ro and rw images: Size check, read check - const off_t len = lseek( candidate->readFd, 0, SEEK_END ); - bool reload = false; - if ( len == -1 ) { - logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText ); - reload = true; - } else if ( (uint64_t)len != candidate->realFilesize ) { - logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 - ". Try sending SIGHUP to server if you know what you're doing.", - candidate->path, candidate->realFilesize, (uint64_t)len ); - } else { - // Seek worked, file size is same, now see if we can read from file - char buffer[100]; - if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { - logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.", - (int)sizeof(buffer), candidate->path, errno, removingText ); - reload = true; - } else if ( !candidate->working ) { - // Seems everything is fine again \o/ - candidate->working = true; - logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid ); - } - } + if ( image_ensureOpen( candidate ) && !candidate->problem.read ) + return candidate; // We have a read fd and no read or changed problems - if ( reload ) { + // -- image could not be opened again, or is open but has problem -- + + if ( _removeMissingImages && !file_isReadable( candidate->path ) ) { + candidate = image_remove( candidate ); + // No image_release here, the image is still returned and should be released by caller + } else if ( candidate->readFd != -1 ) { + // We cannot just close the fd as it might be in use. Make a copy and remove old entry. + candidate = image_remove( candidate ); // Could not access the image with exising fd - mark for reload which will re-open the file. // make a copy of the image struct but keep the old one around. 
If/When it's not being used // anymore, it will be freed automatically. - logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path ); + logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path ); dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); img->path = strdup( candidate->path ); img->name = strdup( candidate->name ); img->virtualFilesize = candidate->virtualFilesize; img->realFilesize = candidate->realFilesize; - img->atime = now; + timing_get( &img->atime ); img->masterCrc32 = candidate->masterCrc32; img->readFd = -1; img->rid = candidate->rid; img->users = 1; - img->working = false; + img->problem.read = true; + img->problem.changed = candidate->problem.changed; img->ref_cacheMap = NULL; mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { @@ -419,18 +412,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; + // Check if image is incomplete, initialize uplink + if ( candidate->ref_cacheMap != NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } + // Try again with new instance + image_ensureOpen( candidate ); } else { img->users = 0; image_free( img ); } - // Check if image is incomplete, initialize uplink - if ( candidate->ref_cacheMap != NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } - // readFd == -1 and working == FALSE at this point, - // this function needs some splitting up for handling as we need to run most - // of the above code again. for now we know that the next call for this - // name:rid will get ne newly inserted "img" and try to re-open the file. 
+ // readFd == -1 and problem.read == true } return candidate; // We did all we can, hopefully it's working @@ -900,7 +892,6 @@ static bool image_load(char *base, char *path, int withUplink) image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); @@ -925,7 +916,7 @@ static bool image_load(char *base, char *path, int withUplink) // Image is definitely incomplete, initialize uplink worker if ( image->ref_cacheMap != NULL ) { - image->working = false; + image->problem.uplink = true; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); } @@ -937,7 +928,7 @@ static bool image_load(char *base, char *path, int withUplink) // Keep fd for reading fdImage = -1; // Check CRC32 - image_checkRandomBlocks( image, 4 ); + image_checkRandomBlocks( image, 4, -1 ); } else { logadd( LOG_ERROR, "Image list full: Could not add image %s", path ); image->readFd = -1; // Keep fdImage instead, will be closed below @@ -1027,10 +1018,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static void image_checkRandomBlocks(dnbd3_image_t *image, const int count) +/** + * Check up to count random blocks from given image. If fromFd is -1, the check will + * be run asynchronously using the integrity checker. Otherwise, the check will + * happen in the function and return the result of the check. + * @param image image to check + * @param count number of blocks to check (max) + * @param fromFd, check synchronously and use this fd for reading, -1 = async + * @return true = OK, false = error. Meaningless if fromFd == -1 + */ +static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd) { if ( image->crc32 == NULL ) - return; + return true; // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. 
This is very sloppy and is merely supposed to detect // accidental corruption due to broken dnbd3-proxy functionality or file system @@ -1038,7 +1038,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count) assert( count > 0 ); dnbd3_cache_map_t *cache = ref_get_cachemap( image ); const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize ); - int blocks[count]; + int blocks[count+1]; // +1 for "-1" in sync case int index = 0, j; int block; if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) { @@ -1062,9 +1062,16 @@ while_end: ; if ( cache != NULL ) { ref_put( &cache->reference ); } - for ( int i = 0; i < index; ++i ) { - integrity_check( image, blocks[i], true ); + if ( fromFd == -1 ) { + // Async + for ( int i = 0; i < index; ++i ) { + integrity_check( image, blocks[i], true ); + } + return true; } + // Sync + blocks[index] = -1; + return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize ); } /** @@ -1306,7 +1313,7 @@ server_fail: ; } else { // Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism? int i = 0; - while ( !image->working && ++i < 100 ) + while ( image->problem.uplink && ++i < 100 ) usleep( 2000 ); } } else if ( uplinkSock != -1 ) { @@ -1599,7 +1606,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image) assert( image != NULL ); dnbd3_cache_map_t *cache = ref_get_cachemap( image ); if ( cache == NULL ) - return image->working ? 
100 : 0; + return 100; const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); if ( unlikely( len == 0 ) ) { ref_put( &cache->reference ); diff --git a/src/server/integrity.c b/src/server/integrity.c index 4006dfc..91e53b8 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED) readFd = directFd; } } - if ( readFd == -1 ) { // Try buffered; flush to disk for that - image_ensureOpen( image ); - readFd = image->readFd; + if ( readFd == -1 ) { // Try buffered as fallback + if ( image_ensureOpen( image ) && !image->problem.read ) { + readFd = image->readFd; + } } if ( readFd == -1 ) { logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path ); @@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED) // Done with this task as nothing left checkQueue[i].image = NULL; if ( i + 1 == queueLen ) queueLen--; - // Mark as working again if applicable - if ( !foundCorrupted ) { - dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper? - mutex_lock( &image->lock ); - image->working = uplink->current.fd != -1 && image->readFd != -1; - mutex_unlock( &image->lock ); - ref_put( &uplink->reference ); - } - } } else { // Still more blocks to go... 
checkQueue[i].block = blocks[0]; @@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED) } if ( foundCorrupted && !_shutdown ) { // Something was fishy, make sure uplink exists - mutex_lock( &image->lock ); - image->working = false; - mutex_unlock( &image->lock ); uplink_init( image, -1, NULL, -1 ); } // Release :-) diff --git a/src/server/net.c b/src/server/net.c index aba4e7d..29147be 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -262,7 +262,7 @@ void* net_handleNewConnection(void *clientPtr) atomic_thread_fence( memory_order_release ); if ( unlikely( image == NULL ) ) { //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); - } else if ( unlikely( !image->working ) ) { + } else if ( unlikely( image->problem.read || image->problem.changed ) ) { logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n", client->hostName, image_name, (int)rid ); } else { @@ -273,8 +273,14 @@ void* net_handleNewConnection(void *clientPtr) if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) { bOk = ( rand() % 4 ) == 1; } - if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this - usleep( 100000 ); // server gets a penalty and is less likely to be selected + if ( bOk && uplink != NULL ) { + if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this + usleep( 100000 ); // server gets a penalty and is less likely to be selected + } + if ( image->problem.uplink ) { + // Penaltize depending on completeness, if no uplink is available + usleep( ( 100 - image->completenessEstimate ) * 100 ); + } } if ( uplink != NULL ) { ref_put( &uplink->reference ); @@ -383,9 +389,8 @@ void* net_handleNewConnection(void *clientPtr) ref_put( &cache->reference ); if ( !isCached ) { if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { - logadd( 
LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", + logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d", client->hostName, image->name, image->rid ); - image->working = false; goto exit_client_cleanup; } break; // DONE, exit request.cmd switch @@ -456,7 +461,7 @@ void* net_handleNewConnection(void *clientPtr) } if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) { logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid ); - image->working = false; + image->problem.read = true; } } goto exit_client_cleanup; diff --git a/src/server/uplink.c b/src/server/uplink.c index f39e633..aba53ba 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -53,9 +53,9 @@ static void* uplink_mainloop(void *data); static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); static void uplink_handleReceive(dnbd3_uplink_t *uplink); -static int uplink_sendKeepalive(const int fd); +static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); static void uplink_addCrc32(dnbd3_uplink_t *uplink); -static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); +static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); @@ -117,6 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; + image->problem.uplink = true; if ( sock != -1 ) { uplink->better.fd = sock; int index = altservers_hostToIndex( host ); @@ -371,6 +372,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin logadd( LOG_DEBUG2, "Could not 
trylock send mutex, queueing uplink request" ); } else { if ( unlikely( uplink->current.fd == -1 ) ) { + uplink->image->problem.uplink = true; mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { @@ -378,12 +380,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); if ( hops < 200 ) ++hops; const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - mutex_unlock( &uplink->sendMutex ); if ( unlikely( !ret ) ) { + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again int state; + mutex_unlock( &uplink->sendMutex ); mutex_lock( &uplink->queueLock ); if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { state = uplink->queue[freeSlot].status; @@ -460,9 +464,9 @@ static void* uplink_mainloop(void *data) } while ( !_shutdown && !uplink->shutdown ) { // poll() - waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 
0 : -1; - if ( waitTime == 0 ) { + if ( uplink->rttTestResult == RTT_DOCHANGE ) { // 0 means poll, since we're about to change the server + waitTime = 0; } else { declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); @@ -495,7 +499,7 @@ static void* uplink_mainloop(void *data) discoverFailCount = 0; if ( fd != -1 ) close( fd ); uplink->replicationHandle = REP_NONE; - uplink->image->working = true; + uplink->image->problem.uplink = false; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) { @@ -510,6 +514,11 @@ static void* uplink_mainloop(void *data) uplink_sendRequests( uplink, false ); uplink_sendReplicationRequest( uplink ); events[EV_SOCKET].events = POLLIN | POLLRDHUP; + if ( uplink->image->problem.uplink ) { + // Some of the requests above must have failed again already :-( + logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" ); + uplink_connectionFailed( uplink, true ); + } timing_gets( &nextAltCheck, altCheckInterval ); // The rtt worker already did the handshake for our image, so there's nothing // more to do here @@ -517,6 +526,7 @@ static void* uplink_mainloop(void *data) // Check events // Signal if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { + uplink->image->problem.uplink = true; logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" 
); goto cleanup; } else if ( (events[EV_SIGNAL].revents & POLLIN) ) { @@ -553,14 +563,10 @@ static void* uplink_mainloop(void *data) } // Keep-alive if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { - // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( uplink->current.fd ) ) { - // Re-trigger periodically, in case it requires a minimum user count - uplink_sendReplicationRequest( uplink ); - } else { + // Send keep-alive if nothing is happening, and try to trigger background rep. + if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { uplink_connectionFailed( uplink, true ); - logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" ); - setThreadName( "panic-uplink" ); + logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" ); } } // Don't keep uplink established if we're idle for too much @@ -578,6 +584,7 @@ static void* uplink_mainloop(void *data) // Quit work if image is complete logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name ); setThreadName( "finished-uplink" ); + uplink->image->problem.uplink = false; goto cleanup; } else { // Not complete - do measurement @@ -592,10 +599,6 @@ static void* uplink_mainloop(void *data) } else if ( rttTestResult == RTT_NOT_REACHABLE ) { if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) { discoverFailCount++; - if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { - logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); - uplink->image->working = false; - } if ( uplink->current.fd == -1 ) { uplink->cycleDetected = false; } @@ -624,8 +627,9 @@ static void* uplink_mainloop(void *data) } } mutex_unlock( &uplink->queueLock ); - if ( resend ) + if ( resend ) { uplink_sendRequests( uplink, true ); + } } #endif } @@ -653,6 +657,9 @@ static void* uplink_mainloop(void *data) 
return NULL ; } +/** + * Only called from uplink thread. + */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { // Scan for new requests @@ -672,13 +679,15 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) if ( hops < 200 ) ++hops; mutex_lock( &uplink->sendMutex ); const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - mutex_unlock( &uplink->sendMutex ); - if ( !ret ) { + if ( likely( ret ) ) { + mutex_unlock( &uplink->sendMutex ); + } else { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( uplink->current.index ); return; } mutex_lock( &uplink->queueLock ); @@ -695,21 +704,27 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) * server. This means we might request data we already have, but it makes * the code simpler. Worst case would be only one bit is zero, which means * 4kb are missing, but we will request 32kb. + * + * Only called from uplink thread, so current.fd is assumed to be valid. + * + * @return false if sending request failed, true otherwise (i.e. 
not necessary/disabled) */ -static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) +static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( uplink == NULL || uplink->current.fd == -1 ) return; - if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication + if ( uplink->current.fd == -1 ) + return false; // Should never be called in this state, consider send error + if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) + return true; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return; // Already a replication request on the wire, or no more blocks to replicate + return true; // Already a replication request on the wire, or no more blocks to replicate dnbd3_image_t * const image = uplink->image; - if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; - if ( image->users < _bgrMinClients ) return; // Not enough active users + if ( image->users < _bgrMinClients ) + return true; // Not enough active users dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL || image->users < _bgrMinClients ) { + if ( cache == NULL || image->users ) { // No cache map (=image complete) ref_put( &cache->reference ); - return; + return true; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); const int lastBlockIndex = mapBytes - 1; @@ -741,17 +756,20 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( replicationIndex == -1 ) { // Replication might be complete, uplink_mainloop should take care.... 
uplink->nextReplicationIndex = -1; - return; + return true; } const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); mutex_lock( &uplink->sendMutex ); bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); - mutex_unlock( &uplink->sendMutex ); - if ( !sendOk ) { + if ( likely( sendOk ) ) { + mutex_unlock( &uplink->sendMutex ); + } else { + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); - return; + return false; } if ( replicationIndex == lastBlockIndex ) { uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks @@ -762,6 +780,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) // Just crossed a hash block boundary, look for new candidate starting at this very index uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); } + return true; } /** @@ -816,6 +835,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int /** * Receive data from uplink server and process/dispatch * Locks on: uplink.lock, images[].lock + * Only called from uplink thread, so current.fd is assumed to be valid. 
*/ static void uplink_handleReceive(dnbd3_uplink_t *uplink) { @@ -990,11 +1010,14 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) mutex_lock( &uplink->queueLock ); const bool rep = ( uplink->queueLen == 0 ); mutex_unlock( &uplink->queueLock ); - if ( rep ) uplink_sendReplicationRequest( uplink ); + if ( rep ) { + if ( !uplink_sendReplicationRequest( uplink ) ) + goto error_cleanup; + } } return; // Error handling from failed receive or message parsing - error_cleanup: ; +error_cleanup: ; uplink_connectionFailed( uplink, true ); } @@ -1005,8 +1028,10 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { if ( uplink->current.fd == -1 ) return; + setThreadName( "panic-uplink" ); altservers_serverFailed( uplink->current.index ); mutex_lock( &uplink->sendMutex ); + uplink->image->problem.uplink = true; close( uplink->current.fd ); uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); @@ -1025,14 +1050,24 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) } /** - * Send keep alive request to server + * Send keep alive request to server. + * Called from uplink thread, current.fd must be valid. */ -static int uplink_sendKeepalive(const int fd) +static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink) { static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) }; - return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); + mutex_lock( &uplink->sendMutex ); + bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); + mutex_unlock( &uplink->sendMutex ); + return sendOk; } +/** + * Request crclist from uplink. + * Called from uplink thread, current.fd must be valid. + * FIXME This is broken as it could happen that another message arrives after sending + * the request. Refactor, split and move receive into general receive handler. 
+ */ static void uplink_addCrc32(dnbd3_uplink_t *uplink) { dnbd3_image_t *image = uplink->image; @@ -1042,6 +1077,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) uint32_t *buffer = malloc( bytes ); mutex_lock( &uplink->sendMutex ); bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes ); + if ( !sendOk ) { + uplink->image->problem.uplink = true; + } mutex_unlock( &uplink->sendMutex ); if ( !sendOk || bytes == 0 ) { free( buffer ); -- cgit v1.2.3-55-g7522 From 5bc3badd013b88201da64dc970600d19451daaec Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 3 Mar 2020 14:55:01 +0100 Subject: [SERVER] Also add a flag for uplink queue overload --- src/server/globals.h | 3 ++- src/server/net.c | 10 +++------- src/server/uplink.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 31fbce5..0bd6e47 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -93,7 +93,7 @@ struct _dnbd3_uplink // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
- atomic_int queueLen; // length of queue + int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; @@ -141,6 +141,7 @@ struct _dnbd3_image atomic_bool write; // Error writing to file atomic_bool read; // Error reading from file atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back + atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image pthread_mutex_t lock; diff --git a/src/server/net.c b/src/server/net.c index 29147be..a478e0c 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -269,12 +269,11 @@ void* net_handleNewConnection(void *clientPtr) // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; if ( image->ref_cacheMap != NULL ) { - dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) { + if ( image->problem.queue || image->problem.write ) { bOk = ( rand() % 4 ) == 1; } - if ( bOk && uplink != NULL ) { - if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this + if ( bOk ) { + if ( image->problem.write ) { // Wait 100ms if local caching is not working so this usleep( 100000 ); // server gets a penalty and is less likely to be selected } if ( image->problem.uplink ) { @@ -282,9 +281,6 @@ void* net_handleNewConnection(void *clientPtr) usleep( ( 100 - image->completenessEstimate ) * 100 ); } } - if ( uplink != NULL ) { - ref_put( &uplink->reference ); - } } if ( bOk ) { mutex_lock( &image->lock ); diff --git a/src/server/uplink.c b/src/server/uplink.c index aba53ba..97cb2a9 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -118,6 +118,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, 
dnbd3_host_t *host, int version mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; image->problem.uplink = true; + image->problem.write = true; + image->problem.queue = false; if ( sock != -1 ) { uplink->better.fd = sock; int index = altservers_hostToIndex( host ); @@ -191,6 +193,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink) } } uplink->queueLen = 0; + uplink->image->problem.queue = false; } static void uplink_free(ref *ref) @@ -328,6 +331,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin goto fail_lock; } freeSlot = uplink->queueLen++; + if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = true; + } } // Do not send request to uplink server if we have a matching pending request AND the request either has the // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise @@ -904,6 +910,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) continue; // Success, retry write } if ( err == EBADF || err == EINVAL || err == EIO ) { + uplink->image->problem.write = true; if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) ) break; tryAgain = false; @@ -983,6 +990,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } + if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = false; + } mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG if ( !served && start != uplink->replicationHandle ) { @@ -1121,6 +1131,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) close( uplink->cacheFd ); } uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 ); + uplink->image->problem.write = uplink->cacheFd == -1; return uplink->cacheFd != -1; } -- cgit v1.2.3-55-g7522 From f9468ef42cb5e2b1779c3309b2bbbe2495418787 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 3 Mar 2020 
15:48:26 +0100 Subject: [SERVER] Expose image->problem bools as bitmask in RPC json data --- src/server/globals.h | 4 ++-- src/server/image.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 0bd6e47..5de4180 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -137,10 +137,10 @@ struct _dnbd3_image atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server struct { - atomic_bool uplink; // No uplink connected - atomic_bool write; // Error writing to file atomic_bool read; // Error reading from file + atomic_bool write; // Error writing to file atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back + atomic_bool uplink; // No uplink connected atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image diff --git a/src/server/image.c b/src/server/image.c index 1ce1574..a6aec82 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1576,14 +1576,23 @@ json_t* image_getListAsJson() ref_put( &uplink->reference ); } - jsonImage = json_pack( "{sisssisisisisI}", + int problems = 0; +#define addproblem(name,val) if (image->problem.name) problems |= (1 << val) + addproblem(read, 0); + addproblem(write, 1); + addproblem(changed, 2); + addproblem(uplink, 3); + addproblem(queue, 4); + + jsonImage = json_pack( "{sisssisisisisIsi}", "id", image->id, // id, name, rid never change, so access them without locking "name", image->name, "rid", (int) image->rid, "users", image->users, "complete", completeness, "idle", idleTime, - "size", (json_int_t)image->virtualFilesize ); + "size", (json_int_t)image->virtualFilesize, + 
"problems", problems ); if ( bytesReceived != 0 ) { json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) ); } -- cgit v1.2.3-55-g7522 From 930b65f26cb39687a113641f56711a2d58f886ca Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 4 Mar 2020 17:49:50 +0100 Subject: [SERVER] Add timer task for saving cache maps Cache maps will now be saved periodically, but only if either they have a "dirty" bit set, which happens if any bits in the map get cleared again (due to corruption), or if new data has been replicated from an uplink server. This either means at least one byte received and 5 minutes have passed, or at least 500MB have been downloaded. The timer currently runs every 20 seconds. --- src/server/altservers.c | 20 +++++++ src/server/altservers.h | 2 + src/server/globals.h | 3 +- src/server/image.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++- src/server/image.h | 2 + src/server/uplink.c | 76 ++------------------------- src/serverconfig.h | 5 +- 7 files changed, 168 insertions(+), 76 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index a6ad235..380737c 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -273,6 +273,26 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server return num; } +/** + * Returns true if there is at least one alt-server the + * given image name would be allowed to be cloned from. + */ +bool altservers_imageHasAltServers(const char *image) +{ + bool ret = false; + mutex_lock( &altServersLock ); + for ( int i = 0; i < numAltServers; ++i ) { + if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) ) + continue; + if ( !isImageAllowed( &altServers[i], image ) ) + continue; + ret = true; + break; + } + mutex_unlock( &altServersLock ); + return ret; +} + /** * Get alt servers. If there are more alt servers than * requested, random servers will be picked. 
diff --git a/src/server/altservers.h b/src/server/altservers.h index 8e29aaa..78f6fcc 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size); +bool altservers_imageHasAltServers(const char *image); + bool altservers_toString(int server, char *buffer, size_t len); int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2); diff --git a/src/server/globals.h b/src/server/globals.h index 5de4180..10d3ee3 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -93,6 +93,7 @@ struct _dnbd3_uplink // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. + atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; @@ -128,7 +129,6 @@ struct _dnbd3_image uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time - ticks lastWorkCheck; // last time a non-working image has been checked ticks nextCompletenessEstimate; // next time the completeness estimate should be updated uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list @@ -144,6 +144,7 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image + atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now) pthread_mutex_t 
lock; }; diff --git a/src/server/image.c b/src/server/image.c index 3583f86..5a9e15b 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,6 +55,8 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); +static void* saveAllCacheMaps(void*); +static bool saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); static void cmfree(ref *ref) @@ -73,6 +75,7 @@ void image_serverStartup() mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); mutex_init( &reloadLock, LOCK_RELOAD ); server_addJob( &closeUnusedFds, NULL, 10, 900 ); + server_addJob( &saveAllCacheMaps, NULL, 9, 20 ); } /** @@ -160,6 +163,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co integrity_check( image, block, false ); } } + } else if ( !set ) { + image->mapDirty = true; } ref_put( &cache->reference ); } @@ -624,6 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. 
if ( !uplink_shutdown( image ) ) return NULL; + saveCacheMap( image ); mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); @@ -1830,6 +1836,135 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } +#define IMGCOUNT 5 +static void* saveAllCacheMaps(void* nix UNUSED) +{ + static ticks nextSave; + dnbd3_image_t *list[IMGCOUNT]; + int count = 0; + declare_now; + bool full = timing_reached( &nextSave, &now ); + mutex_lock( &imageListLock ); + for ( int i = 0; i < _num_images; ++i ) { + dnbd3_image_t * const image = _images[i]; + if ( image->mapDirty ) { + // Flag is set if integrity checker found a problem - save out + image->users++; + list[count++] = image; + image->mapDirty = false; + } else { + // Otherwise, consider longer timeout and byte count limits of uplink + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES + || ( full && diff != 0 ) ) { + image->users++; + list[count++] = image; + uplink->bytesReceivedLastSave = uplink->bytesReceived; + } + ref_put( &uplink->reference ); + } + } + if ( count == IMGCOUNT ) + break; + } + mutex_unlock( &imageListLock ); + if ( full && count < IMGCOUNT ) { + // Only update nextSave once we handled all images in the list + timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY ); + } + for ( int i = 0; i < count; ++i ) { + saveCacheMap( list[i] ); + image_release( list[i] ); + } + return NULL; +} +#undef IMGCOUNT + +/** + * Saves the cache map of the given image. + * Return true on success. 
+ * @param image the image + */ +static bool saveCacheMap(dnbd3_image_t *image) +{ + if ( !_isProxy ) + return true; // Nothing to do + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return true; // Nothing to do + // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) + // for which we have any upstream servers configured. If there's none, don't touch + // the cache map on disk. + if ( !altservers_imageHasAltServers( image->name ) ) { + ref_put( &cache->reference ); + return true; // Nothing to do + } + + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); + char mapfile[strlen( image->path ) + 4 + 1]; + strcpy( mapfile, image->path ); + strcat( mapfile, ".map" ); + + int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); + if ( fd == -1 ) { + const int err = errno; + ref_put( &cache->reference ); + logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); + return false; + } + + // On Linux we could use readFd, but in general it's not guaranteed to work + int imgFd = open( image->path, O_WRONLY ); + if ( imgFd == -1 ) { + logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno ); + } else { + if ( fsync( imgFd ) == -1 ) { + logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno ); + dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize); + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( old == NULL ) { + // Could not load old map. FS might be toast. + logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." 
); + memset( cache->map, 0, mapSize ); + } else { + // AND the maps together to be safe + for ( int i = 0; i < mapSize; ++i ) { + cache->map[i] &= old->map[i]; + } + old->reference.free( &old->reference ); + } + } + close( imgFd ); + } + + // Write current map to file + size_t done = 0; + while ( done < size ) { + const ssize_t ret = write( fd, cache->map + done, size - done ); + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); + break; + } + if ( ret <= 0 ) { + logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); + break; + } + done += (size_t)ret; + } + ref_put( &cache->reference ); + if ( fsync( fd ) == -1 ) { + logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); + } + close( fd ); + // TODO fsync on parent directory + return true; +} + static void allocCacheMap(dnbd3_image_t *image, bool complete) { const uint8_t val = complete ? 
0xff : 0; @@ -1846,4 +1981,3 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) } mutex_unlock( &image->lock ); } - diff --git a/src/server/image.h b/src/server/image.h index 89791fc..4614c74 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -49,6 +49,8 @@ void image_closeUnusedFd(); bool image_ensureDiskSpaceLocked(uint64_t size, bool force); +bool image_saveCacheMap(dnbd3_image_t *image); + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/uplink.c b/src/server/uplink.c index 97cb2a9..e5ab9c0 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -57,7 +57,6 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); static void uplink_addCrc32(dnbd3_uplink_t *uplink); static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); @@ -103,6 +102,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); uplink->image = image; uplink->bytesReceived = 0; + uplink->bytesReceivedLastSave = 0; uplink->idleTime = 0; uplink->queueLen = 0; uplink->cacheFd = -1; @@ -445,7 +445,6 @@ static void* uplink_mainloop(void *data) int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; - uint32_t unsavedSeconds = 0; ticks nextAltCheck, lastKeepalive; char buffer[200]; memset( events, 0, sizeof(events) ); @@ -561,12 +560,6 @@ static void* uplink_mainloop(void *data) if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { lastKeepalive = now; uplink->idleTime += timepassed; - unsavedSeconds += timepassed; - if ( 
unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) { - // fsync/save every 4 minutes, or every 60 seconds if uplink is idle - unsavedSeconds = 0; - uplink_saveCacheMap( uplink ); - } // Keep-alive if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening, and try to trigger background rep. @@ -639,9 +632,9 @@ static void* uplink_mainloop(void *data) } #endif } - cleanup: ; - uplink_saveCacheMap( uplink ); +cleanup: ; dnbd3_image_t *image = uplink->image; + image->mapDirty = true; // Force writeout of cache map mutex_lock( &image->lock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { @@ -1135,69 +1128,6 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) return uplink->cacheFd != -1; } -/** - * Saves the cache map of the given image. - * Return true on success. - * Locks on: imageListLock, image.lock - */ -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) -{ - dnbd3_image_t *image = uplink->image; - assert( image != NULL ); - - if ( uplink->cacheFd != -1 ) { - if ( fsync( uplink->cacheFd ) == -1 ) { - // A failing fsync means we have no guarantee that any data - // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache map from the last successful fsync - // around and restoring it there isn't much we can do to recover - // a consistent state. Bail out. 
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); - logadd( LOG_ERROR, "Bailing out immediately" ); - exit( 1 ); - } - } - - dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL ) - return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - assert( image->path != NULL ); - char mapfile[strlen( image->path ) + 4 + 1]; - strcpy( mapfile, image->path ); - strcat( mapfile, ".map" ); - - int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); - if ( fd == -1 ) { - const int err = errno; - ref_put( &cache->reference ); - logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); - return false; - } - - size_t done = 0; - while ( done < size ) { - const ssize_t ret = write( fd, cache->map + done, size - done ); - if ( ret == -1 ) { - if ( errno == EINTR ) continue; - logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); - break; - } - if ( ret <= 0 ) { - logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); - break; - } - done += (size_t)ret; - } - ref_put( &cache->reference ); - if ( fsync( fd ) == -1 ) { - logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); - } - close( fd ); - return true; -} - static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT diff --git a/src/serverconfig.h b/src/serverconfig.h index 239f0a2..5c7301d 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -17,7 +17,10 @@ #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -#define SERVER_CACHE_MAP_SAVE_INTERVAL 90 +// Wait a maximum of 5 minutes before saving cache 
map (if data was received at all) +#define CACHE_MAP_MAX_SAVE_DELAY 300 +// If more than 500MB have been received from uplink without saving cache map, do so +#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000) // Time in ms to wait for a read/write call to complete on an uplink connection #define SOCKET_TIMEOUT_UPLINK 5000 -- cgit v1.2.3-55-g7522 From 080a06ab22c8ac0841c06fe52ab4dbc982beafc1 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Mar 2020 11:34:58 +0100 Subject: [SERVER] Reload cache maps periodically for local images If an image is incomplete, but has no upstream server that can be used for replication, reload the cache map from disk periodically, in case some other server instance is writing to the image. --- src/server/globals.h | 3 +- src/server/image.c | 129 +++++++++++++++++++++++++++++++++------------------ src/server/uplink.c | 10 +++- 3 files changed, 93 insertions(+), 49 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 10d3ee3..211fe2d 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -111,6 +111,8 @@ typedef struct typedef struct { ref reference; + atomic_bool dirty; // Cache map has been modified outside uplink (only integrity checker for now) + bool unchanged; // True if the most recent reload of the cache map from disk found it unchanged _Atomic uint8_t map[]; } dnbd3_cache_map_t; @@ -144,7 +146,6 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image - atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now) pthread_mutex_t lock; }; diff --git a/src/server/image.c b/src/server/image.c index 5a9e15b..7ffe041 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,8 +55,9 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t 
fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); -static void* saveAllCacheMaps(void*); -static bool saveCacheMap(dnbd3_image_t *image); +static bool imageShouldSaveCacheMap(dnbd3_image_t *image); +static void* saveLoadAllCacheMaps(void*); +static void saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); static void cmfree(ref *ref) @@ -75,7 +76,7 @@ void image_serverStartup() mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); mutex_init( &reloadLock, LOCK_RELOAD ); server_addJob( &closeUnusedFds, NULL, 10, 900 ); - server_addJob( &saveAllCacheMaps, NULL, 9, 20 ); + server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 ); } /** @@ -164,7 +165,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co } } } else if ( !set ) { - image->mapDirty = true; + cache->dirty = true; } ref_put( &cache->reference ); } @@ -629,7 +630,9 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. if ( !uplink_shutdown( image ) ) return NULL; - saveCacheMap( image ); + if ( imageShouldSaveCacheMap( image ) ) { + saveCacheMap( image ); + } mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); @@ -1836,72 +1839,107 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } -#define IMGCOUNT 5 -static void* saveAllCacheMaps(void* nix UNUSED) +static bool imageShouldSaveCacheMap(dnbd3_image_t *image) +{ + if ( !_isProxy ) + return false; // Nothing to do + if ( image->ref_cacheMap == NULL ) + return false; // Nothing to do + // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) + // for which we have any upstream servers configured. If there's none, don't touch + // the cache map on disk. 
+ if ( !altservers_imageHasAltServers( image->name ) ) + return false; // Nothing to do + return true; +} + +static void* saveLoadAllCacheMaps(void* nix UNUSED) { static ticks nextSave; - dnbd3_image_t *list[IMGCOUNT]; - int count = 0; declare_now; bool full = timing_reached( &nextSave, &now ); mutex_lock( &imageListLock ); for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; - if ( image->mapDirty ) { - // Flag is set if integrity checker found a problem - save out - image->users++; - list[count++] = image; - image->mapDirty = false; - } else { - // Otherwise, consider longer timeout and byte count limits of uplink + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + continue; // No users++ or mutex_unlock yet -> safe + image->users++; + mutex_unlock( &imageListLock ); + if ( imageShouldSaveCacheMap( image ) ) { + // Replicated image, we're responsible for updating the map, so save it + // Save if dirty bit is set, blocks were invalidated + bool save = cache->dirty; dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL ) { - assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); - uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; - if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES - || ( full && diff != 0 ) ) { - image->users++; - list[count++] = image; + if ( !save ) { + // Otherwise, consider longer timeout and byte count limits of uplink + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) { + save = true; + } + } + } + if ( save ) { + cache->dirty = false; + if ( uplink != NULL ) { uplink->bytesReceivedLastSave = uplink->bytesReceived; } + saveCacheMap( image ); + } + if ( uplink != NULL ) { ref_put( &uplink->reference ); } + } else { + // We're not replicating this 
image, if there's a cache map, reload + // it periodically, since we might read from a shared storage that + // another server instance is writing to. + if ( full || !cache->unchanged && !image->problem.read ) { + logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid ); + dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); + if ( onDisk == NULL ) { + // Should be complete now + logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid ); + ref_setref( &image->ref_cacheMap, NULL ); + } else { + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) { + // Unchanged + cache->unchanged = true; + onDisk->reference.free( &onDisk->reference ); + } else { + // Replace + ref_setref( &image->ref_cacheMap, &onDisk->reference ); + logadd( LOG_DEBUG2, "Map changed" ); + } + } + } } - if ( count == IMGCOUNT ) - break; + ref_put( &cache->reference ); + image_release( image ); // Always do this instead of users-- to handle freeing + mutex_lock( &imageListLock ); } mutex_unlock( &imageListLock ); - if ( full && count < IMGCOUNT ) { - // Only update nextSave once we handled all images in the list + if ( full ) { timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY ); } - for ( int i = 0; i < count; ++i ) { - saveCacheMap( list[i] ); - image_release( list[i] ); - } return NULL; } -#undef IMGCOUNT /** * Saves the cache map of the given image. - * Return true on success. + * Return false if this image doesn't have a cache map, or if the image + * doesn't have any uplink to replicate from. In this case the image might + * still have a cache map that was loaded from disk, and should be reloaded + * periodically. 
* @param image the image */ -static bool saveCacheMap(dnbd3_image_t *image) +static void saveCacheMap(dnbd3_image_t *image) { - if ( !_isProxy ) - return true; // Nothing to do dnbd3_cache_map_t *cache = ref_get_cachemap( image ); if ( cache == NULL ) - return true; // Nothing to do - // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) - // for which we have any upstream servers configured. If there's none, don't touch - // the cache map on disk. - if ( !altservers_imageHasAltServers( image->name ) ) { - ref_put( &cache->reference ); - return true; // Nothing to do - } + return; // Race - wasn't NULL in function call above... logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); @@ -1914,7 +1952,7 @@ static bool saveCacheMap(dnbd3_image_t *image) const int err = errno; ref_put( &cache->reference ); logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); - return false; + return; } // On Linux we could use readFd, but in general it's not guaranteed to work @@ -1962,7 +2000,6 @@ static bool saveCacheMap(dnbd3_image_t *image) } close( fd ); // TODO fsync on parent directory - return true; } static void allocCacheMap(dnbd3_image_t *image, bool complete) diff --git a/src/server/uplink.c b/src/server/uplink.c index e5ab9c0..e644e56 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -80,6 +80,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version { if ( !_isProxy || _shutdown ) return false; assert( image != NULL ); + if ( sock == -1 && !altservers_imageHasAltServers( image->name ) ) + return false; // Nothing to do mutex_lock( &image->lock ); dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); if ( uplink != NULL ) { @@ -103,7 +105,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->image = image; 
uplink->bytesReceived = 0; uplink->bytesReceivedLastSave = 0; - uplink->idleTime = 0; + uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90; uplink->queueLen = 0; uplink->cacheFd = -1; uplink->signal = signal_new(); @@ -634,7 +636,11 @@ static void* uplink_mainloop(void *data) } cleanup: ; dnbd3_image_t *image = uplink->image; - image->mapDirty = true; // Force writeout of cache map + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + cache->dirty = true; // Force writeout of cache map + ref_put( &cache->reference ); + } mutex_lock( &image->lock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { -- cgit v1.2.3-55-g7522 From 9f11c67b291b50e0f1c98d2e85db22a33d2e2d11 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Mar 2020 16:02:54 +0100 Subject: [SERVER] Add printf macro for image (name:rid as %s:%d) --- src/server/altservers.c | 4 ++-- src/server/globals.h | 1 + src/server/image.c | 32 ++++++++++++++------------------ src/server/uplink.c | 20 ++++++++++---------- 4 files changed, 27 insertions(+), 30 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index 380737c..35da3a2 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -14,7 +14,7 @@ #include #include -#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid) +#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image)) #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) @@ -524,7 +524,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); return; } - LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) ); assert( uplink->rttTestResult == RTT_INPROGRESS ); // Test them all dnbd3_server_connection_t best = { .fd = -1 }; diff --git a/src/server/globals.h b/src/server/globals.h index 211fe2d..1bb6857 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -148,6 +148,7 @@ struct _dnbd3_image uint16_t rid; // revision of image pthread_mutex_t lock; }; +#define PIMG(x) (x)->name, (int)(x)->rid struct _dnbd3_client { diff --git a/src/server/image.c b/src/server/image.c index 32c9efe..18e91d9 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -290,13 +290,11 @@ bool image_ensureOpen(dnbd3_image_t *image) if ( image->crc32 == NULL ) { // Cannot verify further, hope for the best image->problem.changed = false; - logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", - image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) ); } else if ( image_checkRandomBlocks( image, 1, newFd ) ) { // This should have checked the first block (if complete) -> All is well again image->problem.changed = false; - logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", - image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) ); } } else { image->problem.changed = sizeChanged; @@ -624,7 +622,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); assert( image->users == 0 ); - logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid ); + logadd( ( _shutdown ? 
LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) ); // uplink_shutdown might return false to tell us // that the shutdown is in progress. Bail out since // this will get called again when the uplink is done. @@ -852,16 +850,16 @@ static bool image_load(char *base, char *path, int withUplink) // Compare data just loaded to identical image we apparently already loaded if ( existing != NULL ) { if ( existing->realFilesize != realFilesize ) { - logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) ); // Image will be replaced below } else if ( existing->crc32 != NULL && crc32list != NULL && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) { - logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) ); logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." ); logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" 
); // Image will be replaced below } else if ( existing->crc32 == NULL && crc32list != NULL ) { - logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid ); + logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) ); existing->crc32 = crc32list; existing->masterCrc32 = masterCrc; crc32list = NULL; @@ -869,7 +867,7 @@ static bool image_load(char *base, char *path, int withUplink) goto load_error; // Keep existing } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways - logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); + logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) ); function_return = true; goto load_error; // Keep existing } else { @@ -940,7 +938,7 @@ static bool image_load(char *base, char *path, int withUplink) image = image_free( image ); goto load_error; } - logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) ); function_return = true; // Clean exit: @@ -1790,7 +1788,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force) image_release( oldest ); // We did users++ above; image might have to be freed entirely return false; } - logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid ); + logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) ); char *filename = strdup( oldest->path ); // Copy name as we remove the image first oldest = image_remove( oldest ); // Remove from list first... 
oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed @@ -1825,10 +1823,8 @@ static void* closeUnusedFds(void* nix UNUSED) dnbd3_image_t * const image = _images[i]; if ( image == NULL || image->readFd == -1 ) continue; - // TODO: Also close for idle uplinks (uplink_connectionShouldShutdown) - // TODO: And close writeFd for idle uplinks.... if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) { - logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) ); fds[fdindex++] = image->readFd; image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock if ( fdindex == FDCOUNT ) @@ -1900,11 +1896,11 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) // it periodically, since we might read from a shared storage that // another server instance is writing to. if ( full || !cache->unchanged && !image->problem.read ) { - logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); if ( onDisk == NULL ) { // Should be complete now - logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) ); ref_setref( &image->ref_cacheMap, NULL ); } else { const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); @@ -1945,7 +1941,7 @@ static void saveCacheMap(dnbd3_image_t *image) if ( cache == NULL ) return; // Race - wasn't NULL in function call above... 
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); @@ -2015,7 +2011,7 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) memset( cache->map, val, byteSize ); mutex_lock( &image->lock ); if ( image->ref_cacheMap != NULL ) { - logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) ); free( cache ); } else { ref_setref( &image->ref_cacheMap, &cache->reference ); diff --git a/src/server/uplink.c b/src/server/uplink.c index 71d9f94..7c7cd1c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -171,7 +171,7 @@ bool uplink_shutdown(dnbd3_image_t *image) image->users++; // Prevent free while uplink shuts down signal_call( uplink->signal ); } else { - logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid ); + logadd( LOG_ERROR, "This will never happen. 
'%s:%d'", PIMG(image) ); } cancelAllRequests( uplink ); ref_setref( &image->uplinkref, NULL ); @@ -201,7 +201,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink) static void uplink_free(ref *ref) { dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); - logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) ); assert( uplink->queueLen == 0 ); if ( uplink->signal != NULL ) { signal_close( uplink->signal ); @@ -572,7 +572,7 @@ static void* uplink_mainloop(void *data) } // Don't keep uplink established if we're idle for too much if ( uplink_connectionShouldShutdown( uplink ) ) { - logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) ); goto cleanup; } } @@ -915,11 +915,13 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) tryAgain = false; continue; // Write handle to image successfully re-opened, try again } - logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err ); + logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", + PIMG(uplink->image), err ); break; } if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) { - logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", + ret, PIMG(uplink->image) ); break; } done += (uint32_t)ret; @@ -929,7 +931,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) { logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.", - uplink->image->name, (int)uplink->image->rid, err ); + 
PIMG(uplink->image), err ); } } // 2) Figure out which clients are interested in it @@ -1098,8 +1100,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes ); lists_crc = net_order_32( lists_crc ); if ( lists_crc != masterCrc ) { - logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", - uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) ); free( buffer ); return; } @@ -1115,8 +1116,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) close( fd ); if ( (size_t)ret != sizeof(masterCrc) + bytes ) { unlink( path ); - logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", - uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) ); } } } -- cgit v1.2.3-55-g7522 From 290d3478f245bb7d2112bb781286a9fbae42b983 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 13 Mar 2020 16:03:29 +0100 Subject: [SERVER] Rewrite uplink queue handling - Now uses linked lists instead of huge array - Does prefetch data on client requests - Can have multiple replication requests in-flight --- src/server/globals.c | 6 + src/server/globals.h | 35 ++- src/server/image.c | 3 +- src/server/image.h | 44 +++ src/server/net.c | 44 +-- src/server/reference.h | 5 + src/server/uplink.c | 771 +++++++++++++++++++++++++++---------------------- src/server/uplink.h | 2 +- src/serverconfig.h | 3 +- 9 files changed, 518 insertions(+), 395 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.c b/src/server/globals.c index ac079b1..98e0ddb 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -19,6 +19,7 @@ atomic_int _clientPenalty = 0; atomic_bool _isProxy = false; atomic_int _backgroundReplication = BGR_FULL; atomic_int _bgrMinClients = 0; +atomic_int _bgrWindowSize = 1; atomic_bool 
_lookupMissingForProxy = true; atomic_bool _sparseFiles = false; atomic_bool _ignoreAllocErrors = false; @@ -74,6 +75,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key SAVE_TO_VAR_BOOL( dnbd3, isProxy ); SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly ); SAVE_TO_VAR_INT( dnbd3, bgrMinClients ); + SAVE_TO_VAR_INT( dnbd3, bgrWindowSize ); SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy ); SAVE_TO_VAR_BOOL( dnbd3, sparseFiles ); SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors ); @@ -134,6 +136,9 @@ void globals_loadConfig() logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" ); _sparseFiles = false; } + if ( _bgrWindowSize < 1 ) { + _bgrWindowSize = 1; + } // Dump config as interpreted char buffer[2000]; globals_dumpConfig( buffer, sizeof(buffer) ); @@ -325,6 +330,7 @@ size_t globals_dumpConfig(char *buffer, size_t size) PBOOL(backgroundReplication); } PINT(bgrMinClients); + PINT(bgrWindowSize); PBOOL(lookupMissingForProxy); PBOOL(sparseFiles); PBOOL(ignoreAllocErrors); diff --git a/src/server/globals.h b/src/server/globals.h index 1bb6857..5cee92a 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -18,18 +18,27 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; -typedef struct +typedef struct _dnbd3_queue_client { - uint64_t handle; // Client defined handle to pass back in reply - uint64_t from; // First byte offset of requested block (ie. 4096) - uint64_t to; // Last byte + 1 of requested block (ie. 
8192, if request len is 4096, resulting in bytes 4096-8191) + struct _dnbd3_queue_client *next; + uint64_t handle; // Handle used by client + uint64_t from, to; // Client range dnbd3_client_t * client; // Client to send reply to - int status; // status of this entry: ULR_* +} dnbd3_queue_client_t; + +typedef struct _dnbd3_queue_entry +{ + struct _dnbd3_queue_entry *next; + uint64_t handle; // Our handle for this entry + uint64_t from; // First byte offset of requested block (ie. 4096) + uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191) + dnbd3_queue_client_t *clients; #ifdef _DEBUG - ticks entered; // When this request entered the queue (for debugging) + ticks entered; // When this request entered the queue (for debugging) #endif - uint8_t hopCount; // How many hops this request has already taken across proxies -} dnbd3_queued_request_t; + uint8_t hopCount; // How many hops this request has already taken across proxies + bool sent; // Already sent to uplink? +} dnbd3_queue_entry_t; typedef struct _ns { @@ -91,12 +100,12 @@ struct _dnbd3_uplink bool cycleDetected; // connection cycle between proxies detected for current remote server int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" - uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) - dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; + dnbd3_queue_entry_t *queue; + atomic_uint_fast32_t queueId; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; }; @@ -156,6 +165,7 @@ struct _dnbd3_client atomic_uint_fast64_t bytesSent; // Byte counter for this client. dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake int sock; + _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server bool isServer; // true if a server in proxy mode, false if real client dnbd3_host_t host; char hostName[HOSTNAMELEN]; // inet_ntop version of host @@ -242,6 +252,11 @@ extern atomic_int _backgroundReplication; */ extern atomic_int _bgrMinClients; +/** + * How many in-flight replication requests we should target (per uplink) + */ +extern atomic_int _bgrWindowSize; + /** * (In proxy mode): If connecting client is a proxy, and the requested image * is not known locally, should we ask our known alt servers for it? diff --git a/src/server/image.c b/src/server/image.c index 86b6374..81ec479 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -340,7 +340,6 @@ dnbd3_image_t* image_byId(int imgId) dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen) { int i; - const char *removingText = _removeMissingImages ? ", removing from list" : ""; dnbd3_image_t *candidate = NULL; // Simple sanity check const size_t slen = strlen( name ); @@ -1895,7 +1894,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) // We're not replicating this image, if there's a cache map, reload // it periodically, since we might read from a shared storage that // another server instance is writing to. 
- if ( full || !cache->unchanged && !image->problem.read ) { + if ( full || ( !cache->unchanged && !image->problem.read ) ) { logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); if ( onDisk == NULL ) { diff --git a/src/server/image.h b/src/server/image.h index 4614c74..b23711b 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -51,6 +51,50 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force); bool image_saveCacheMap(dnbd3_image_t *image); +/** + * Check if given range is cached. Be careful when using this function because: + * 1) you need to hold a reference to the cache map + * 2) start and end are assumed to be 4k aligned + * 3) start and end are not checked to be in bounds (we don't know the image in this context) + */ +static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end) +{ + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); + const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); + uint64_t pos; + uint8_t b; + bool isCached; + if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler + b = cache->map[firstByteInMap]; + isCached = ( b & ( fb & lb ) ) == ( fb & lb ); + } else { + isCached = true; + atomic_thread_fence( memory_order_acquire ); + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + isCached = ( ( b & fb ) == fb ); + } + // Last byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + isCached = ( ( b & lb ) == lb ); + } + // Middle, must be all bits set (0xff) + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + 
isCached = false; + break; + } + } + } + } + return isCached; +} + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/net.c b/src/server/net.c index 954cb8a..9ba9dbc 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -197,6 +197,7 @@ void* net_handleNewConnection(void *clientPtr) client->hostName[HOSTNAMELEN-1] = '\0'; mutex_unlock( &client->lock ); client->bytesSent = 0; + client->relayedCount = 0; if ( !addToList( client ) ) { freeClientStruct( client ); @@ -344,41 +345,18 @@ void* net_handleNewConnection(void *clientPtr) // This is a proxyed image, check if we need to relay the request... const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); - const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); - uint64_t pos; - uint8_t b; - bool isCached; - if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler - b = cache->map[firstByteInMap]; - isCached = ( b & ( fb & lb ) ) == ( fb & lb ); - } else { - isCached = true; - atomic_thread_fence( memory_order_acquire ); - // First byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); - isCached = ( ( b & fb ) == fb ); - } - // Last byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); - isCached = ( ( b & lb ) == lb ); - } - // Middle, must be all bits set (0xff) - if ( isCached ) { - for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { - if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { - isCached = false; - break; - } + if ( 
!image_isRangeCachedUnsafe( cache, start, end ) ) { + if ( unlikely( client->relayedCount > 250 ) ) { + logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" ); + for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount > 250 ) { + logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" ); + goto exit_client_cleanup; } } - } - if ( !isCached ) { - if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { + if ( !uplink_request( NULL, client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d", client->hostName, image->name, image->rid ); goto exit_client_cleanup; diff --git a/src/server/reference.h b/src/server/reference.h index 4eda546..75a681f 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref ) return ref; } +static inline void ref_inc( ref *ref ) +{ + ++ref->count; +} + static inline void ref_put( ref *ref ) { if ( --ref->count == 0 ) { diff --git a/src/server/uplink.c b/src/server/uplink.c index 7c7cd1c..188bf06 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "threadpool.h" #include "reference.h" #include @@ -21,30 +22,6 @@ #define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE ) #define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) ) -#define REP_NONE ( (uint64_t)0xffffffffffffffff ) - -// Status of request in queue - -// Slot is free, can be used. -// Must only be set in uplink_handle_receive() or uplink_remove_client() -#define ULR_FREE 0 -// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. 
-// Must only be set in uplink_request() -#define ULR_NEW 1 -// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. -// Must only be set in uplink_mainloop() or uplink_request() -#define ULR_PENDING 2 -// Slot is being processed, do not consider for hop on. -// Must only be set in uplink_handle_receive() -#define ULR_PROCESSING 3 - -static const char *const NAMES_ULR[4] = { - [ULR_FREE] = "ULR_FREE", - [ULR_NEW] = "ULR_NEW", - [ULR_PENDING] = "ULR_PENDING", - [ULR_PROCESSING] = "ULR_PROCESSING", -}; - static atomic_uint_fast64_t totalBytesReceived = 0; static void cancelAllRequests(dnbd3_uplink_t *uplink); @@ -59,6 +36,15 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink); +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle); +static void *prefetchForClient(void *data); + +typedef struct { + dnbd3_uplink_t *uplink; + uint64_t start; + uint32_t length; +} prefetch_request_t; // ############ uplink connection handling @@ -106,6 +92,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->bytesReceived = 0; uplink->bytesReceivedLastSave = 0; uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90; + uplink->queue = NULL; uplink->queueLen = 0; uplink->cacheFd = -1; uplink->signal = signal_new(); @@ -113,7 +100,6 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." 
); goto failure; } - uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); mutex_lock( &uplink->sendMutex ); uplink->current.fd = -1; @@ -175,9 +161,9 @@ bool uplink_shutdown(dnbd3_image_t *image) } cancelAllRequests( uplink ); ref_setref( &image->uplinkref, NULL ); - ref_put( &uplink->reference ); mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); + ref_put( &uplink->reference ); mutex_unlock( &image->lock ); return retval; } @@ -188,12 +174,21 @@ bool uplink_shutdown(dnbd3_image_t *image) */ static void cancelAllRequests(dnbd3_uplink_t *uplink) { - for ( int i = 0; i < uplink->queueLen; ++i ) { - if ( uplink->queue[i].status != ULR_FREE ) { - net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle ); - uplink->queue[i].status = ULR_FREE; + dnbd3_queue_entry_t *it = uplink->queue; + while ( it != NULL ) { + dnbd3_queue_client_t *cit = it->clients; + while ( cit != NULL ) { + net_sendReply( cit->client, CMD_ERROR, cit->handle ); + cit->client->relayedCount--; + dnbd3_queue_client_t *next = cit->next; + free( cit ); + cit = next; } + dnbd3_queue_entry_t *next = it->next; + free( it ); + it = next; } + uplink->queue = NULL; uplink->queueLen = 0; uplink->image->problem.queue = false; } @@ -234,39 +229,54 @@ static void uplink_free(ref *ref) */ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) { + if ( client->relayedCount == 0 ) + return; mutex_lock( &uplink->queueLock ); - for (int i = uplink->queueLen - 1; i >= 0; --i) { - if ( uplink->queue[i].client == client ) { - // Make sure client doesn't get destroyed while we're sending it data - mutex_lock( &client->sendMutex ); - mutex_unlock( &client->sendMutex ); - uplink->queue[i].client = NULL; - uplink->queue[i].status = ULR_FREE; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) { + if ( (**cit).client == client ) { + 
--client->relayedCount; + dnbd3_queue_client_t *entry = *cit; + *cit = (**cit).next; + free( entry ); + } } - if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--; } mutex_unlock( &uplink->queueLock ); + if ( unlikely( client->relayedCount != 0 ) ) { + logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount ); + int i; + for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount != 0 ) { + logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount ); + } + } } /** - * Request a chunk of data through an uplink server - * Locks on: image.lock, uplink.queueLock + * Request a chunk of data through an uplink server. Either uplink or client has to be non-NULL. + * If client is NULL, this is assumed to be a background replication request. + * Locks on: uplink.queueLock, uplink.sendMutex */ -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) { - if ( client == NULL || client->image == NULL ) - return false; + bool getUplink = ( uplink == NULL ); + assert( client != NULL || uplink != NULL ); if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref ); - if ( unlikely( uplink == NULL ) ) { - uplink_init( client->image, -1, NULL, -1 ); + if ( getUplink ) { uplink = ref_get_uplink( &client->image->uplinkref ); - if ( uplink == NULL ) { - logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); - return false; + if ( unlikely( uplink == NULL ) ) { + uplink_init( client->image, -1, NULL, -1 ); + uplink = ref_get_uplink( &client->image->uplinkref ); 
+ if ( uplink == NULL ) { + logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); + return false; + } } } if ( uplink->shutdown ) { @@ -275,163 +285,179 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { + if ( client != NULL && hops != 0 + && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); goto fail_ref; } - int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise - int existingType = -1; // ULR_* type of existing request - int i; - int freeSlot = -1; - int firstUsedSlot = -1; - bool requestLoop = false; - const uint64_t end = start + length; - - mutex_lock( &uplink->queueLock ); - if ( uplink->shutdown ) { // Check again after locking to prevent lost requests - goto fail_lock; - } - for (i = 0; i < uplink->queueLen; ++i) { - // find free slot to place this request into - if ( uplink->queue[i].status == ULR_FREE ) { - if ( freeSlot == -1 || existingType != ULR_PROCESSING ) { - freeSlot = i; - } - continue; - } - if ( firstUsedSlot == -1 ) { - firstUsedSlot = i; - } - // find existing request to attach to - if ( uplink->queue[i].from > start || uplink->queue[i].to < end ) - continue; // Range not suitable - // Detect potential proxy cycle. 
New request hopcount is greater, range is same, old request has already been sent -> suspicious - if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) { - requestLoop = true; - break; - } - if ( foundExisting == -1 || existingType == ULR_PROCESSING ) { - foundExisting = i; - existingType = uplink->queue[i].status; - } - } - if ( unlikely( requestLoop ) ) { - uplink->cycleDetected = true; - signal_call( uplink->signal ); - logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - goto fail_lock; - } - if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { - freeSlot = -1; // Not attaching to existing request, make it use a higher slot - } - if ( freeSlot == -1 ) { - if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { - logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." ); + struct { + uint64_t handle, start, end; + } req; + do { + const uint64_t end = start + length; + dnbd3_queue_entry_t *request = NULL, *last = NULL; + bool isNew; + mutex_lock( &uplink->queueLock ); + if ( uplink->shutdown ) { // Check again after locking to prevent lost requests goto fail_lock; } - freeSlot = uplink->queueLen++; - if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) { - uplink->image->problem.queue = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->from <= start && it->to >= end ) { + // Matching range, attach + request = it; + break; + } + if ( it->next == NULL ) { + // Not matching, last in list, remember + last = it; + break; + } } - } - // Do not send request to uplink server if we have a matching pending request AND the request either has the - // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. 
Otherwise - // explicitly send this request to the uplink server. The second condition mentioned here is to prevent - // a race condition where the reply for the outstanding request already arrived and the uplink thread - // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might - // already have passed the index of the free slot we determined, but not reached the existing request we just found above. - if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) { - foundExisting = -1; // -1 means "send request" - } -#ifdef _DEBUG - if ( foundExisting != -1 ) { - logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot ); - logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n" - "New %" PRIu64 "-%" PRIu64 " (%p)\n", - uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client, - start, end, (void*)client ); - } -#endif - // Fill structure - uplink->queue[freeSlot].from = start; - uplink->queue[freeSlot].to = end; - uplink->queue[freeSlot].handle = handle; - uplink->queue[freeSlot].client = client; - //int old = uplink->queue[freeSlot].status; - uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW : - ( existingType == ULR_NEW ? ULR_PENDING : existingType ) ); - uplink->queue[freeSlot].hopCount = hops; + dnbd3_queue_client_t **c; + if ( request == NULL ) { + // No existing request to attach to + if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) { + logadd( LOG_WARNING, "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." 
); + goto fail_lock; + } + uplink->queueLen++; + if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = true; + } + request = malloc( sizeof(*request) ); + if ( last == NULL ) { + uplink->queue = request; + } else { + last->next = request; + } + request->next = NULL; + request->handle = ++uplink->queueId; + request->from = start & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + request->to = (end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); #ifdef _DEBUG - timing_get( &uplink->queue[freeSlot].entered ); - //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end ); + timing_get( &request->entered ); #endif - mutex_unlock( &uplink->queueLock ); + request->hopCount = hops; + request->sent = true; // Optimistic; would be set to false on failure + if ( client == NULL ) { + // BGR + request->clients = NULL; + } else { + c = &request->clients; + } + isNew = true; + } else if ( client == NULL ) { + // Replication request that maches existing request. Do nothing + isNew = false; + } else { + // Existing request. 
Check if potential cycle + if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) { + logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) ); + goto fail_lock; + } + // Count number if clients, get tail of list + int count = 0; + c = &request->clients; + while ( *c != NULL ) { + c = &(**c).next; + if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) { + logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count ); + goto fail_lock; + } + } + isNew = false; + } + req.handle = request->handle; + req.start = request->from; + req.end = request->to; + if ( client != NULL ) { + *c = malloc( sizeof( *request->clients ) ); + (**c).next = NULL; + (**c).handle = handle; + (**c).from = start; + (**c).to = end; + (**c).client = client; + client->relayedCount++; + } + mutex_unlock( &uplink->queueLock ); - if ( foundExisting != -1 ) { - ref_put( &uplink->reference ); - return true; // Attached to pending request, do nothing - } + if ( !isNew ) { + goto success_ref; // Attached to pending request, do nothing + } + } while (0); - // See if we can fire away the request - if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) { - logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); + // Fire away the request + mutex_lock( &uplink->sendMutex ); + if ( unlikely( uplink->current.fd == -1 ) ) { + uplink->image->problem.uplink = true; + markRequestUnsent( uplink, req.handle ); + mutex_unlock( &uplink->sendMutex ); + logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - if ( unlikely( uplink->current.fd == -1 ) ) { + if ( hops < 200 ) ++hops; + const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, + req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); + if ( unlikely( !ret ) ) { + markRequestUnsent( uplink, req.handle ); uplink->image->problem.uplink = true; mutex_unlock( &uplink->sendMutex ); - 
logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); + logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle ); } else { - const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - if ( hops < 200 ) ++hops; - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( unlikely( !ret ) ) { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); - } else { - // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again - int state; - mutex_unlock( &uplink->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { - state = uplink->queue[freeSlot].status; - if ( uplink->queue[freeSlot].status == ULR_NEW ) { - uplink->queue[freeSlot].status = ULR_PENDING; - } - } else { - state = -1; - } - mutex_unlock( &uplink->queueLock ); - if ( state == -1 ) { - logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. 
*shrug*" ); - } else if ( state == ULR_NEW ) { - //logadd( LOG_DEBUG2, "Direct uplink request" ); - } else { - logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); - } - ref_put( &uplink->reference ); - return true; - } - // Fall through to waking up sender thread + // OK + mutex_unlock( &uplink->sendMutex ); + goto success_ref; } + // Fall through to waking up sender thread } if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } - ref_put( &uplink->reference ); + +success_ref: + if ( client != NULL ) { + // Was from client -- potential prefetch + uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); + if ( len > 0 ) { + prefetch_request_t *job = malloc( sizeof( *job ) ); + job->start = req.end; + job->length = len; + job->uplink = uplink; + ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it + threadpool_run( &prefetchForClient, (void*)job ); + } + } + if ( getUplink ) { + ref_put( &uplink->reference ); + } return true; fail_lock: mutex_unlock( &uplink->queueLock ); fail_ref: - ref_put( &uplink->reference ); + if ( getUplink ) { + ref_put( &uplink->reference ); + } return false; } +static void *prefetchForClient(void *data) +{ + prefetch_request_t *job = (prefetch_request_t*)data; + dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image ); + if ( cache != NULL ) { + if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) { + uplink_request( job->uplink, NULL, ++job->uplink->queueId, job->start, job->length, 0 ); + } + ref_put( &cache->reference ); + } + ref_put( &job->uplink->reference ); + free( job ); + return NULL; +} + /** * Uplink thread. 
* Locks are irrelevant as this is never called from another function @@ -443,7 +469,7 @@ static void* uplink_mainloop(void *data) #define EV_COUNT (2) struct pollfd events[EV_COUNT]; dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; - int numSocks, i, waitTime; + int numSocks, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; @@ -478,7 +504,7 @@ static void* uplink_mainloop(void *data) declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); if ( waitTime < 100 ) waitTime = 100; - if ( waitTime > 10000 ) waitTime = 10000; + else if ( waitTime > 10000 ) waitTime = 10000; } events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); @@ -505,7 +531,6 @@ static void* uplink_mainloop(void *data) mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); - uplink->replicationHandle = REP_NONE; uplink->image->problem.uplink = false; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; @@ -559,11 +584,11 @@ static void* uplink_mainloop(void *data) } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); - if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { + if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { lastKeepalive = now; uplink->idleTime += timepassed; // Keep-alive - if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { + if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) { // Send keep-alive if nothing is happening, and try to trigger background rep. 
if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { uplink_connectionFailed( uplink, true ); @@ -612,19 +637,16 @@ static void* uplink_mainloop(void *data) ticks deadline; timing_set( &deadline, &now, -10 ); mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { - snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name, - uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); - uplink->queue[i].entered = now; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( timing_reached( &it->entered, &deadline ) ) { + logadd( LOG_WARNING, "Starving request detected:" + " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d", + it->from, it->to, (int)it->sent, PIMG(uplink->image) ); + it->entered = now; #ifdef _DEBUG_RESEND_STARVING - uplink->queue[i].status = ULR_NEW; + it->sent = false; resend = true; #endif - mutex_unlock( &uplink->queueLock ); - logadd( LOG_WARNING, "%s", buffer ); - mutex_lock( &uplink->queueLock ); } } mutex_unlock( &uplink->queueLock ); @@ -667,37 +689,54 @@ cleanup: ; */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { - // Scan for new requests - int j; + // Scan for new requests, or optionally, (re)send all + // Build a buffer, so if there aren't too many requests, we can send them after + // unlocking the queue again. Otherwise we need flushes during iteration, which + // is no ideal, but in that case the uplink is probably overwhelmed anyways. + // Try 125 as that's exactly 300bytes, usually 2*MTU. 
+#define MAX_RESEND_BATCH 125 + dnbd3_request_t reqs[MAX_RESEND_BATCH]; + int count = 0; mutex_lock( &uplink->queueLock ); - for (j = 0; j < uplink->queueLen; ++j) { - if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue; - uplink->queue[j].status = ULR_PENDING; - uint8_t hops = uplink->queue[j].hopCount; - const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - /* - logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); - */ - mutex_unlock( &uplink->queueLock ); - if ( hops < 200 ) ++hops; - mutex_lock( &uplink->sendMutex ); - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( likely( ret ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - // Non-critical - if the connection dropped or the server was changed - // the thread will re-send this request as soon as the connection - // is reestablished. 
- uplink->image->problem.uplink = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( newOnly && it->sent ) + continue; + it->sent = true; + dnbd3_request_t *hdr = &reqs[count++]; + hdr->magic = dnbd3_packet_magic; + hdr->cmd = CMD_GET_BLOCK; + hdr->size = it->to - it->from; + hdr->offset_small = it->from; + hdr->hops = it->hopCount + 1; + hdr->handle = it->handle; + fixup_request( *hdr ); + if ( count == MAX_RESEND_BATCH ) { + bool ok = false; + logadd( LOG_DEBUG2, "BLOCKING resend of %d", count ); + count = 0; + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 ) + == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH ); + } mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - return; + if ( !ok ) { + uplink->image->problem.uplink = true; + break; + } } - mutex_lock( &uplink->queueLock ); } mutex_unlock( &uplink->queueLock ); + if ( count != 0 ) { + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + uplink->image->problem.uplink = + ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 ) + != DNBD3_REQUEST_SIZE * count ); + } + mutex_unlock( &uplink->sendMutex ); + } +#undef MAX_RESEND_BATCH } /** @@ -720,71 +759,73 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) return false; // Should never be called in this state, consider send error if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return true; // Don't do background replication - if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return true; // Already a replication request on the wire, or no more blocks to replicate + if ( uplink->nextReplicationIndex == -1 ) + return true; // No more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->users < _bgrMinClients ) return true; // 
Not enough active users + const int numNewRequests = numWantedReplicationRequests( uplink ); + if ( numNewRequests <= 0 ) + return true; // Already sufficient amount of requests on the wire dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL || image->users ) { + if ( cache == NULL ) { // No cache map (=image complete) - ref_put( &cache->reference ); return true; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); const int lastBlockIndex = mapBytes - 1; - int endByte; - if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks - endByte = uplink->nextReplicationIndex + mapBytes; - } else { // Hashblock based: Only look for match in current hash block - endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; - if ( endByte > mapBytes ) { - endByte = mapBytes; + for ( int bc = 0; bc < numNewRequests; ++bc ) { + int endByte; + if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks + endByte = uplink->nextReplicationIndex + mapBytes; + } else { // Hashblock based: Only look for match in current hash block + endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; + if ( endByte > mapBytes ) { + endByte = mapBytes; + } } - } - atomic_thread_fence( memory_order_acquire ); - int replicationIndex = -1; - for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { - const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff - && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { - // Found incomplete one - replicationIndex = i; + atomic_thread_fence( memory_order_acquire ); + int replicationIndex = -1; + for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { + const int i = j % ( mapBytes ); // Wrap around for BGR_FULL + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != 
lastBlockIndex || !uplink->replicatedLastBlock ) ) { + // Found incomplete one + replicationIndex = i; + break; + } + } + if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { + // Nothing left in current block, find next one + replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); + } + if ( replicationIndex == -1 ) { + // Replication might be complete, uplink_mainloop should take care.... + uplink->nextReplicationIndex = -1; break; } + const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; + const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); + const uint64_t handle = ++uplink->queueId; + if ( !uplink_request( uplink, NULL, handle, offset, size, 0 ) ) { + logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)", + PIMG(uplink->image) ); + ref_put( &cache->reference ); + return false; + } + if ( replicationIndex == lastBlockIndex ) { + uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + } + uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + if ( _backgroundReplication == BGR_HASHBLOCK + && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + // Just crossed a hash block boundary, look for new candidate starting at this very index + uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); + if ( uplink->nextReplicationIndex == -1 ) + break; + } } ref_put( &cache->reference ); - if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { - // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); - } - if ( replicationIndex == -1 ) { - // Replication might be complete, uplink_mainloop should take care.... 
- uplink->nextReplicationIndex = -1; - return true; - } - const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; - uplink->replicationHandle = offset; - const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); - mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); - if ( likely( sendOk ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); - return false; - } - if ( replicationIndex == lastBlockIndex ) { - uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks - } - uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter - if ( _backgroundReplication == BGR_HASHBLOCK - && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { - // Just crossed a hash block boundary, look for new candidate starting at this very index - uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); - } return true; } @@ -845,7 +886,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; - int ret, i; + int ret; for (;;) { ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; @@ -881,13 +922,34 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } // Payload read completely // Bail out if we're not interested - if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue; + if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) + 
continue; // Is a legit block reply - struct iovec iov[2]; - const uint64_t start = inReply.handle; - const uint64_t end = inReply.handle + inReply.size; totalBytesReceived += inReply.size; uplink->bytesReceived += inReply.size; + // Get entry from queue + dnbd3_queue_entry_t *entry; + mutex_lock( &uplink->queueLock ); + for ( entry = uplink->queue; entry != NULL; entry = entry->next ) { + if ( entry->handle == inReply.handle ) + break; + } + if ( entry == NULL ) { + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)", + inReply.handle, PIMG(uplink->image) ); + continue; + } + const uint64_t start = entry->from; + const uint64_t end = entry->to; + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + // We don't remove the entry from the list here yet, to slightly increase the chance of other + // clients attaching to this request while we write the data to disk + if ( end - start != inReply.size ) { + logadd( LOG_WARNING, "Received payload length does not match! 
(is: %"PRIu32", expect: %u, %s:%d)", + inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) ); + } + struct iovec iov[2]; // 1) Write to cache file if ( unlikely( uplink->cacheFd == -1 ) ) { uplink_reopenCacheFd( uplink, false ); @@ -934,98 +996,76 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) PIMG(uplink->image), err ); } } - // 2) Figure out which clients are interested in it - // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop - // below; this prevents uplink_request() from attaching to this request - // by populating a slot with index greater than the highest matching - // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW - // where it's fine if the index is greater) + bool found = false; + dnbd3_queue_entry_t **it; mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - assert( req->status != ULR_PROCESSING ); - if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue; - assert( req->client != NULL ); - if ( req->from >= start && req->to <= end ) { // Match :-) - req->status = ULR_PROCESSING; - } - } - // 3) Send to interested clients - iterate backwards so request collaboration works, and - // so we can decrease queueLen on the fly while iterating. 
Should you ever change this to start - // from 0, you also need to change the "attach to existing request"-logic in uplink_request() - outReply.magic = dnbd3_packet_magic; - bool served = false; - for ( i = uplink->queueLen - 1; i >= 0; --i ) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - if ( req->status == ULR_PROCESSING ) { - size_t bytesSent = 0; - assert( req->from >= start && req->to <= end ); - dnbd3_client_t * const client = req->client; - outReply.cmd = CMD_GET_BLOCK; - outReply.handle = req->handle; - outReply.size = (uint32_t)( req->to - req->from ); - iov[0].iov_base = &outReply; - iov[0].iov_len = sizeof outReply; - iov[1].iov_base = uplink->recvBuffer + (req->from - start); - iov[1].iov_len = outReply.size; - fixup_reply( outReply ); - req->status = ULR_FREE; - req->client = NULL; - served = true; - mutex_lock( &client->sendMutex ); - mutex_unlock( &uplink->queueLock ); - if ( client->sock != -1 ) { - ssize_t sent = writev( client->sock, iov, 2 ); - if ( sent > (ssize_t)sizeof outReply ) { - bytesSent = (size_t)sent - sizeof outReply; - } - } - if ( bytesSent != 0 ) { - client->bytesSent += bytesSent; - } - mutex_unlock( &client->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( i > uplink->queueLen ) { - i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests - } + for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) { + if ( *it == entry && entry->handle == inReply.handle ) { // ABA check + assert( found == false ); + *it = (**it).next; + found = true; + uplink->queueLen--; + break; } - if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) { uplink->image->problem.queue = false; } mutex_unlock( &uplink->queueLock ); -#ifdef _DEBUG - if ( !served && start != uplink->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end ); + if ( !found ) 
{ + logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)", + PIMG(uplink->image) ); + continue; } -#endif - if ( start == uplink->replicationHandle ) { - // Was our background replication - uplink->replicationHandle = REP_NONE; - // Try to remove from fs cache if no client was interested in this data - if ( !served && uplink->cacheFd != -1 ) { - posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + outReply.magic = dnbd3_packet_magic; + dnbd3_queue_client_t *next; + for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) { + size_t bytesSent = 0; + assert( c->from >= start && c->to <= end ); + dnbd3_client_t * const client = c->client; + outReply.cmd = CMD_GET_BLOCK; + outReply.handle = c->handle; + outReply.size = (uint32_t)( c->to - c->from ); + iov[0].iov_base = &outReply; + iov[0].iov_len = sizeof outReply; + iov[1].iov_base = uplink->recvBuffer + (c->from - start); + iov[1].iov_len = outReply.size; + fixup_reply( outReply ); + mutex_lock( &client->sendMutex ); + if ( client->sock != -1 ) { + ssize_t sent = writev( client->sock, iov, 2 ); + if ( sent > (ssize_t)sizeof outReply ) { + bytesSent = (size_t)sent - sizeof outReply; + } + if ( bytesSent != 0 ) { + client->bytesSent += bytesSent; + } } + mutex_unlock( &client->sendMutex ); + client->relayedCount--; + next = c->next; + free( c ); } - if ( served ) { + if ( entry->clients != NULL ) { // Was some client -- reset idle counter uplink->idleTime = 0; // Re-enable replication if disabled if ( uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; } + } else { + if ( uplink->cacheFd != -1 ) { + // Try to remove from fs cache if no client was interested in this data + posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + } } + free( entry ); + } // main receive loop + // Trigger background replication if applicable + if ( 
!uplink_sendReplicationRequest( uplink ) ) { + goto error_cleanup; } - if ( uplink->replicationHandle == REP_NONE ) { - mutex_lock( &uplink->queueLock ); - const bool rep = ( uplink->queueLen == 0 ); - mutex_unlock( &uplink->queueLock ); - if ( rep ) { - if ( !uplink_sendReplicationRequest( uplink ) ) - goto error_cleanup; - } - } + // Normal end return; // Error handling from failed receive or message parsing error_cleanup: ; @@ -1046,7 +1086,6 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) close( uplink->current.fd ); uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); - uplink->replicationHandle = REP_NONE; if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = 0; } @@ -1156,3 +1195,39 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) return false; return altservers_toString( current, buffer, len ); } + +/** + * Get number of replication requests that should be sent right now to + * meet the configured bgrWindowSize. 
Returns 0 if any client requests + * are pending + */ +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink) +{ + int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 ); + if ( uplink->queueLen == 0 ) + return ret; + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->clients == NULL ) { + ret--; + } else { + ret = 0; // Do not allow BGR if client requests are being handled + break; + } + } + mutex_unlock( &uplink->queueLock ); + return ret; +} + +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle) +{ + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->handle == handle ) { + it->sent = false; + break; + } + } + mutex_unlock( &uplink->queueLock ); +} + diff --git a/src/server/uplink.h b/src/server/uplink.h index 49ff0b4..8f69b05 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -12,7 +12,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops); bool uplink_shutdown(dnbd3_image_t *image); diff --git a/src/serverconfig.h b/src/serverconfig.h index 5c7301d..31708de 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -13,7 +13,8 @@ #define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times #define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored -#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink +#define UPLINK_MAX_QUEUE 500 // 
Maximum number of queued requests per uplink +#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -- cgit v1.2.3-55-g7522 From a9f5b836d9fddb3e1851c5b0a77c566b0f267ead Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 20 Mar 2020 12:08:10 +0100 Subject: [SERVER] Fix warnings, add assertions --- src/server/globals.h | 2 +- src/server/image.c | 7 +++++-- src/server/uplink.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 5cee92a..08ec303 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -103,7 +103,7 @@ struct _dnbd3_uplink atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue - uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) + int idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queue_entry_t *queue; atomic_uint_fast32_t queueId; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; diff --git a/src/server/image.c b/src/server/image.c index 0ec1d58..ef40325 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -440,6 +440,7 @@ dnbd3_image_t* image_lock(dnbd3_image_t *image) mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { + assert( _images[i]->id == image->id ); image->users++; mutex_unlock( &imageListLock ); return image; @@ -470,6 +471,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image) // responsible for freeing it for (int i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { // Found, do nothing + assert( _images[i]->id == image->id ); mutex_unlock( &imageListLock ); return NULL; } @@ -509,6 +511,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image) mutex_lock( &imageListLock ); for ( int i = _num_images - 1; i >= 0; --i ) { if ( _images[i] == image ) { + assert( _images[i]->id == image->id ); _images[i] = NULL; mustFree = ( image->users == 0 ); } @@ -1088,7 +1091,7 @@ bool image_create(char *image, int revision, uint64_t size) logadd( LOG_ERROR, "revision id invalid: %d", revision ); return false; } - char path[PATHLEN], cache[PATHLEN]; + char path[PATHLEN], cache[PATHLEN+4]; char *lastSlash = strrchr( image, '/' ); if ( lastSlash == NULL ) { snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); @@ -1099,7 +1102,7 @@ bool image_create(char *image, int revision, uint64_t size) *lastSlash = '/'; snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); } - snprintf( cache, PATHLEN, "%s.map", path ); + snprintf( cache, PATHLEN+4, "%s.map", path ); 
size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const int mapsize = IMGSIZE_TO_MAPBYTES(size); // Write files diff --git a/src/server/uplink.c b/src/server/uplink.c index a7f140f..f5ac6ac 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -403,8 +403,9 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, - req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, req.start, + (uint32_t)( req.end - req.start ), req.handle, + COND_HOPCOUNT( uplink->current.version, hops ) ); if ( unlikely( !ret ) ) { markRequestUnsent( uplink, req.handle ); uplink->image->problem.uplink = true; @@ -426,7 +427,8 @@ success_ref: if ( client != NULL ) { // Was from client -- potential prefetch // Same size as this request, but consider end of image... 
- uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); + uint32_t len = (uint32_t)MIN( uplink->image->virtualFilesize - req.end, + req.end - req.start ); // Also don't prefetch if we cross a hash block border and BGR mode == hashblock if ( len > 0 && ( _backgroundReplication != BGR_HASHBLOCK || req.start % HASH_BLOCK_SIZE == (req.end-1) % HASH_BLOCK_SIZE ) ) { @@ -592,7 +594,8 @@ static void* uplink_mainloop(void *data) } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); - if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { + if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL + || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { lastKeepalive = now; uplink->idleTime += timepassed; // Keep-alive @@ -714,8 +717,8 @@ static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly) dnbd3_request_t *hdr = &reqs[count++]; hdr->magic = dnbd3_packet_magic; hdr->cmd = CMD_GET_BLOCK; - hdr->size = it->to - it->from; - hdr->offset_small = it->from; + hdr->size = (uint32_t)( it->to - it->from ); + hdr->offset = it->from; // Offset first, then hops! 
(union) hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount ); hdr->handle = it->handle; fixup_request( *hdr ); -- cgit v1.2.3-55-g7522 From 894eeb86f872a7f7f5f36bfa8649da3075dd28d6 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 20 Mar 2020 21:22:18 +0100 Subject: [SERVER] Remember atime in .meta file --- src/server/globals.h | 1 + src/server/image.c | 198 +++++++++++++++++++++++++++++++++++---------------- src/server/net.c | 2 + 3 files changed, 139 insertions(+), 62 deletions(-) (limited to 'src/server/globals.h') diff --git a/src/server/globals.h b/src/server/globals.h index 08ec303..95d8ec2 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -155,6 +155,7 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image + bool accessed; // image was accessed since .meta was written pthread_mutex_t lock; }; #define PIMG(x) (x)->name, (int)(x)->rid diff --git a/src/server/image.c b/src/server/image.c index 67a763c..4944bfd 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,10 +55,12 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); -static bool imageShouldSaveCacheMap(dnbd3_image_t *image); +static bool isImageFromUpstream(dnbd3_image_t *image); static void* saveLoadAllCacheMaps(void*); static void saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); +static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime); +static void loadImageMeta(dnbd3_image_t *image); static void cmfree(ref *ref) { @@ -630,8 +632,11 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. 
if ( !uplink_shutdown( image ) ) return NULL; - if ( imageShouldSaveCacheMap( image ) ) { - saveCacheMap( image ); + if ( isImageFromUpstream( image ) ) { + saveMetaData( image, NULL, 0 ); + if ( image->ref_cacheMap != NULL ) { + saveCacheMap( image ); + } } mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); @@ -757,7 +762,6 @@ static bool image_addToList(dnbd3_image_t *image) static bool image_load(char *base, char *path, bool withUplink) { int revision = -1; - struct stat st; dnbd3_cache_map_t *cache = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; @@ -901,15 +905,7 @@ static bool image_load(char *base, char *path, bool withUplink) timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); - int32_t offset; - if ( stat( path, &st ) == 0 ) { - // Negatively offset atime by file modification time - offset = (int32_t)( st.st_mtime - time( NULL ) ); - if ( offset > 0 ) offset = 0; - } else { - offset = 0; - } - timing_gets( &image->atime, offset ); + loadImageMeta( image ); // Prevent freeing in cleanup cache = NULL; @@ -1843,12 +1839,10 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } -static bool imageShouldSaveCacheMap(dnbd3_image_t *image) +static bool isImageFromUpstream(dnbd3_image_t *image) { if ( !_isProxy ) return false; // Nothing to do - if ( image->ref_cacheMap == NULL ) - return false; // Nothing to do // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) // for which we have any upstream servers configured. If there's none, don't touch // the cache map on disk. @@ -1862,66 +1856,71 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) static ticks nextSave; declare_now; bool full = timing_reached( &nextSave, &now ); + time_t walltime = full ? 
time( NULL ) : 0; setThreadName( "cache-mapper" ); mutex_lock( &imageListLock ); for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; - dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL ) - continue; // No users++ or mutex_unlock yet -> safe image->users++; mutex_unlock( &imageListLock ); - if ( imageShouldSaveCacheMap( image ) ) { - // Replicated image, we're responsible for updating the map, so save it - // Save if dirty bit is set, blocks were invalidated - bool save = cache->dirty; - dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( !save ) { - // Otherwise, consider longer timeout and byte count limits of uplink - if ( uplink != NULL ) { - assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); - uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; - if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) { - save = true; + const bool fromUpstream = isImageFromUpstream( image ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + if ( fromUpstream ) { + // Replicated image, we're responsible for updating the map, so save it + // Save if dirty bit is set, blocks were invalidated + bool save = cache->dirty; + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( !save ) { + // Otherwise, consider longer timeout and byte count limits of uplink + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) { + save = true; + } } } - } - if ( save ) { - cache->dirty = false; + if ( save ) { + cache->dirty = false; + if ( uplink != NULL ) { + uplink->bytesReceivedLastSave = uplink->bytesReceived; + } + saveCacheMap( image ); + } if ( uplink != NULL ) { - uplink->bytesReceivedLastSave = uplink->bytesReceived; + ref_put( &uplink->reference 
); } - saveCacheMap( image ); - } - if ( uplink != NULL ) { - ref_put( &uplink->reference ); - } - } else { - // We're not replicating this image, if there's a cache map, reload - // it periodically, since we might read from a shared storage that - // another server instance is writing to. - if ( full || ( !cache->unchanged && !image->problem.read ) ) { - logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); - dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); - if ( onDisk == NULL ) { - // Should be complete now - logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) ); - ref_setref( &image->ref_cacheMap, NULL ); - } else { - const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) { - // Unchanged - cache->unchanged = true; - onDisk->reference.free( &onDisk->reference ); + } else { + // We're not replicating this image, if there's a cache map, reload + // it periodically, since we might read from a shared storage that + // another server instance is writing to. 
+ if ( full || ( !cache->unchanged && !image->problem.read ) ) { + logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); + dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); + if ( onDisk == NULL ) { + // Should be complete now + logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) ); + ref_setref( &image->ref_cacheMap, NULL ); } else { - // Replace - ref_setref( &image->ref_cacheMap, &onDisk->reference ); - logadd( LOG_DEBUG2, "Map changed" ); + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) { + // Unchanged + cache->unchanged = true; + onDisk->reference.free( &onDisk->reference ); + } else { + // Replace + ref_setref( &image->ref_cacheMap, &onDisk->reference ); + logadd( LOG_DEBUG2, "Map changed" ); + } } } - } + } // end reload cache map + ref_put( &cache->reference ); + } // end has cache map + if ( full && fromUpstream ) { + saveMetaData( image, &now, walltime ); } - ref_put( &cache->reference ); image_release( image ); // Always do this instead of users-- to handle freeing mutex_lock( &imageListLock ); } @@ -2023,3 +2022,78 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) } mutex_unlock( &image->lock ); } + +/** + * It's assumed you hold a reference to the image + */ +static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime) +{ + if ( !image->accessed ) + return; + ticks tmp; + uint32_t diff; + char *fn; + if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) { + logadd( LOG_WARNING, "Cannot asprintf meta" ); + return; + } + if ( now == NULL ) { + timing_get( &tmp ); + now = &tmp; + walltime = time( NULL ); + } + mutex_lock( &image->lock ); + image->accessed = false; + diff = timing_diff( &image->atime, now ); + mutex_unlock( &image->lock ); + FILE *f = fopen( fn, "w" ); + if ( f == NULL ) { + logadd( LOG_WARNING, "Cannot open %s for writing", fn ); + } else { + fprintf( f, 
"[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) ); + fclose( f ); + } + free( fn ); + // TODO: fsync() dir +} + +static void loadImageMeta(dnbd3_image_t *image) +{ + int32_t offset = 1; + char *fn; + if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) { + logadd( LOG_WARNING, "asprintf load" ); + } else { + int fh = open( fn, O_RDONLY ); + free( fn ); + if ( fh != -1 ) { + char buf[200]; + ssize_t ret = read( fh, buf, sizeof(buf)-1 ); + close( fh ); + if ( ret > 0 ) { + buf[ret] = '\0'; + // Do it the cheap way until we actually store more stuff + char *pos = strstr( buf, "atime=" ); + if ( pos != NULL ) { + offset = (int32_t)( atol( pos + 6 ) - time( NULL ) ); + } + } + } + } + if ( offset == 1 ) { + // Nothing from .meta file, use old guesstimate + struct stat st; + if ( stat( image->path, &st ) == 0 ) { + // Negatively offset atime by file modification time + offset = (int32_t)( st.st_mtime - time( NULL ) ); + } else { + offset = 0; + } + image->accessed = true; + } + if ( offset > 0 ) { + offset = 0; + } + timing_gets( &image->atime, offset ); +} + diff --git a/src/server/net.c b/src/server/net.c index 9ba9dbc..6b930df 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -289,6 +289,7 @@ void* net_handleNewConnection(void *clientPtr) if ( !client->isServer ) { // Only update immediately if this is a client. Servers are handled on disconnect. timing_get( &image->atime ); + image->accessed = true; } mutex_unlock( &image->lock ); serializer_reset_write( &payload ); @@ -515,6 +516,7 @@ exit_client_cleanup: ; if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) { mutex_lock( &image->lock ); timing_get( &image->atime ); + image->accessed = true; mutex_unlock( &image->lock ); } if ( cache != NULL ) { -- cgit v1.2.3-55-g7522