From 1a8a31603e56995639eba99492611ab4e7ef64af Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 5 Aug 2019 13:42:19 +0200 Subject: [SERVER] Allow uplink shutdown if bgrMinClients > image->users --- src/server/uplink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 682b986..aa5228c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -1082,6 +1082,7 @@ static bool uplink_saveCacheMap(dnbd3_connection_t *link) static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link) { - return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT && _backgroundReplication != BGR_FULL ); + return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT + && ( _backgroundReplication != BGR_FULL || _bgrMinClients > link->image->users ) ); } -- cgit v1.2.3-55-g7522 From 48533240493c0dd970c926bbdb8939bb7d93cd14 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 6 Aug 2019 11:44:27 +0200 Subject: [SERVER] Fix: Client thread could destroy sendMutex while in use Fix a race condition where the client thread tears down the client struct including the sendMutex while the uplink thread is currently holding the lock, trying to send data to the client. 
--- src/server/uplink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index aa5228c..f58b019 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -153,6 +153,9 @@ void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client) mutex_lock( &uplink->queueLock ); for (int i = uplink->queueLen - 1; i >= 0; --i) { if ( uplink->queue[i].client == client ) { + // Make sure client doesn't get destroyed while we're sending it data + mutex_lock( &client->sendMutex ); + mutex_unlock( &client->sendMutex ); uplink->queue[i].client = NULL; uplink->queue[i].status = ULR_FREE; } -- cgit v1.2.3-55-g7522 From 5dc776ac73be190daa2b2b8c3eb6042fdab4acda Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 6 Aug 2019 14:06:27 +0200 Subject: [SERVER] uplink: Improve attaching to existing requests Allow attaching in ULR_PROCESSING state, leave lower slots empty to increase chances attaching to ULR_PROCESSING. --- src/server/globals.h | 12 ------- src/server/uplink.c | 97 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 35 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.h b/src/server/globals.h index 7e5ff04..cd5ad7e 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -17,18 +17,6 @@ typedef struct _dnbd3_connection dnbd3_connection_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; -// Slot is free, can be used. -// Must only be set in uplink_handle_receive() or uplink_remove_client() -#define ULR_FREE 0 -// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. -// Must only be set in uplink_request() -#define ULR_NEW 1 -// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. 
-// Must only be set in uplink_mainloop() or uplink_request() -#define ULR_PENDING 2 -// Slot is being processed, do not consider for hop on. -// Must only be set in uplink_handle_receive() -#define ULR_PROCESSING 3 typedef struct { uint64_t handle; // Client defined handle to pass back in reply diff --git a/src/server/uplink.c b/src/server/uplink.c index f58b019..9f99fe4 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -21,6 +21,28 @@ #define REP_NONE ( (uint64_t)0xffffffffffffffff ) +// Status of request in queue + +// Slot is free, can be used. +// Must only be set in uplink_handle_receive() or uplink_remove_client() +#define ULR_FREE 0 +// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. +// Must only be set in uplink_request() +#define ULR_NEW 1 +// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. +// Must only be set in uplink_mainloop() or uplink_request() +#define ULR_PENDING 2 +// Slot is being processed, do not consider for hop on. 
+// Must only be set in uplink_handle_receive() +#define ULR_PROCESSING 3 + +static const char *const NAMES_ULR[4] = { + [ULR_FREE] = "ULR_FREE", + [ULR_NEW] = "ULR_NEW", + [ULR_PENDING] = "ULR_PENDING", + [ULR_PROCESSING] = "ULR_PROCESSING", +}; + static atomic_uint_fast64_t totalBytesReceived = 0; static void* uplink_mainloop(void *data); @@ -203,30 +225,37 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin int existingType = -1; // ULR_* type of existing request int i; int freeSlot = -1; + int firstUsedSlot = -1; bool requestLoop = false; const uint64_t end = start + length; mutex_lock( &uplink->queueLock ); mutex_unlock( &client->image->lock ); for (i = 0; i < uplink->queueLen; ++i) { - if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) { - freeSlot = i; + // find free slot to place this request into + if ( uplink->queue[i].status == ULR_FREE ) { + if ( freeSlot == -1 || existingType != ULR_PROCESSING ) { + freeSlot = i; + } continue; } - if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue; - if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) { - if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) { - requestLoop = true; - break; - } - if ( foundExisting == -1 || existingType == ULR_PENDING ) { - foundExisting = i; - existingType = uplink->queue[i].status; - if ( freeSlot != -1 ) break; - } + if ( firstUsedSlot == -1 ) { + firstUsedSlot = i; + } + // find existing request to attach to + if ( uplink->queue[i].from > start || uplink->queue[i].to < end ) + continue; // Range not suitable + // Detect potential proxy cycle. 
New request hopcount is greater, range is same, old request has already been sent -> suspicious + if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) { + requestLoop = true; + break; + } + if ( foundExisting == -1 || existingType == ULR_PROCESSING ) { + foundExisting = i; + existingType = uplink->queue[i].status; } } - if ( requestLoop ) { + if ( unlikely( requestLoop ) ) { mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); mutex_lock( &uplink->rttLock ); @@ -235,6 +264,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin signal_call( uplink->signal ); return false; } + if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { + freeSlot = -1; // Not attaching to existing request, make it use a higher slot + } if ( freeSlot == -1 ) { if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { mutex_unlock( &uplink->queueLock ); @@ -244,15 +276,17 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin freeSlot = uplink->queueLen++; } // Do not send request to uplink server if we have a matching pending request AND the request either has the - // status ULR_NEW OR we found a free slot with LOWER index than the one we attach to. Otherwise + // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise // explicitly send this request to the uplink server. The second condition mentioned here is to prevent // a race condition where the reply for the outstanding request already arrived and the uplink thread // is currently traversing the request queue. 
As it is processing the queue from highest to lowest index, it might // already have passed the index of the free slot we determined, but not reached the existing request we just found above. - if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request" + if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) { + foundExisting = -1; // -1 means "send request" + } #ifdef _DEBUG if ( foundExisting != -1 ) { - logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot ); + logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot ); logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n" "New %" PRIu64 "-%" PRIu64 " (%p)\n", uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client, @@ -265,7 +299,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin uplink->queue[freeSlot].handle = handle; uplink->queue[freeSlot].client = client; //int old = uplink->queue[freeSlot].status; - uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING); + uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW : + ( existingType == ULR_NEW ? 
ULR_PENDING : existingType ) ); uplink->queue[freeSlot].hopCount = hops; #ifdef _DEBUG timing_get( &uplink->queue[freeSlot].entered ); @@ -292,14 +327,25 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( !ret ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { + // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again + int state; mutex_lock( &uplink->queueLock ); - if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client && uplink->queue[freeSlot].status == ULR_NEW ) { - uplink->queue[freeSlot].status = ULR_PENDING; - logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { + state = uplink->queue[freeSlot].status; + if ( uplink->queue[freeSlot].status == ULR_NEW ) { + uplink->queue[freeSlot].status = ULR_PENDING; + } } else { - logadd( LOG_DEBUG2, "Weird queue update fail for direct uplink request" ); + state = -1; } mutex_unlock( &uplink->queueLock ); + if ( state == -1 ) { + logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. 
*shrug*" ); + } else if ( state == ULR_NEW ) { + logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + } else { + logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); + } return true; } // Fall through to waking up sender thread @@ -837,6 +883,11 @@ static void uplink_handleReceive(dnbd3_connection_t *link) } } // 2) Figure out which clients are interested in it + // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop + // below; this prevents uplink_request() from attaching to this request + // by populating a slot with index greater than the highest matching + // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW + // where it's fine if the index is greater) mutex_lock( &link->queueLock ); for (i = 0; i < link->queueLen; ++i) { dnbd3_queued_request_t * const req = &link->queue[i]; @@ -877,10 +928,10 @@ static void uplink_handleReceive(dnbd3_connection_t *link) bytesSent = (size_t)sent - sizeof outReply; } } - mutex_unlock( &client->sendMutex ); if ( bytesSent != 0 ) { client->bytesSent += bytesSent; } + mutex_unlock( &client->sendMutex ); mutex_lock( &link->queueLock ); } if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--; -- cgit v1.2.3-55-g7522 From 121dd5eceb64be43d188670bff5bce265d57d199 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 7 Aug 2019 16:31:05 +0200 Subject: [SERVER] Lock-free queue for altservers check thread --- src/server/altservers.c | 97 +++++++++++++++++++++++++++---------------------- src/server/uplink.c | 8 ++-- 2 files changed, 57 insertions(+), 48 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index bbbc584..a270bf3 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -14,10 +14,8 @@ #define LOG_GOTO(jumplabel, lvl, ...) 
do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) -static dnbd3_connection_t *pending[SERVER_MAX_PENDING_ALT_CHECKS]; -static pthread_mutex_t pendingLockWrite; // Lock for adding something to pending. (NULL -> nonNULL) -static pthread_mutex_t pendingLockConsume; // Lock for removing something (nonNULL -> NULL) -static dnbd3_signal_t* runSignal = NULL; +static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; +static dnbd3_signal_t * _Atomic runSignal = NULL; static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; static int numAltServers = 0; @@ -32,8 +30,6 @@ void altservers_init() { srand( (unsigned int)time( NULL ) ); // Init spinlock - mutex_init( &pendingLockWrite ); - mutex_init( &pendingLockConsume ); mutex_init( &altServersLock ); // Init signal runSignal = signal_new(); @@ -48,12 +44,9 @@ void altservers_init() } // Init waiting links queue -- this is currently a global static array so // it will already be zero, but in case we refactor later do it explicitly - // while also holding the write lock so thread sanitizer is happy - mutex_lock( &pendingLockWrite ); for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { pending[i] = NULL; } - mutex_unlock( &pendingLockWrite ); } void altservers_shutdown() @@ -130,52 +123,77 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate */ void altservers_findUplink(dnbd3_connection_t *uplink) { + if ( uplink->shutdown ) + return; int i; // if betterFd != -1 it means the uplink is supposed to switch to another // server. As this function here is called by the uplink thread, it can // never be that the uplink is supposed to switch, but instead calls // this function. 
assert( uplink->betterFd == -1 ); - mutex_lock( &pendingLockWrite ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress + // XXX As this function is only ever called by the image's uplink thread, + // it cannot happen that the uplink ends up in this list concurrently + mutex_lock( &uplink->rttLock ); if ( uplink->rttTestResult == RTT_INPROGRESS ) { for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { if ( pending[i] != uplink ) continue; // Yep, measuring right now - mutex_unlock( &pendingLockWrite ); return; } } // Find free slot for measurement + uplink->rttTestResult = RTT_INPROGRESS; for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { if ( pending[i] != NULL ) continue; - pending[i] = uplink; - uplink->rttTestResult = RTT_INPROGRESS; - mutex_unlock( &pendingLockWrite ); - signal_call( runSignal ); // Wake altservers thread up - return; + dnbd3_connection_t *null = NULL; + if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) { + mutex_unlock( &uplink->rttLock ); + atomic_thread_fence( memory_order_release ); + signal_call( runSignal ); // Wake altservers thread up + return; + } } // End of loop - no free slot - mutex_unlock( &pendingLockWrite ); + uplink->rttTestResult = RTT_NOT_REACHABLE; + mutex_unlock( &uplink->rttLock ); logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." ); } /** - * The given uplink is about to disappear, so remove it from any queues + * The given uplink is about to disappear, + * wait until any pending RTT check is done. 
*/ void altservers_removeUplink(dnbd3_connection_t *uplink) { - mutex_lock( &pendingLockConsume ); - mutex_lock( &pendingLockWrite ); - for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] == uplink ) { + assert( uplink != NULL ); + assert( uplink->shutdown ); + int i; + for ( i = 1 ;; ++i ) { + atomic_thread_fence( memory_order_acquire ); + if ( runSignal == NULL ) { + // Thread is already done, remove manually uplink->rttTestResult = RTT_NOT_REACHABLE; - pending[i] = NULL; + break; + } + // Thread still running, wait until test is done + bool found = false; + for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { + if ( pending[i] == uplink ) { + found = true; + break; + } + } + if ( !found ) // No more test running + break; + usleep( 10000 ); // 10ms + signal_call( runSignal ); // Wake altservers thread up + if ( i % 500 == 0 ) { + logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink ); } } - mutex_unlock( &pendingLockWrite ); - mutex_unlock( &pendingLockConsume ); + logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i ); } /** @@ -432,28 +450,18 @@ static void *altservers_main(void *data UNUSED) usleep( 100000 ); } // Work your way through the queue + atomic_thread_fence( memory_order_acquire ); for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) { - mutex_lock( &pendingLockWrite ); - if ( pending[itLink] == NULL ) { - mutex_unlock( &pendingLockWrite ); - continue; // Check once before locking, as a mutex is expensive - } - mutex_unlock( &pendingLockWrite ); - mutex_lock( &pendingLockConsume ); - mutex_lock( &pendingLockWrite ); dnbd3_connection_t * const uplink = pending[itLink]; - mutex_unlock( &pendingLockWrite ); - if ( uplink == NULL ) { // Check again after locking - mutex_unlock( &pendingLockConsume ); + if ( uplink == NULL ) continue; - } dnbd3_image_t * const image = image_lock( uplink->image ); if ( image == NULL ) { // Check again 
after locking + mutex_lock( &uplink->rttLock ); uplink->rttTestResult = RTT_NOT_REACHABLE; - mutex_lock( &pendingLockWrite ); + assert( pending[itLink] == uplink ); pending[itLink] = NULL; - mutex_unlock( &pendingLockWrite ); - mutex_unlock( &pendingLockConsume ); + mutex_unlock( &uplink->rttLock ); logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" ); continue; } @@ -592,10 +600,9 @@ static void *altservers_main(void *data UNUSED) } image_release( image ); // end of loop over all pending uplinks - mutex_lock( &pendingLockWrite ); + assert( pending[itLink] == uplink ); pending[itLink] = NULL; - mutex_unlock( &pendingLockWrite ); - mutex_unlock( &pendingLockConsume ); + atomic_thread_fence( memory_order_release ); } // Save cache maps of all images if applicable declare_now; @@ -606,7 +613,9 @@ static void *altservers_main(void *data UNUSED) } } cleanup: ; - if ( runSignal != NULL ) signal_close( runSignal ); + if ( runSignal != NULL ) { + signal_close( runSignal ); + } runSignal = NULL; return NULL ; } diff --git a/src/server/uplink.c b/src/server/uplink.c index 9f99fe4..bb1ffdc 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -583,6 +583,10 @@ static void* uplink_mainloop(void *data) #endif } cleanup: ; + if ( !link->shutdown ) { + link->shutdown = true; + thread_detach( link->thread ); + } altservers_removeUplink( link ); uplink_saveCacheMap( link ); mutex_lock( &link->image->lock ); @@ -596,10 +600,6 @@ static void* uplink_mainloop(void *data) link->fd = -1; mutex_unlock( &link->sendMutex ); link->signal = NULL; - if ( !link->shutdown ) { - link->shutdown = true; - thread_detach( link->thread ); - } // Do not access link->image after unlocking, since we set // image->uplink to NULL. 
Acquire with image_lock first, // like done below when checking whether to re-init uplink -- cgit v1.2.3-55-g7522 From be7d7d95850c30a154aaa56e95d6a7f36793409d Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 7 Aug 2019 17:11:51 +0200 Subject: [SERVER] Better lock debugging: Always check lock order Lock order is predefined in locks.h. Immediately bail out if a lock with lower priority is obtained while the same thread already holds one with higher priority. --- LOCKS | 13 +- src/server/altservers.c | 9 +- src/server/globals.c | 2 +- src/server/image.c | 10 +- src/server/integrity.c | 2 +- src/server/locks.c | 319 ++++++++++++++++++++++-------------------------- src/server/locks.h | 36 ++++-- src/server/net.c | 6 +- src/server/rpc.c | 14 +-- src/server/server.c | 7 -- src/server/uplink.c | 6 +- 11 files changed, 198 insertions(+), 226 deletions(-) (limited to 'src/server/uplink.c') diff --git a/LOCKS b/LOCKS index 4b5b07c..77e44a8 100644 --- a/LOCKS +++ b/LOCKS @@ -16,23 +16,22 @@ requests.lock ===== SERVER ===== This is a list of used locks, in the order they -have to be aquired if you must hold multiple locks: -remoteCloneLock | reloadLock +have to be aquired if you must hold multiple locks. +Note this list might be out of date, take a look at the +defines in lock.h for the effective order. +reloadLock +remoteCloneLock _clients_lock _clients[].lock integrityQueueLock _images_lock _images[].lock -pendingLockConsume -pendingLockProduce uplink.queueLock altServersLock client.sendMutex -client.statsLock -statisticsSentLock -statisticsReceivedLock uplink.rttLock uplink.sendMutex +aclLock If you need to lock multiple clients/images/... at once, lock the client with the lowest array index first. 
diff --git a/src/server/altservers.c b/src/server/altservers.c index a270bf3..3d5e71e 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -30,7 +30,7 @@ void altservers_init() { srand( (unsigned int)time( NULL ) ); // Init spinlock - mutex_init( &altServersLock ); + mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST ); // Init signal runSignal = signal_new(); if ( runSignal == NULL ) { @@ -326,13 +326,13 @@ json_t* altservers_toJson() } /** - * Update rtt history of given server - returns the new average for that server + * Update rtt history of given server - returns the new average for that server. + * XXX HOLD altServersLock WHEN CALLING THIS! */ static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt) { unsigned int avg = rtt; int i; - mutex_lock( &altServersLock ); for (i = 0; i < numAltServers; ++i) { if ( !isSameAddressPort( host, &altServers[i].host ) ) continue; altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt; @@ -353,7 +353,6 @@ static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const } break; } - mutex_unlock( &altServersLock ); return avg; } @@ -529,6 +528,7 @@ static void *altservers_main(void *data UNUSED) } clock_gettime( BEST_CLOCK_SOURCE, &end ); // Measurement done - everything fine so far + mutex_lock( &altServersLock ); mutex_lock( &uplink->rttLock ); const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer ); // Penaltize rtt if this was a cycle; this will treat this server with lower priority @@ -538,6 +538,7 @@ static void *altservers_main(void *data UNUSED) + (end.tv_nsec - start.tv_nsec) / 1000 + ( (isCurrent && uplink->cycleDetected) ? 
1000000 : 0 )); // µs unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); + mutex_unlock( &altServersLock ); // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; mutex_unlock( &uplink->rttLock ); diff --git a/src/server/globals.c b/src/server/globals.c index 69e8a6e..46c1030 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -112,7 +112,7 @@ void globals_loadConfig() asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ); if ( name == NULL ) return; if ( initialLoad ) { - mutex_init( &loadLock ); + mutex_init( &loadLock, LOCK_LOAD_CONFIG ); } if ( mutex_trylock( &loadLock ) != 0 ) { logadd( LOG_INFO, "Ignoring config reload request due to already running reload" ); diff --git a/src/server/image.c b/src/server/image.c index 1f12eda..4a65ed3 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -59,9 +59,9 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t void image_serverStartup() { srand( (unsigned int)time( NULL ) ); - mutex_init( &imageListLock ); - mutex_init( &remoteCloneLock ); - mutex_init( &reloadLock ); + mutex_init( &imageListLock, LOCK_IMAGE_LIST ); + mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); + mutex_init( &reloadLock, LOCK_RELOAD ); } /** @@ -347,7 +347,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->rid = candidate->rid; img->users = 1; img->working = false; - mutex_init( &img->lock ); + mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); img->crc32 = malloc( mb ); @@ -869,7 +869,7 @@ static bool image_load(char *base, char *path, int withUplink) image->working = (image->cache_map == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; - mutex_init( 
&image->lock ); + mutex_init( &image->lock, LOCK_IMAGE ); int32_t offset; if ( stat( path, &st ) == 0 ) { // Negatively offset atime by file modification time diff --git a/src/server/integrity.c b/src/server/integrity.c index a66a364..c52d17b 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -39,7 +39,7 @@ static void* integrity_main(void *data); void integrity_init() { assert( queueLen == -1 ); - mutex_init( &integrityQueueLock ); + mutex_init( &integrityQueueLock, LOCK_INTEGRITY_QUEUE ); pthread_cond_init( &queueSignal, NULL ); mutex_lock( &integrityQueueLock ); queueLen = 0; diff --git a/src/server/locks.c b/src/server/locks.c index 2c0cb27..b39576b 100644 --- a/src/server/locks.c +++ b/src/server/locks.c @@ -12,47 +12,45 @@ #ifdef _DEBUG #define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES) #define MAXTHREADS (SERVER_MAX_CLIENTS + 100) +#define MAXLPT 20 #define LOCKLEN 60 typedef struct { - void *lock; + void * _Atomic lock; ticks locktime; - char locked; - pthread_t thread; + bool _Atomic locked; + pthread_t _Atomic thread; int lockId; + int prio; char name[LOCKLEN]; char where[LOCKLEN]; } debug_lock_t; typedef struct { - pthread_t tid; + pthread_t _Atomic tid; ticks time; char name[LOCKLEN]; char where[LOCKLEN]; - + debug_lock_t *locks[MAXLPT]; } debug_thread_t; int debugThreadCount = 0; static debug_lock_t locks[MAXLOCKS]; static debug_thread_t threads[MAXTHREADS]; -static int init_done = 0; -static pthread_mutex_t initdestory; +static pthread_mutex_t initdestory = PTHREAD_MUTEX_INITIALIZER; static int lockId = 0; -static pthread_t watchdog = 0; -static dnbd3_signal_t* watchdogSignal = NULL; -static void *debug_thread_watchdog(void *something); +#define ULDE(...) 
do { \ + pthread_mutex_unlock( &initdestory ); \ + logadd( LOG_ERROR, __VA_ARGS__ ); \ + debug_dump_lock_stats(); \ + exit( 4 ); \ +} while(0) -int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock) +int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority) { - if ( !init_done ) { - memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) ); - memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) ); - pthread_mutex_init( &initdestory, NULL ); - init_done = 1; - } int first = -1; pthread_mutex_lock( &initdestory ); for (int i = 0; i < MAXLOCKS; ++i) { @@ -63,20 +61,18 @@ int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex if ( first == -1 && locks[i].lock == NULL ) first = i; } if ( first == -1 ) { - logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line ); - pthread_mutex_unlock( &initdestory ); - debug_dump_lock_stats(); - exit( 4 ); + ULDE( "No more free debug locks (%s:%d)\n", file, line ); } locks[first].lock = (void*)lock; - locks[first].locked = 0; + locks[first].locked = false; + locks[first].prio = priority; snprintf( locks[first].name, LOCKLEN, "%s", name ); snprintf( locks[first].where, LOCKLEN, "I %s:%d", file, line ); pthread_mutex_unlock( &initdestory ); return pthread_mutex_init( lock, NULL ); } -int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock) +int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try) { debug_lock_t *l = NULL; pthread_mutex_lock( &initdestory ); @@ -86,163 +82,180 @@ int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex break; } } - pthread_mutex_unlock( &initdestory ); if ( l == NULL ) { - logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - debug_dump_lock_stats(); - exit( 4 ); + ULDE( "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, 
name, file, line ); } debug_thread_t *t = NULL; - pthread_mutex_lock( &initdestory ); + int first = -1; + const pthread_t self = pthread_self(); for (int i = 0; i < MAXTHREADS; ++i) { - if ( threads[i].tid != 0 ) continue; - threads[i].tid = pthread_self(); - timing_get( &threads[i].time ); - snprintf( threads[i].name, LOCKLEN, "%s", name ); - snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line ); - t = &threads[i]; - break; - } - pthread_mutex_unlock( &initdestory ); - if ( t == NULL ) { - logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); - } - const int retval = pthread_mutex_lock( lock ); - pthread_mutex_lock( &initdestory ); - t->tid = 0; - pthread_mutex_unlock( &initdestory ); - if ( l->locked ) { - logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); - } - l->locked = 1; - timing_get( &l->locktime ); - l->thread = pthread_self(); - snprintf( l->where, LOCKLEN, "L %s:%d", file, line ); - pthread_mutex_lock( &initdestory ); - l->lockId = ++lockId; - pthread_mutex_unlock( &initdestory ); - return retval; -} - -int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock) -{ - debug_lock_t *l = NULL; - pthread_mutex_lock( &initdestory ); - for (int i = 0; i < MAXLOCKS; ++i) { - if ( locks[i].lock == lock ) { - l = &locks[i]; + if ( threads[i].tid == self ) { + t = &threads[i]; break; } + if ( first == -1 && threads[i].tid == 0 ) { + first = i; + } } - pthread_mutex_unlock( &initdestory ); - if ( l == NULL ) { - logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - debug_dump_lock_stats(); - exit( 4 ); - } - debug_thread_t *t = NULL; - pthread_mutex_lock( &initdestory ); - for (int i = 0; i < MAXTHREADS; ++i) { - if ( threads[i].tid != 0 ) continue; - threads[i].tid = pthread_self(); - timing_get( &threads[i].time 
); - snprintf( threads[i].name, LOCKLEN, "%s", name ); - snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line ); - t = &threads[i]; - break; - } - pthread_mutex_unlock( &initdestory ); + int idx; if ( t == NULL ) { - logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + if ( first == -1 ) { + ULDE( "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + } + t = &threads[first]; + timing_get( &t->time ); + t->tid = self; + snprintf( t->name, LOCKLEN, "%s", name ); + snprintf( t->where, LOCKLEN, "%s:%d", file, line ); + memset( t->locks, 0, sizeof(t->locks) ); + idx = 0; + } else { + // Thread already has locks, check for order violation + idx = -1; + for (int i = 0; i < MAXLPT; ++i) { + if ( t->locks[i] == NULL ) { + if ( idx == -1 ) { + idx = i; + } + continue; + } + if ( t->locks[i]->prio >= l->prio ) { + ULDE( "Lock priority violation: %s at %s:%d (%d) when already holding %s at %s (%d)", + name, file, line, l->prio, + t->locks[i]->name, t->locks[i]->where, t->locks[i]->prio ); + } + if ( t->locks[i] == l ) { + ULDE( "Tried to recusively lock %s in the same thread. Tried at %s:%d, when already locked at %s", + name, file, line, t->locks[i]->name ); + } + } + if ( idx == -1 ) { + ULDE( "Thread %d tried to lock more than %d locks.", (int)self, (int)MAXLPT ); + } } - const int retval = pthread_mutex_trylock( lock ); - pthread_mutex_lock( &initdestory ); - t->tid = 0; pthread_mutex_unlock( &initdestory ); + const int retval = try ? 
pthread_mutex_trylock( lock ) : pthread_mutex_lock( lock ); if ( retval == 0 ) { + timing_get( &l->locktime ); + l->thread = self; + snprintf( l->where, LOCKLEN, "L %s:%d", file, line ); + pthread_mutex_lock( &initdestory ); if ( l->locked ) { logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line ); exit( 4 ); } - l->locked = 1; - timing_get( &l->locktime ); - l->thread = pthread_self(); - snprintf( l->where, LOCKLEN, "L %s:%d", file, line ); - pthread_mutex_lock( &initdestory ); + l->locked = true; + t->locks[idx] = l; l->lockId = ++lockId; pthread_mutex_unlock( &initdestory ); + } else if ( !try || retval != EBUSY ) { + logadd( LOG_ERROR, "Acquiring lock %s at %s:%d failed with error code %d", name, file, line, retval ); + debug_dump_lock_stats(); + exit( 4 ); } return retval; } int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock) { - debug_lock_t *l = NULL; + debug_thread_t *t = NULL; + pthread_t self = pthread_self(); pthread_mutex_lock( &initdestory ); - for (int i = 0; i < MAXLOCKS; ++i) { - if ( locks[i].lock == lock ) { - l = &locks[i]; + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid == self ) { + t = &threads[i]; break; } } - pthread_mutex_unlock( &initdestory ); - if ( l == NULL ) { - logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + if ( t == NULL ) { + ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line ); } - if ( !l->locked ) { - logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + int idx = -1; + int cnt = 0; + for (int i = 0; i < MAXLPT; ++i) { + if ( t->locks[i] == NULL ) + continue; + cnt++; + if ( t->locks[i]->lock == lock ) { + idx = i; + } + } + if ( idx == -1 ) { + ULDE( "Unlock: Calling thread doesn't hold lock %s at %s:%d", name, file, line ); } - l->locked = 0; + 
debug_lock_t *l = t->locks[idx]; + if ( l->thread != self || !l->locked ) { + ULDE( "Unlock sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line ); + } + l->locked = false; l->thread = 0; + t->locks[idx] = NULL; + if ( cnt == 1 ) { + t->tid = 0; // No more locks held, free up slot + } snprintf( l->where, LOCKLEN, "U %s:%d", file, line ); - int retval = pthread_mutex_unlock( lock ); + pthread_mutex_unlock( &initdestory ); + const int retval = pthread_mutex_unlock( lock ); + if ( retval != 0 ) { + logadd( LOG_ERROR, "pthread_mutex_unlock returned %d for %s at %s:%d", retval, name, file, line ); + exit( 4 ); + } return retval; } int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock) { debug_lock_t *l = NULL; + debug_thread_t *t = NULL; + pthread_t self = pthread_self(); pthread_mutex_lock( &initdestory ); - for (int i = 0; i < MAXLOCKS; ++i) { - if ( locks[i].lock == lock ) { - l = &locks[i]; + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid == self ) { + t = &threads[i]; break; } } - pthread_mutex_unlock( &initdestory ); + if ( t == NULL ) { + ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line ); + } + int mp = 0, mpi = -1; + for (int i = 0; i < MAXLPT; ++i) { + if ( t->locks[i] == NULL ) + continue; + if ( t->locks[i]->lock == lock ) { + l = t->locks[i]; + } else if ( t->locks[i]->prio > mp ) { + mp = t->locks[i]->prio; + mpi = i; + } + } if ( l == NULL ) { - logadd( LOG_ERROR, "Tried to cond_wait on uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + ULDE( "cond_wait: Calling thread doesn't hold lock %s at %s:%d", name, file, line ); } - if ( !l->locked ) { - logadd( LOG_ERROR, "Cond_wait sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + if ( l->thread != self || !l->locked ) 
{ + ULDE( "cond_wait: Sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line ); } - pthread_t self = pthread_self(); - if ( l->thread != self ) { - logadd( LOG_ERROR, "Cond_wait called from non-owning thread for %p (%s) at %s:%d\n", (void*)lock, name, file, line ); - exit( 4 ); + if ( mp >= l->prio ) { + ULDE( "cond_wait: Yielding a mutex while holding another one with higher prio: %s at %s:%d (%d) while also holding %s at %s (%d)", + name, file, line, l->prio, + t->locks[mpi]->name, t->locks[mpi]->where, mp ); } - l->locked = 0; + l->locked = false; l->thread = 0; - snprintf( l->where, LOCKLEN, "CW %s:%d", file, line ); + snprintf( l->where, LOCKLEN, "CWU %s:%d", file, line ); + pthread_mutex_unlock( &initdestory ); int retval = pthread_cond_wait( cond, lock ); if ( retval != 0 ) { logadd( LOG_ERROR, "pthread_cond_wait returned %d for lock %p (%s) at %s:%d\n", retval, (void*)lock, name, file, line ); exit( 4 ); } - if ( l->locked != 0 || l->thread != 0 ) { + if ( l->locked || l->thread != 0 ) { logadd( LOG_ERROR, "Lock is not free after returning from pthread_cond_wait for %p (%s) at %s:%d\n", (void*)lock, name, file, line ); exit( 4 ); } - l->locked = 1; l->thread = self; timing_get( &l->locktime ); + l->locked = true; pthread_mutex_lock( &initdestory ); l->lockId = ++lockId; pthread_mutex_unlock( &initdestory ); @@ -290,63 +303,21 @@ void debug_dump_lock_stats() "* Locked: %d\n", locks[i].name, locks[i].where, (int)locks[i].locked ); } } - printf( "\n **** WAITING THREADS ****\n\n" ); + printf( "\n **** ACTIVE THREADS ****\n\n" ); for (int i = 0; i < MAXTHREADS; ++i) { - if ( threads[i].tid == 0 ) continue; + if ( threads[i].tid == 0 ) + continue; printf( "* *** Thread %d ***\n" "* Lock: %s\n" "* Where: %s\n" "* How long: %d secs\n", (int)threads[i].tid, threads[i].name, threads[i].where, (int)timing_diff( &threads[i].time, &now ) ); - } - pthread_mutex_unlock( &initdestory 
); -} - -static void *debug_thread_watchdog(void *something UNUSED) -{ - setThreadName( "debug-watchdog" ); - while ( !_shutdown ) { - if ( init_done ) { - declare_now; - pthread_mutex_lock( &initdestory ); - for (int i = 0; i < MAXTHREADS; ++i) { - if ( threads[i].tid == 0 ) continue; - const uint32_t diff = timing_diff( &threads[i].time, &now ); - if ( diff > 6 && diff < 100000 ) { - printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" ); - pthread_mutex_unlock( &initdestory ); - debug_dump_lock_stats(); - exit( 99 ); - } - } - pthread_mutex_unlock( &initdestory ); + for (int j = 0; j < MAXLPT; ++j) { + if ( threads[i].locks[j] == NULL ) + continue; + printf( " * Lock %s @ %s\n", threads[i].locks[j]->name, threads[i].locks[j]->where ); } - if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) sleep( 5 ); } - return NULL ; -} - -#endif - -void debug_locks_start_watchdog() -{ -#ifdef _DEBUG - watchdogSignal = signal_new(); - if ( 0 != thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) ) { - logadd( LOG_ERROR, "Could not start debug-lock watchdog." 
); - return; - } -#endif + pthread_mutex_unlock( &initdestory ); } -void debug_locks_stop_watchdog() -{ -#ifdef _DEBUG - _shutdown = true; - printf( "Killing debug watchdog...\n" ); - pthread_mutex_lock( &initdestory ); - signal_call( watchdogSignal ); - pthread_mutex_unlock( &initdestory ); - thread_join( watchdog, NULL ); - signal_close( watchdogSignal ); #endif -} diff --git a/src/server/locks.h b/src/server/locks.h index 7f72722..e5c9801 100644 --- a/src/server/locks.h +++ b/src/server/locks.h @@ -5,19 +5,38 @@ #include #include #include +#include + +// Lock priority + +#define LOCK_RELOAD 90 +#define LOCK_LOAD_CONFIG 100 +#define LOCK_REMOTE_CLONE 110 +#define LOCK_CLIENT_LIST 120 +#define LOCK_CLIENT 130 +#define LOCK_INTEGRITY_QUEUE 140 +#define LOCK_IMAGE_LIST 150 +#define LOCK_IMAGE 160 +#define LOCK_UPLINK_QUEUE 170 +#define LOCK_ALT_SERVER_LIST 180 +#define LOCK_CLIENT_SEND 190 +#define LOCK_UPLINK_RTT 200 +#define LOCK_UPLINK_SEND 210 +#define LOCK_RPC_ACL 220 + +// #ifdef _DEBUG -#define mutex_init( lock ) debug_mutex_init( #lock, __FILE__, __LINE__, lock) -#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock) -#define mutex_trylock( lock ) debug_mutex_trylock( #lock, __FILE__, __LINE__, lock) +#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio) +#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false) +#define mutex_trylock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, true) #define mutex_unlock( lock ) debug_mutex_unlock( #lock, __FILE__, __LINE__, lock) #define mutex_cond_wait( cond, lock ) debug_mutex_cond_wait( #lock, __FILE__, __LINE__, cond, lock) #define mutex_destroy( lock ) debug_mutex_destroy( #lock, __FILE__, __LINE__, lock) -int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock); -int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock); -int debug_mutex_trylock(const 
char *name, const char *file, int line, pthread_mutex_t *lock); +int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority); +int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try); int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock); int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock); int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mutex_t *lock); @@ -27,7 +46,7 @@ void debug_dump_lock_stats(); #else -#define mutex_init( lock ) pthread_mutex_init(lock, NULL) +#define mutex_init( lock, prio ) pthread_mutex_init(lock, NULL) #define mutex_lock( lock ) pthread_mutex_lock(lock) #define mutex_trylock( lock ) pthread_mutex_trylock(lock) #define mutex_unlock( lock ) pthread_mutex_unlock(lock) @@ -82,7 +101,4 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr) #endif -void debug_locks_start_watchdog(); -void debug_locks_stop_watchdog(); - #endif /* LOCKS_H_ */ diff --git a/src/server/net.c b/src/server/net.c index 92728c0..8f97a12 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -145,7 +145,7 @@ static inline bool sendPadding( const int fd, uint32_t bytes ) void net_init() { - mutex_init( &_clients_lock ); + mutex_init( &_clients_lock, LOCK_CLIENT_LIST ); } void* net_handleNewConnection(void *clientPtr) @@ -186,8 +186,8 @@ void* net_handleNewConnection(void *clientPtr) } } while (0); // Fully init client struct - mutex_init( &client->lock ); - mutex_init( &client->sendMutex ); + mutex_init( &client->lock, LOCK_CLIENT ); + mutex_init( &client->sendMutex, LOCK_CLIENT_SEND ); mutex_lock( &client->lock ); host_to_string( &client->host, client->hostName, HOSTNAMELEN ); diff --git a/src/server/rpc.c b/src/server/rpc.c index 5dbcafe..261c6c0 100644 --- a/src/server/rpc.c +++ b/src/server/rpc.c @@ -75,10 +75,9 
@@ static json_int_t randomRunId; static pthread_mutex_t aclLock; #define MAX_CLIENTS 50 #define CUTOFF_START 40 -static pthread_mutex_t statusLock; static struct { - int count; - bool overloaded; + atomic_int count; + atomic_bool overloaded; } status; static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive); @@ -91,8 +90,7 @@ static void loadAcl(); void rpc_init() { - mutex_init( &aclLock ); - mutex_init( &statusLock ); + mutex_init( &aclLock, LOCK_RPC_ACL ); randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL); // if ( sizeof(randomRunId) > 4 ) { @@ -123,10 +121,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int return; } do { - mutex_lock( &statusLock ); const int curCount = ++status.count; UPDATE_LOADSTATE( curCount ); - mutex_unlock( &statusLock ); if ( curCount > MAX_CLIENTS ) { sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE ); goto func_return; @@ -198,9 +194,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) { keepAlive = HTTP_CLOSE; } else { // And if there aren't too many active HTTP sessions - mutex_lock( &statusLock ); if ( status.overloaded ) keepAlive = HTTP_CLOSE; - mutex_unlock( &statusLock ); } } if ( method.s != NULL && path.s != NULL ) { @@ -234,10 +228,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int } while (true); func_return:; do { - mutex_lock( &statusLock ); const int curCount = --status.count; UPDATE_LOADSTATE( curCount ); - mutex_unlock( &statusLock ); } while (0); } diff --git a/src/server/server.c b/src/server/server.c index 10ab208..838aec2 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -133,9 +133,6 @@ void dnbd3_cleanup() // Wait for clients to disconnect net_waitForAllDisconnected(); - // 
Watchdog not needed anymore - debug_locks_stop_watchdog(); - // Clean up images retries = 5; while ( !image_tryFreeAll() && --retries > 0 ) { @@ -303,10 +300,6 @@ int main(int argc, char *argv[]) logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir ); } -#ifdef _DEBUG - debug_locks_start_watchdog(); -#endif - // setup signal handler struct sigaction sa; memset( &sa, 0, sizeof(sa) ); diff --git a/src/server/uplink.c b/src/server/uplink.c index bb1ffdc..9570273 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -89,9 +89,9 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version goto failure; } link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) ); - mutex_init( &link->queueLock ); - mutex_init( &link->rttLock ); - mutex_init( &link->sendMutex ); + mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE ); + mutex_init( &link->rttLock, LOCK_UPLINK_RTT ); + mutex_init( &link->sendMutex, LOCK_UPLINK_SEND ); link->image = image; link->bytesReceived = 0; link->idleTime = 0; -- cgit v1.2.3-55-g7522 From da0950ad342bae3b40a74bf82dba6c1f82e7eb57 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sun, 18 Aug 2019 21:31:56 +0200 Subject: [SERVER] uplink: More consistent type/variable naming * Change link to uplink everywhere * dnbd3_connection_t -> dnbd3_uplink_t --- src/server/altservers.c | 10 +- src/server/altservers.h | 4 +- src/server/globals.h | 12 +- src/server/uplink.c | 554 ++++++++++++++++++++++++------------------------ src/server/uplink.h | 2 +- 5 files changed, 294 insertions(+), 288 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 60c046c..1001981 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -14,7 +14,7 @@ #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) -static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; +static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; static dnbd3_signal_t * _Atomic runSignal = NULL; static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; @@ -121,7 +121,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate /** * ONLY called from the passed uplink's main thread */ -void altservers_findUplink(dnbd3_connection_t *uplink) +void altservers_findUplink(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; @@ -149,7 +149,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink) uplink->rttTestResult = RTT_INPROGRESS; for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { if ( pending[i] != NULL ) continue; - dnbd3_connection_t *null = NULL; + dnbd3_uplink_t *null = NULL; if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) { mutex_unlock( &uplink->rttLock ); atomic_thread_fence( memory_order_release ); @@ -167,7 +167,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink) * The given uplink is about to disappear, * wait until any pending RTT check is done. 
*/ -void altservers_removeUplink(dnbd3_connection_t *uplink) +void altservers_removeUplink(dnbd3_uplink_t *uplink) { assert( uplink != NULL ); assert( uplink->shutdown ); @@ -453,7 +453,7 @@ static void *altservers_main(void *data UNUSED) // Work your way through the queue atomic_thread_fence( memory_order_acquire ); for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) { - dnbd3_connection_t * const uplink = pending[itLink]; + dnbd3_uplink_t * const uplink = pending[itLink]; if ( uplink == NULL ) continue; // First, get 4 alt servers diff --git a/src/server/altservers.h b/src/server/altservers.h index 7b7b46d..e03b900 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -13,9 +13,9 @@ int altservers_load(); bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly); -void altservers_findUplink(dnbd3_connection_t *uplink); +void altservers_findUplink(dnbd3_uplink_t *uplink); -void altservers_removeUplink(dnbd3_connection_t *uplink); +void altservers_removeUplink(dnbd3_uplink_t *uplink); int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size); diff --git a/src/server/globals.h b/src/server/globals.h index 86b8865..0371e33 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -13,7 +13,7 @@ typedef struct timespec ticks; // ######### All structs/types used by the server ######## -typedef struct _dnbd3_connection dnbd3_connection_t; +typedef struct _dnbd3_uplink dnbd3_uplink_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; @@ -30,12 +30,18 @@ typedef struct uint8_t hopCount; // How many hops this request has already taken across proxies } dnbd3_queued_request_t; +typedef struct { + int fd; + int version; + dnbd3_host_t host; +} dnbd3_server_connection_t; + #define RTT_IDLE 0 // Not in progress #define RTT_INPROGRESS 1 // In progess, not finished #define RTT_DONTCHANGE 2 // Finished, but no better 
alternative found #define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd #define RTT_NOT_REACHABLE 4 // No uplink was reachable -struct _dnbd3_connection +struct _dnbd3_uplink { int fd; // socket fd to remote server int version; // remote server protocol version @@ -94,7 +100,7 @@ struct _dnbd3_image { char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) - dnbd3_connection_t *uplink; // pointer to a server connection + dnbd3_uplink_t *uplink; // pointer to a server connection uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk diff --git a/src/server/uplink.c b/src/server/uplink.c index 9570273..7d66b21 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -46,16 +46,16 @@ static const char *const NAMES_ULR[4] = { static atomic_uint_fast64_t totalBytesReceived = 0; static void* uplink_mainloop(void *data); -static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly); -static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex); -static void uplink_handleReceive(dnbd3_connection_t *link); +static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); +static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); +static void uplink_handleReceive(dnbd3_uplink_t *uplink); static int uplink_sendKeepalive(const int fd); -static void uplink_addCrc32(dnbd3_connection_t *uplink); -static void uplink_sendReplicationRequest(dnbd3_connection_t *link); -static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force); -static bool uplink_saveCacheMap(dnbd3_connection_t *link); -static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link); -static void 
uplink_connectionFailed(dnbd3_connection_t *link, bool findNew); +static void uplink_addCrc32(dnbd3_uplink_t *uplink); +static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); +static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); +static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); +static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); +static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); // ############ uplink connection handling @@ -76,7 +76,7 @@ uint64_t uplink_getTotalBytesReceived() bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version) { if ( !_isProxy || _shutdown ) return false; - dnbd3_connection_t *link = NULL; + dnbd3_uplink_t *uplink = NULL; assert( image != NULL ); mutex_lock( &image->lock ); if ( image->uplink != NULL && !image->uplink->shutdown ) { @@ -88,44 +88,44 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } - link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) ); - mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE ); - mutex_init( &link->rttLock, LOCK_UPLINK_RTT ); - mutex_init( &link->sendMutex, LOCK_UPLINK_SEND ); - link->image = image; - link->bytesReceived = 0; - link->idleTime = 0; - link->queueLen = 0; - mutex_lock( &link->sendMutex ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->cacheFd = -1; - link->signal = NULL; - link->replicationHandle = REP_NONE; - mutex_lock( &link->rttLock ); - link->cycleDetected = false; + uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE ); + mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT ); + mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); + uplink->image = image; + uplink->bytesReceived = 0; + uplink->idleTime = 0; + uplink->queueLen = 0; + mutex_lock( 
&uplink->sendMutex ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->cacheFd = -1; + uplink->signal = NULL; + uplink->replicationHandle = REP_NONE; + mutex_lock( &uplink->rttLock ); + uplink->cycleDetected = false; if ( sock >= 0 ) { - link->betterFd = sock; - link->betterServer = *host; - link->rttTestResult = RTT_DOCHANGE; - link->betterVersion = version; + uplink->betterFd = sock; + uplink->betterServer = *host; + uplink->rttTestResult = RTT_DOCHANGE; + uplink->betterVersion = version; } else { - link->betterFd = -1; - link->rttTestResult = RTT_IDLE; + uplink->betterFd = -1; + uplink->rttTestResult = RTT_IDLE; } - mutex_unlock( &link->rttLock ); - link->recvBufferLen = 0; - link->shutdown = false; - if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) { + mutex_unlock( &uplink->rttLock ); + uplink->recvBufferLen = 0; + uplink->shutdown = false; + if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) { logadd( LOG_ERROR, "Could not start thread for new uplink."
); goto failure; } mutex_unlock( &image->lock ); return true; failure: ; - if ( link != NULL ) { - free( link ); - link = image->uplink = NULL; + if ( uplink != NULL ) { + free( uplink ); + uplink = image->uplink = NULL; } mutex_unlock( &image->lock ); return false; @@ -146,7 +146,7 @@ void uplink_shutdown(dnbd3_image_t *image) mutex_unlock( &image->lock ); return; } - dnbd3_connection_t * const uplink = image->uplink; + dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); if ( !uplink->shutdown ) { uplink->shutdown = true; @@ -170,7 +170,7 @@ void uplink_shutdown(dnbd3_image_t *image) * Remove given client from uplink request queue * Locks on: uplink.queueLock */ -void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client) +void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) { mutex_lock( &uplink->queueLock ); for (int i = uplink->queueLen - 1; i >= 0; --i) { @@ -203,7 +203,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); return false; } - dnbd3_connection_t * const uplink = client->image->uplink; + dnbd3_uplink_t * const uplink = client->image->uplink; if ( uplink->shutdown ) { mutex_unlock( &client->image->lock ); logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" ); @@ -370,7 +370,7 @@ static void* uplink_mainloop(void *data) #define EV_SOCKET (1) #define EV_COUNT (2) struct pollfd events[EV_COUNT]; - dnbd3_connection_t * const link = (dnbd3_connection_t*)data; + dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; int numSocks, i, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; uint32_t discoverFailCount = 0; @@ -381,31 +381,31 @@ static void* uplink_mainloop(void *data) timing_get( &nextAltCheck ); lastKeepalive = nextAltCheck; // - assert( link != NULL ); + assert( uplink != NULL ); setThreadName( "idle-uplink" ); blockNoncriticalSignals(); // Make sure file 
is open for writing - if ( !uplink_reopenCacheFd( link, false ) ) { + if ( !uplink_reopenCacheFd( uplink, false ) ) { // It might have failed - still offer proxy mode, we just can't cache - logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno ); + logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno ); } // - link->signal = signal_new(); - if ( link->signal == NULL ) { + uplink->signal = signal_new(); + if ( uplink->signal == NULL ) { logadd( LOG_WARNING, "error creating signal. Uplink unavailable." ); goto cleanup; } events[EV_SIGNAL].events = POLLIN; - events[EV_SIGNAL].fd = signal_getWaitFd( link->signal ); + events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal ); events[EV_SOCKET].fd = -1; - while ( !_shutdown && !link->shutdown ) { + while ( !_shutdown && !uplink->shutdown ) { // poll() - mutex_lock( &link->rttLock ); - waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1; + mutex_unlock( &uplink->rttLock ); if ( waitTime == 0 ) { // Nothing - } else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) { + } else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { waitTime = 1000; } else { declare_now; @@ -413,9 +413,9 @@ static void* uplink_mainloop(void *data) if ( waitTime < 100 ) waitTime = 100; if ( waitTime > 5000 ) waitTime = 5000; } - events[EV_SOCKET].fd = link->fd; + events[EV_SOCKET].fd = uplink->fd; numSocks = poll( events, EV_COUNT, waitTime ); - if ( _shutdown || link->shutdown ) goto cleanup; + if ( _shutdown || uplink->shutdown ) goto cleanup; if ( numSocks == -1 ) { // Error? 
if ( errno == EINTR ) continue; logadd( LOG_DEBUG1, "poll() error %d", (int)errno ); @@ -423,39 +423,39 @@ static void* uplink_mainloop(void *data) continue; } // Check if server switch is in order - mutex_lock( &link->rttLock ); - if ( link->rttTestResult != RTT_DOCHANGE ) { - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + if ( uplink->rttTestResult != RTT_DOCHANGE ) { + mutex_unlock( &uplink->rttLock ); } else { - link->rttTestResult = RTT_IDLE; + uplink->rttTestResult = RTT_IDLE; // The rttTest worker thread has finished our request. // And says it's better to switch to another server - const int fd = link->fd; - mutex_lock( &link->sendMutex ); - link->fd = link->betterFd; - mutex_unlock( &link->sendMutex ); - link->betterFd = -1; - link->currentServer = link->betterServer; - link->version = link->betterVersion; - link->cycleDetected = false; - mutex_unlock( &link->rttLock ); + const int fd = uplink->fd; + mutex_lock( &uplink->sendMutex ); + uplink->fd = uplink->betterFd; + mutex_unlock( &uplink->sendMutex ); + uplink->betterFd = -1; + uplink->currentServer = uplink->betterServer; + uplink->version = uplink->betterVersion; + uplink->cycleDetected = false; + mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); - link->replicationHandle = REP_NONE; - link->image->working = true; - link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received + uplink->replicationHandle = REP_NONE; + uplink->image->working = true; + uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { - logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 ); + if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { + logadd( LOG_DEBUG1, "(Uplink %s) Now connected to 
%s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } // If we don't have a crc32 list yet, see if the new server has one - if ( link->image->crc32 == NULL ) { - uplink_addCrc32( link ); + if ( uplink->image->crc32 == NULL ) { + uplink_addCrc32( uplink ); } // Re-send all pending requests - uplink_sendRequests( link, false ); - uplink_sendReplicationRequest( link ); + uplink_sendRequests( uplink, false ); + uplink_sendReplicationRequest( uplink ); events[EV_SOCKET].events = POLLIN | POLLRDHUP; timing_gets( &nextAltCheck, altCheckInterval ); // The rtt worker already did the handshake for our image, so there's nothing @@ -468,161 +468,161 @@ static void* uplink_mainloop(void *data) goto cleanup; } else if ( (events[EV_SIGNAL].revents & POLLIN) ) { // signal triggered -> pending requests - if ( signal_clear( link->signal ) == SIGNAL_ERROR ) { - logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name ); + if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) { + logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name ); } - if ( link->fd != -1 ) { + if ( uplink->fd != -1 ) { // Uplink seems fine, relay requests to it... 
- uplink_sendRequests( link, true ); + uplink_sendRequests( uplink, true ); } else { // No uplink; maybe it was shutdown since it was idle for too long - link->idleTime = 0; + uplink->idleTime = 0; } } // Uplink socket if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" ); setThreadName( "panic-uplink" ); } else if ( (events[EV_SOCKET].revents & POLLIN) ) { - uplink_handleReceive( link ); - if ( _shutdown || link->shutdown ) goto cleanup; + uplink_handleReceive( uplink ); + if ( _shutdown || uplink->shutdown ) goto cleanup; } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { lastKeepalive = now; - link->idleTime += timepassed; + uplink->idleTime += timepassed; unsavedSeconds += timepassed; - if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) { - // fsync/save every 4 minutes, or every 60 seconds if link is idle + if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) { + // fsync/save every 4 minutes, or every 60 seconds if uplink is idle unsavedSeconds = 0; - uplink_saveCacheMap( link ); + uplink_saveCacheMap( uplink ); } // Keep-alive - if ( link->fd != -1 && link->replicationHandle == REP_NONE ) { + if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( link->fd ) ) { + if ( uplink_sendKeepalive( uplink->fd ) ) { // Re-trigger periodically, in case it requires a minimum user count - uplink_sendReplicationRequest( link ); + uplink_sendReplicationRequest( uplink ); } else { - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" ); setThreadName( 
"panic-uplink" ); } } - // Don't keep link established if we're idle for too much - if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) { - mutex_lock( &link->sendMutex ); - close( link->fd ); - link->fd = events[EV_SOCKET].fd = -1; - mutex_unlock( &link->sendMutex ); - link->cycleDetected = false; - if ( link->recvBufferLen != 0 ) { - link->recvBufferLen = 0; - free( link->recvBuffer ); - link->recvBuffer = NULL; + // Don't keep uplink established if we're idle for too much + if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { + mutex_lock( &uplink->sendMutex ); + close( uplink->fd ); + uplink->fd = events[EV_SOCKET].fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->cycleDetected = false; + if ( uplink->recvBufferLen != 0 ) { + uplink->recvBufferLen = 0; + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; } - logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid ); + logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid ); setThreadName( "idle-uplink" ); } } // See if we should trigger an RTT measurement - mutex_lock( &link->rttLock ); - const int rttTestResult = link->rttTestResult; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + const int rttTestResult = uplink->rttTestResult; + mutex_unlock( &uplink->rttLock ); if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check - if ( image_isComplete( link->image ) ) { + if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete - logadd( LOG_INFO, "Replication of %s complete.", link->image->name ); + logadd( LOG_INFO, 
"Replication of %s complete.", uplink->image->name ); setThreadName( "finished-uplink" ); goto cleanup; - } else if ( !uplink_connectionShouldShutdown( link ) ) { + } else if ( !uplink_connectionShouldShutdown( uplink ) ) { // Not complete - do measurement - altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous) - if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = 0; + altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous) + if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = 0; } } altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX); timing_set( &nextAltCheck, &now, altCheckInterval ); } } else if ( rttTestResult == RTT_NOT_REACHABLE ) { - mutex_lock( &link->rttLock ); - link->rttTestResult = RTT_IDLE; - mutex_unlock( &link->rttLock ); + mutex_lock( &uplink->rttLock ); + uplink->rttTestResult = RTT_IDLE; + mutex_unlock( &uplink->rttLock ); discoverFailCount++; timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? 
altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG - if ( link->fd != -1 && !link->shutdown ) { + if ( uplink->fd != -1 && !uplink->shutdown ) { bool resend = false; ticks deadline; timing_set( &deadline, &now, -10 ); - mutex_lock( &link->queueLock ); - for (i = 0; i < link->queueLen; ++i) { - if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) { + mutex_lock( &uplink->queueLock ); + for (i = 0; i < uplink->queueLen; ++i) { + if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name, - link->queue[i].from, link->queue[i].to, link->queue[i].status ); - link->queue[i].entered = now; + "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name, + uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); + uplink->queue[i].entered = now; #ifdef _DEBUG_RESEND_STARVING - link->queue[i].status = ULR_NEW; + uplink->queue[i].status = ULR_NEW; resend = true; #endif - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "%s", buffer ); - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( resend ) - uplink_sendRequests( link, true ); + uplink_sendRequests( uplink, true ); } #endif } cleanup: ; - if ( !link->shutdown ) { - link->shutdown = true; - thread_detach( link->thread ); + if ( !uplink->shutdown ) { + uplink->shutdown = true; + thread_detach( uplink->thread ); } - altservers_removeUplink( link ); - uplink_saveCacheMap( link ); - mutex_lock( &link->image->lock ); - if ( link->image->uplink == link ) { - link->image->uplink = NULL; + altservers_removeUplink( uplink ); + 
uplink_saveCacheMap( uplink ); + mutex_lock( &uplink->image->lock ); + if ( uplink->image->uplink == uplink ) { + uplink->image->uplink = NULL; } - mutex_lock( &link->queueLock ); - const int fd = link->fd; - const dnbd3_signal_t* signal = link->signal; - mutex_lock( &link->sendMutex ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->signal = NULL; - // Do not access link->image after unlocking, since we set + mutex_lock( &uplink->queueLock ); + const int fd = uplink->fd; + const dnbd3_signal_t* signal = uplink->signal; + mutex_lock( &uplink->sendMutex ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->signal = NULL; + // Do not access uplink->image after unlocking, since we set // image->uplink to NULL. Acquire with image_lock first, // like done below when checking whether to re-init uplink - mutex_unlock( &link->image->lock ); - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->image->lock ); + mutex_unlock( &uplink->queueLock ); if ( fd != -1 ) close( fd ); if ( signal != NULL ) signal_close( signal ); // Wait for the RTT check to finish/fail if it's in progress - while ( link->rttTestResult == RTT_INPROGRESS ) + while ( uplink->rttTestResult == RTT_INPROGRESS ) usleep( 10000 ); - if ( link->betterFd != -1 ) { - close( link->betterFd ); + if ( uplink->betterFd != -1 ) { + close( uplink->betterFd ); } - mutex_destroy( &link->queueLock ); - mutex_destroy( &link->rttLock ); - mutex_destroy( &link->sendMutex ); - free( link->recvBuffer ); - link->recvBuffer = NULL; - if ( link->cacheFd != -1 ) { - close( link->cacheFd ); + mutex_destroy( &uplink->queueLock ); + mutex_destroy( &uplink->rttLock ); + mutex_destroy( &uplink->sendMutex ); + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; + if ( uplink->cacheFd != -1 ) { + close( uplink->cacheFd ); } - dnbd3_image_t *image = image_lock( link->image ); - free( link ); // !!! + dnbd3_image_t *image = image_lock( uplink->image ); + free( uplink ); // !!! 
if ( image != NULL ) { if ( !_shutdown && image->cache_map != NULL ) { // Ingegrity checker must have found something in the meantime @@ -633,37 +633,37 @@ static void* uplink_mainloop(void *data) return NULL ; } -static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly) +static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { // Scan for new requests int j; - mutex_lock( &link->queueLock ); - for (j = 0; j < link->queueLen; ++j) { - if ( link->queue[j].status != ULR_NEW && (newOnly || link->queue[j].status != ULR_PENDING) ) continue; - link->queue[j].status = ULR_PENDING; - uint8_t hops = link->queue[j].hopCount; - const uint64_t reqStart = link->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((link->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); + mutex_lock( &uplink->queueLock ); + for (j = 0; j < uplink->queueLen; ++j) { + if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue; + uplink->queue[j].status = ULR_PENDING; + uint8_t hops = uplink->queue[j].hopCount; + const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); /* logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)link, j, link->queue[j].status, link->queue[j].handle, link->queue[j].from, link->queue[j].to, reqStart, reqStart+reqSize ); + (void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); */ - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; - mutex_lock( &link->sendMutex ); - const bool ret = dnbd3_get_block( link->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( 
link->version, hops ) ); - mutex_unlock( &link->sendMutex ); + mutex_lock( &uplink->sendMutex ); + const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + mutex_unlock( &uplink->sendMutex ); if ( !ret ) { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &link->currentServer ); + altservers_serverFailed( &uplink->currentServer ); return; } - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); } /** @@ -676,13 +676,13 @@ static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly) * the code simpler. Worst case would be only one bit is zero, which means * 4kb are missing, but we will request 32kb. */ -static void uplink_sendReplicationRequest(dnbd3_connection_t *link) +static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( link == NULL || link->fd == -1 ) return; - if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication - if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE ) + if ( uplink == NULL || uplink->fd == -1 ) return; + if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication + if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) return; - dnbd3_image_t * const image = link->image; + dnbd3_image_t * const image = uplink->image; if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; mutex_lock( &image->lock ); if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { @@ -694,17 +694,17 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) const int lastBlockIndex = 
mapBytes - 1; int endByte; if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks - endByte = link->nextReplicationIndex + mapBytes; + endByte = uplink->nextReplicationIndex + mapBytes; } else { // Hashblock based: Only look for match in current hash block - endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; + endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; if ( endByte > mapBytes ) { endByte = mapBytes; } } int replicationIndex = -1; - for ( int j = link->nextReplicationIndex; j < endByte; ++j ) { + for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) { + if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { // Found incomplete one replicationIndex = i; break; @@ -713,31 +713,31 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) mutex_unlock( &image->lock ); if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte ); + replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); } if ( replicationIndex == -1 ) { // Replication might be complete, uplink_mainloop should take care.... 
- link->nextReplicationIndex = -1; + uplink->nextReplicationIndex = -1; return; } const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; - link->replicationHandle = offset; + uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); - mutex_lock( &link->sendMutex ); - bool sendOk = dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) ); - mutex_unlock( &link->sendMutex ); + mutex_lock( &uplink->sendMutex ); + bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) ); + mutex_unlock( &uplink->sendMutex ); if ( !sendOk ) { logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); return; } if ( replicationIndex == lastBlockIndex ) { - link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks } - link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter if ( _backgroundReplication == BGR_HASHBLOCK - && link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { // Just crossed a hash block boundary, look for new candidate starting at this very index - link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex ); + uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); } } @@ -746,18 +746,18 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link) * of a hash block which is 
neither completely empty nor completely * replicated yet. Returns -1 if no match. */ -static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex) +static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; - mutex_lock( &link->image->lock ); - const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize ); - const uint8_t *cache_map = link->image->cache_map; + mutex_lock( &uplink->image->lock ); + const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); + const uint8_t *cache_map = uplink->image->cache_map; if ( cache_map != NULL ) { int j; const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); for (j = 0; j < mapBytes; ++j) { const int i = ( start + j ) % mapBytes; - const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock ); + const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); const bool isEmpty = cache_map[i] == 0; if ( !isEmpty && !isFull ) { // Neither full nor empty, replicate @@ -785,7 +785,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in retval = -1; } } - mutex_unlock( &link->image->lock ); + mutex_unlock( &uplink->image->lock ); return retval; } @@ -793,41 +793,41 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in * Receive data from uplink server and process/dispatch * Locks on: link.lock, images[].lock */ -static void uplink_handleReceive(dnbd3_connection_t *link) +static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; int ret, i; for (;;) { - ret = dnbd3_read_reply( link->fd, &inReply, false ); - if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue; + ret = dnbd3_read_reply( uplink->fd, &inReply, false ); + if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; if ( 
ret == REPLY_AGAIN ) break; if ( unlikely( ret == REPLY_CLOSED ) ) { - logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path ); + logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path ); goto error_cleanup; } if ( unlikely( ret == REPLY_WRONGMAGIC ) ) { - logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path ); + logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path ); goto error_cleanup; } if ( unlikely( ret != REPLY_OK ) ) { - logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path ); + logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path ); goto error_cleanup; } if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) { - logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path ); + logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path ); goto error_cleanup; } - if ( unlikely( link->recvBufferLen < inReply.size ) ) { - link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536); - link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen ); - if ( link->recvBuffer == NULL ) { + if ( unlikely( uplink->recvBufferLen < inReply.size ) ) { + uplink->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536); + uplink->recvBuffer = realloc( uplink->recvBuffer, uplink->recvBufferLen ); + if ( uplink->recvBuffer == NULL ) { logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" ); exit( 1 ); } } - if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) { - logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path ); + if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) 
) { + logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path ); goto error_cleanup; } // Payload read completely @@ -838,18 +838,18 @@ static void uplink_handleReceive(dnbd3_connection_t *link) const uint64_t start = inReply.handle; const uint64_t end = inReply.handle + inReply.size; totalBytesReceived += inReply.size; - link->bytesReceived += inReply.size; + uplink->bytesReceived += inReply.size; // 1) Write to cache file - if ( unlikely( link->cacheFd == -1 ) ) { - uplink_reopenCacheFd( link, false ); + if ( unlikely( uplink->cacheFd == -1 ) ) { + uplink_reopenCacheFd( uplink, false ); } - if ( likely( link->cacheFd != -1 ) ) { + if ( likely( uplink->cacheFd != -1 ) ) { int err = 0; bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid uint32_t done = 0; ret = 0; while ( done < inReply.size ) { - ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done ); + ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done ); if ( unlikely( ret == -1 ) ) { err = errno; if ( err == EINTR ) continue; @@ -860,26 +860,26 @@ static void uplink_handleReceive(dnbd3_connection_t *link) continue; // Success, retry write } if ( err == EBADF || err == EINVAL || err == EIO ) { - if ( !tryAgain || !uplink_reopenCacheFd( link, true ) ) + if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) ) break; tryAgain = false; continue; // Write handle to image successfully re-opened, try again } - logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err ); + logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err ); break; } if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) { - logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid ); + 
logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid ); break; } done += (uint32_t)ret; } if ( likely( done > 0 ) ) { - image_updateCachemap( link->image, start, start + done, true ); + image_updateCachemap( uplink->image, start, start + done, true ); } if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) { logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.", - link->image->name, (int)link->image->rid, err ); + uplink->image->name, (int)uplink->image->rid, err ); } } // 2) Figure out which clients are interested in it @@ -888,9 +888,9 @@ static void uplink_handleReceive(dnbd3_connection_t *link) // by populating a slot with index greater than the highest matching // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW // where it's fine if the index is greater) - mutex_lock( &link->queueLock ); - for (i = 0; i < link->queueLen; ++i) { - dnbd3_queued_request_t * const req = &link->queue[i]; + mutex_lock( &uplink->queueLock ); + for (i = 0; i < uplink->queueLen; ++i) { + dnbd3_queued_request_t * const req = &uplink->queue[i]; assert( req->status != ULR_PROCESSING ); if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue; assert( req->client != NULL ); @@ -903,8 +903,8 @@ static void uplink_handleReceive(dnbd3_connection_t *link) // from 0, you also need to change the "attach to existing request"-logic in uplink_request() outReply.magic = dnbd3_packet_magic; bool served = false; - for ( i = link->queueLen - 1; i >= 0; --i ) { - dnbd3_queued_request_t * const req = &link->queue[i]; + for ( i = uplink->queueLen - 1; i >= 0; --i ) { + dnbd3_queued_request_t * const req = &uplink->queue[i]; if ( req->status == ULR_PROCESSING ) { size_t bytesSent = 0; assert( req->from >= start && req->to <= end ); @@ -914,14 +914,14 @@ static void uplink_handleReceive(dnbd3_connection_t *link) outReply.size = (uint32_t)( 
req->to - req->from ); iov[0].iov_base = &outReply; iov[0].iov_len = sizeof outReply; - iov[1].iov_base = link->recvBuffer + (req->from - start); + iov[1].iov_base = uplink->recvBuffer + (req->from - start); iov[1].iov_len = outReply.size; fixup_reply( outReply ); req->status = ULR_FREE; req->client = NULL; served = true; mutex_lock( &client->sendMutex ); - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); if ( client->sock != -1 ) { ssize_t sent = writev( client->sock, iov, 2 ); if ( sent > (ssize_t)sizeof outReply ) { @@ -932,66 +932,66 @@ static void uplink_handleReceive(dnbd3_connection_t *link) client->bytesSent += bytesSent; } mutex_unlock( &client->sendMutex ); - mutex_lock( &link->queueLock ); + mutex_lock( &uplink->queueLock ); } - if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--; + if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } - mutex_unlock( &link->queueLock ); + mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG - if ( !served && start != link->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end ); + if ( !served && start != uplink->replicationHandle ) { + logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name, start, end ); } #endif - if ( start == link->replicationHandle ) { + if ( start == uplink->replicationHandle ) { // Was our background replication - link->replicationHandle = REP_NONE; + uplink->replicationHandle = REP_NONE; // Try to remove from fs cache if no client was interested in this data - if ( !served && link->cacheFd != -1 ) { - posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + if ( !served && uplink->cacheFd != -1 ) { + posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); } } if ( served ) { // Was some client -- reset idle counter - link->idleTime = 0; + 
uplink->idleTime = 0; // Re-enable replication if disabled - if ( link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; + if ( uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; } } } - if ( link->replicationHandle == REP_NONE ) { - mutex_lock( &link->queueLock ); - const bool rep = ( link->queueLen == 0 ); - mutex_unlock( &link->queueLock ); - if ( rep ) uplink_sendReplicationRequest( link ); + if ( uplink->replicationHandle == REP_NONE ) { + mutex_lock( &uplink->queueLock ); + const bool rep = ( uplink->queueLen == 0 ); + mutex_unlock( &uplink->queueLock ); + if ( rep ) uplink_sendReplicationRequest( uplink ); } return; // Error handling from failed receive or message parsing error_cleanup: ; - uplink_connectionFailed( link, true ); + uplink_connectionFailed( uplink, true ); } -static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew) +static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { - if ( link->fd == -1 ) + if ( uplink->fd == -1 ) return; - altservers_serverFailed( &link->currentServer ); - mutex_lock( &link->sendMutex ); - close( link->fd ); - link->fd = -1; - mutex_unlock( &link->sendMutex ); - link->replicationHandle = REP_NONE; - if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) { - link->nextReplicationIndex = 0; + altservers_serverFailed( &uplink->currentServer ); + mutex_lock( &uplink->sendMutex ); + close( uplink->fd ); + uplink->fd = -1; + mutex_unlock( &uplink->sendMutex ); + uplink->replicationHandle = REP_NONE; + if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { + uplink->nextReplicationIndex = 0; } if ( !findNew ) return; - mutex_lock( &link->rttLock ); - bool bail = link->rttTestResult == RTT_INPROGRESS || link->betterFd != -1; - mutex_unlock( &link->rttLock ); + 
mutex_lock( &uplink->rttLock ); + bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1; + mutex_unlock( &uplink->rttLock ); if ( bail ) return; - altservers_findUplink( link ); + altservers_findUplink( uplink ); } /** @@ -1008,7 +1008,7 @@ static int uplink_sendKeepalive(const int fd) return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); } -static void uplink_addCrc32(dnbd3_connection_t *uplink) +static void uplink_addCrc32(dnbd3_uplink_t *uplink) { dnbd3_image_t *image = uplink->image; if ( image == NULL || image->virtualFilesize == 0 ) return; @@ -1051,14 +1051,14 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink) * it will be closed first. Otherwise, nothing will happen and true will be returned * immediately. */ -static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force) +static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) { - if ( link->cacheFd != -1 ) { + if ( uplink->cacheFd != -1 ) { if ( !force ) return true; - close( link->cacheFd ); + close( uplink->cacheFd ); } - link->cacheFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 ); - return link->cacheFd != -1; + uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 ); + return uplink->cacheFd != -1; } /** @@ -1066,13 +1066,13 @@ static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force) * Return true on success. * Locks on: imageListLock, image.lock */ -static bool uplink_saveCacheMap(dnbd3_connection_t *link) +static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) { - dnbd3_image_t *image = link->image; + dnbd3_image_t *image = uplink->image; assert( image != NULL ); - if ( link->cacheFd != -1 ) { - if ( fsync( link->cacheFd ) == -1 ) { + if ( uplink->cacheFd != -1 ) { + if ( fsync( uplink->cacheFd ) == -1 ) { // A failing fsync means we have no guarantee that any data // since the last fsync (or open if none) has been saved. 
Apart // from keeping the cache_map from the last successful fsync @@ -1134,9 +1134,9 @@ static bool uplink_saveCacheMap(dnbd3_connection_t *link) return true; } -static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link) +static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { - return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT - && ( _backgroundReplication != BGR_FULL || _bgrMinClients > link->image->users ) ); + return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT + && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) ); } diff --git a/src/server/uplink.h b/src/server/uplink.h index 2b41dfc..4fd41b0 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -10,7 +10,7 @@ uint64_t uplink_getTotalBytesReceived(); bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version); -void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client); +void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); -- cgit v1.2.3-55-g7522 From 1d2295131020688b5a688286ce8c53d6bb7abdb8 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sun, 18 Aug 2019 21:59:26 +0200 Subject: [SERVER] Add struct representing active connection to uplink server --- src/server/altservers.c | 30 +++++++++---------- src/server/globals.h | 14 ++++----- src/server/image.c | 2 +- src/server/integrity.c | 2 +- src/server/uplink.c | 78 ++++++++++++++++++++++++------------------------- 5 files changed, 60 insertions(+), 66 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 1001981..fbe10a8 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -125,14 +125,14 @@ void altservers_findUplink(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; - if ( uplink->fd != -1 && numAltServers <= 1 ) + if ( 
uplink->current.fd != -1 && numAltServers <= 1 ) return; int i; // if betterFd != -1 it means the uplink is supposed to switch to another // server. As this function here is called by the uplink thread, it can // never be that the uplink is supposed to switch, but instead calls // this function. - assert( uplink->betterFd == -1 ); + assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress // XXX As this function is only ever called by the image's uplink thread, @@ -457,9 +457,9 @@ static void *altservers_main(void *data UNUSED) if ( uplink == NULL ) continue; // First, get 4 alt servers - numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 ); + numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 ); // If we're already connected and only got one server anyways, there isn't much to do - if ( numAlts <= 1 && uplink->fd != -1 ) { + if ( numAlts <= 1 && uplink->current.fd != -1 ) { uplink->rttTestResult = RTT_DONTCHANGE; continue; } @@ -475,15 +475,15 @@ static void *altservers_main(void *data UNUSED) } LOG( LOG_DEBUG2, "[%d] Running alt check", itLink ); assert( uplink->rttTestResult == RTT_INPROGRESS ); - if ( uplink->fd != -1 ) { + if ( uplink->current.fd != -1 ) { // Add current server if not already in list found = false; for (itAlt = 0; itAlt < numAlts; ++itAlt) { - if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue; + if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue; found = true; break; } - if ( !found ) servers[numAlts++] = uplink->currentServer; + if ( !found ) servers[numAlts++] = uplink->current.host; } // Test them all int bestSock = -1; @@ -537,7 +537,7 @@ static void *altservers_main(void *data UNUSED) // Measurement done - everything fine so far mutex_lock( &altServersLock ); mutex_lock( &uplink->rttLock ); - const bool isCurrent = 
isSameAddressPort( &servers[itAlt], &uplink->currentServer ); + const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host ); // Penaltize rtt if this was a cycle; this will treat this server with lower priority // in the near future too, so we prevent alternating between two servers that are both // part of a cycle and have the lowest latency. @@ -547,9 +547,9 @@ static void *altservers_main(void *data UNUSED) unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); mutex_unlock( &altServersLock ); // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time - if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; + if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; mutex_unlock( &uplink->rttLock ); - if ( uplink->fd != -1 && isCurrent ) { + if ( uplink->current.fd != -1 && isCurrent ) { // Was measuring current server currentRtt = avg; close( sock ); @@ -574,18 +574,18 @@ static void *altservers_main(void *data UNUSED) close( sock ); } // Done testing all servers. 
See if we should switch - if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { + if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { // yep - if ( currentRtt > 10000000 || uplink->fd == -1 ) { + if ( currentRtt > 10000000 || uplink->current.fd == -1 ) { LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); } else { LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); } sock_setTimeout( bestSock, _uplinkTimeout ); mutex_lock( &uplink->rttLock ); - uplink->betterFd = bestSock; - uplink->betterServer = servers[bestIndex]; - uplink->betterVersion = bestProtocolVersion; + uplink->better.fd = bestSock; + uplink->better.host = servers[bestIndex]; + uplink->better.version = bestProtocolVersion; uplink->rttTestResult = RTT_DOCHANGE; mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); diff --git a/src/server/globals.h b/src/server/globals.h index 0371e33..659e5a2 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -31,9 +31,9 @@ typedef struct } dnbd3_queued_request_t; typedef struct { - int fd; - int version; - dnbd3_host_t host; + int fd; // Socket fd for this connection + int version; // Protocol version of remote server + dnbd3_host_t host; // IP/Port of remote server } dnbd3_server_connection_t; #define RTT_IDLE 0 // Not in progress @@ -43,20 +43,16 @@ typedef struct { #define RTT_NOT_REACHABLE 4 // No uplink was reachable struct _dnbd3_uplink { - int fd; // socket fd to remote server - int version; // remote server protocol version + dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected + dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none dnbd3_signal_t* signal; // used to wake up the process pthread_t thread; // thread holding the connection pthread_mutex_t sendMutex; // For 
locking socket while sending pthread_mutex_t queueLock; // lock for synchronization on request queue etc. dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer - dnbd3_host_t currentServer; // Current server we're connected to pthread_mutex_t rttLock; // When accessing rttTestResult, betterFd or betterServer int rttTestResult; // RTT_* int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD! - int betterVersion; // protocol version of better server - int betterFd; // Active connection to better server, ready to use - dnbd3_host_t betterServer; // The better server uint8_t *recvBuffer; // Buffer for receiving payload uint32_t recvBufferLen; // Len of ^^ atomic_bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop() diff --git a/src/server/image.c b/src/server/image.c index 4a65ed3..d250715 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1508,7 +1508,7 @@ json_t* image_getListAsJson() uplinkName[0] = '\0'; } else { bytesReceived = image->uplink->bytesReceived; - if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) { + if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } } diff --git a/src/server/integrity.c b/src/server/integrity.c index c52d17b..3d1ac9b 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -240,7 +240,7 @@ static void* integrity_main(void * data UNUSED) if ( !foundCorrupted ) { mutex_lock( &image->lock ); if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper? 
- image->working = image->uplink->fd != -1 && image->readFd != -1; + image->working = image->uplink->current.fd != -1 && image->readFd != -1; } mutex_unlock( &image->lock ); } diff --git a/src/server/uplink.c b/src/server/uplink.c index 7d66b21..e21e28c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -97,7 +97,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->idleTime = 0; uplink->queueLen = 0; mutex_lock( &uplink->sendMutex ); - uplink->fd = -1; + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cacheFd = -1; uplink->signal = NULL; @@ -105,12 +105,12 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_lock( &uplink->rttLock ); uplink->cycleDetected = false; if ( sock >= 0 ) { - uplink->betterFd = sock; - uplink->betterServer = *host; + uplink->better.fd = sock; + uplink->better.host = *host; uplink->rttTestResult = RTT_DOCHANGE; - uplink->betterVersion = version; + uplink->better.version = version; } else { - uplink->betterFd = -1; + uplink->better.fd = -1; uplink->rttTestResult = RTT_IDLE; } mutex_unlock( &uplink->rttLock ); @@ -211,7 +211,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) { + if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) { mutex_unlock( &client->image->lock ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." 
); mutex_lock( &uplink->rttLock ); @@ -315,14 +315,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); } else { - if ( uplink->fd == -1 ) { + if ( uplink->current.fd == -1 ) { mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); if ( hops < 200 ) ++hops; - const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); if ( !ret ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); @@ -405,7 +405,7 @@ static void* uplink_mainloop(void *data) mutex_unlock( &uplink->rttLock ); if ( waitTime == 0 ) { // Nothing - } else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { + } else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { waitTime = 1000; } else { declare_now; @@ -413,7 +413,7 @@ static void* uplink_mainloop(void *data) if ( waitTime < 100 ) waitTime = 100; if ( waitTime > 5000 ) waitTime = 5000; } - events[EV_SOCKET].fd = uplink->fd; + events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); if ( _shutdown || uplink->shutdown ) goto cleanup; if ( numSocks == -1 ) { // Error? @@ -430,13 +430,11 @@ static void* uplink_mainloop(void *data) uplink->rttTestResult = RTT_IDLE; // The rttTest worker thread has finished our request. 
// And says it's better to switch to another server - const int fd = uplink->fd; + const int fd = uplink->current.fd; mutex_lock( &uplink->sendMutex ); - uplink->fd = uplink->betterFd; + uplink->current = uplink->better; mutex_unlock( &uplink->sendMutex ); - uplink->betterFd = -1; - uplink->currentServer = uplink->betterServer; - uplink->version = uplink->betterVersion; + uplink->better.fd = -1; uplink->cycleDetected = false; mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; @@ -445,7 +443,7 @@ static void* uplink_mainloop(void *data) uplink->image->working = true; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) { + if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) { logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } @@ -471,7 +469,7 @@ static void* uplink_mainloop(void *data) if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name ); } - if ( uplink->fd != -1 ) { + if ( uplink->current.fd != -1 ) { // Uplink seems fine, relay requests to it... 
uplink_sendRequests( uplink, true ); } else { // No uplink; maybe it was shutdown since it was idle for too long @@ -499,9 +497,9 @@ static void* uplink_mainloop(void *data) uplink_saveCacheMap( uplink ); } // Keep-alive - if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) { + if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( uplink->fd ) ) { + if ( uplink_sendKeepalive( uplink->current.fd ) ) { // Re-trigger periodically, in case it requires a minimum user count uplink_sendReplicationRequest( uplink ); } else { @@ -511,10 +509,10 @@ static void* uplink_mainloop(void *data) } } // Don't keep uplink established if we're idle for too much - if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { + if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { mutex_lock( &uplink->sendMutex ); - close( uplink->fd ); - uplink->fd = events[EV_SOCKET].fd = -1; + close( uplink->current.fd ); + uplink->current.fd = events[EV_SOCKET].fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; if ( uplink->recvBufferLen != 0 ) { @@ -531,7 +529,7 @@ static void* uplink_mainloop(void *data) const int rttTestResult = uplink->rttTestResult; mutex_unlock( &uplink->rttLock ); if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete @@ -556,7 +554,7 @@ static void* uplink_mainloop(void *data) timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? 
altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG - if ( uplink->fd != -1 && !uplink->shutdown ) { + if ( uplink->current.fd != -1 && !uplink->shutdown ) { bool resend = false; ticks deadline; timing_set( &deadline, &now, -10 ); @@ -594,10 +592,10 @@ static void* uplink_mainloop(void *data) uplink->image->uplink = NULL; } mutex_lock( &uplink->queueLock ); - const int fd = uplink->fd; + const int fd = uplink->current.fd; const dnbd3_signal_t* signal = uplink->signal; mutex_lock( &uplink->sendMutex ); - uplink->fd = -1; + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->signal = NULL; // Do not access uplink->image after unlocking, since we set @@ -610,8 +608,8 @@ static void* uplink_mainloop(void *data) // Wait for the RTT check to finish/fail if it's in progress while ( uplink->rttTestResult == RTT_INPROGRESS ) usleep( 10000 ); - if ( uplink->betterFd != -1 ) { - close( uplink->betterFd ); + if ( uplink->better.fd != -1 ) { + close( uplink->better.fd ); } mutex_destroy( &uplink->queueLock ); mutex_destroy( &uplink->rttLock ); @@ -651,14 +649,14 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; mutex_lock( &uplink->sendMutex ); - const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); if ( !ret ) { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. 
logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &uplink->currentServer ); + altservers_serverFailed( &uplink->current.host ); return; } mutex_lock( &uplink->queueLock ); @@ -678,7 +676,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) */ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( uplink == NULL || uplink->fd == -1 ) return; + if ( uplink == NULL || uplink->current.fd == -1 ) return; if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) return; @@ -724,7 +722,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) ); + bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); mutex_unlock( &uplink->sendMutex ); if ( !sendOk ) { logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); @@ -798,7 +796,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) dnbd3_reply_t inReply, outReply; int ret, i; for (;;) { - ret = dnbd3_read_reply( uplink->fd, &inReply, false ); + ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; if ( ret == REPLY_AGAIN ) break; if ( unlikely( ret == REPLY_CLOSED ) ) { @@ -826,7 +824,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) exit( 1 ); } } - if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) { + if ( unlikely( 
(uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) { logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path ); goto error_cleanup; } @@ -973,12 +971,12 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { - if ( uplink->fd == -1 ) + if ( uplink->current.fd == -1 ) return; - altservers_serverFailed( &uplink->currentServer ); + altservers_serverFailed( &uplink->current.host ); mutex_lock( &uplink->sendMutex ); - close( uplink->fd ); - uplink->fd = -1; + close( uplink->current.fd ); + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->replicationHandle = REP_NONE; if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { @@ -987,7 +985,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) if ( !findNew ) return; mutex_lock( &uplink->rttLock ); - bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1; + bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->better.fd != -1; mutex_unlock( &uplink->rttLock ); if ( bail ) return; @@ -1016,7 +1014,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) uint32_t masterCrc; uint32_t *buffer = malloc( bytes ); mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes ); + bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes ); mutex_unlock( &uplink->sendMutex ); if ( !sendOk || bytes == 0 ) { free( buffer ); -- cgit v1.2.3-55-g7522 From 5fb4ef278be86fb6bda487f65ec4855d830bf4e5 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 22 Aug 2019 16:14:27 +0200 Subject: [SERVER] Get rid of alt-servers thread, per-uplink rtt history Alt-Server checks are now run using the threadpool, so we don't need a queue and dedicated thread anymore. 
The rtt history is now kept per uplink, so many uplinks won't overwhelm the history, making its time window very short. Also the fail counter is now split up; a global one for when the server actually isn't reachable, a local (per-uplink) one for when the server is reachable but doesn't serve the requested image. --- src/server/altservers.c | 738 ++++++++++++++++++++++-------------------------- src/server/altservers.h | 16 +- src/server/globals.h | 41 ++- src/server/image.c | 6 +- src/server/net.c | 16 +- src/server/server.c | 8 +- src/server/uplink.c | 117 ++++---- src/server/uplink.h | 2 + src/serverconfig.h | 10 +- 9 files changed, 469 insertions(+), 485 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index fbe10a8..493ed9e 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -1,5 +1,6 @@ #include "altservers.h" #include "locks.h" +#include "threadpool.h" #include "helper.h" #include "image.h" #include "fileutil.h" @@ -14,46 +15,22 @@ #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) -static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS]; -static dnbd3_signal_t * _Atomic runSignal = NULL; - static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; static atomic_int numAltServers = 0; static pthread_mutex_t altServersLock; +static ticks nextCloseUnusedFd; // TODO: Move away -static pthread_t altThread; - -static void *altservers_main(void *data); -static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt); +static void *altservers_runCheck(void *data); +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current); +static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink); +static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt); +static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server); void altservers_init() { srand( (unsigned int)time( NULL ) ); - // Init spinlock + // Init lock mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST ); - // Init signal - runSignal = signal_new(); - if ( runSignal == NULL ) { - logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." 
); - exit( EXIT_FAILURE ); - } - memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) ); - if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) { - logadd( LOG_ERROR, "Could not start altservers connector thread" ); - exit( EXIT_FAILURE ); - } - // Init waiting links queue -- this is currently a global static array so - // it will already be zero, but in case we refactor later do it explicitly - for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - pending[i] = NULL; - } -} - -void altservers_shutdown() -{ - if ( runSignal == NULL ) return; - signal_call( runSignal ); // Wake altservers thread up - thread_join( altThread, NULL ); } static void addalt(int argc, char **argv, void *data) @@ -121,7 +98,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate /** * ONLY called from the passed uplink's main thread */ -void altservers_findUplink(dnbd3_uplink_t *uplink) +void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) { if ( uplink->shutdown ) return; @@ -135,67 +112,11 @@ void altservers_findUplink(dnbd3_uplink_t *uplink) assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress - // XXX As this function is only ever called by the image's uplink thread, - // it cannot happen that the uplink ends up in this list concurrently mutex_lock( &uplink->rttLock ); - if ( uplink->rttTestResult == RTT_INPROGRESS ) { - for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] != uplink ) continue; - // Yep, measuring right now - return; - } - } - // Find free slot for measurement - uplink->rttTestResult = RTT_INPROGRESS; - for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] != NULL ) continue; - dnbd3_uplink_t *null = NULL; - if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) { - mutex_unlock( &uplink->rttLock ); - atomic_thread_fence( 
memory_order_release ); - signal_call( runSignal ); // Wake altservers thread up - return; - } + if ( uplink->rttTestResult != RTT_INPROGRESS ) { + threadpool_run( &altservers_runCheck, uplink ); } - // End of loop - no free slot - uplink->rttTestResult = RTT_NOT_REACHABLE; mutex_unlock( &uplink->rttLock ); - logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." ); -} - -/** - * The given uplink is about to disappear, - * wait until any pending RTT check is done. - */ -void altservers_removeUplink(dnbd3_uplink_t *uplink) -{ - assert( uplink != NULL ); - assert( uplink->shutdown ); - int i; - for ( i = 1 ;; ++i ) { - atomic_thread_fence( memory_order_acquire ); - if ( runSignal == NULL ) { - // Thread is already done, remove manually - uplink->rttTestResult = RTT_NOT_REACHABLE; - break; - } - // Thread still running, wait until test is done - bool found = false; - for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { - if ( pending[i] == uplink ) { - found = true; - break; - } - } - if ( !found ) // No more test running - break; - usleep( 10000 ); // 10ms - signal_call( runSignal ); // Wake altservers thread up - if ( i % 500 == 0 ) { - logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink ); - } - } - logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i ); } /** @@ -209,90 +130,124 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0; int i, j; int count = 0; - int scores[size]; - int score; - mutex_lock( &altServersLock ); + uint16_t scores[SERVER_MAX_ALTS] = { 0 }; if ( size > numAltServers ) size = numAltServers; - for (i = 0; i < numAltServers; ++i) { - if ( altServers[i].host.type == 0 ) continue; // Slot is empty - if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers + mutex_lock( 
&altServersLock ); + for ( i = 0; i < numAltServers; ++i ) { + if ( altServers[i].host.type == 0 || altServers[i].isPrivate ) + continue; // Slot is empty or uplink is for replication only if ( host->type == altServers[i].host.type ) { - score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails; + scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host ); } else { - score = -( altServers[i].numFails + 128 ); // Wrong address family + scores[i] = 1; // Wrong address family } - if ( count == 0 ) { - // Trivial - this is the first entry - output[0].host = altServers[i].host; - output[0].failures = 0; - scores[0] = score; - count++; - } else { - // Other entries already exist, insert in proper position - for (j = 0; j < size; ++j) { - if ( j < count && score <= scores[j] ) continue; - if ( j > count ) break; // Should never happen but just in case... - if ( j < count && j + 1 < size ) { - // Check if we're in the middle and need to move other entries... - memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) ); - memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) ); - } - if ( count < size ) { - count++; - } - output[j].host = altServers[i].host; - output[j].failures = 0; - scores[j] = score; - break; + } + while ( count < size ) { + i = -1; + for ( j = 0; j < numAltServers; ++j ) { + if ( scores[j] == 0 ) + continue; + if ( i == -1 || scores[j] > scores[i] ) { + i = j; } } + if ( i == -1 ) + break; + output[count].host = altServers[i].host; + output[count].failures = 0; + count++; } mutex_unlock( &altServersLock ); return count; } +bool altservers_toString(int server, char *buffer, size_t len) +{ + return host_to_string( &altServers[server].host, buffer, len ); +} + +static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now ) +{ + dnbd3_alt_local_t *local = ( uplink == NULL ? 
NULL : &uplink->altData[server] ); + dnbd3_alt_server_t *global = &altServers[server]; + if ( global->isClientOnly || ( !global->isPrivate && _proxyPrivateOnly ) ) + return false; + // Blocked locally (image not found on server...) + if ( local != NULL && local->blocked ) { + if ( --local->fails > 0 ) + return false; + local->blocked = false; + } + if ( global->blocked ) { + if ( timing_diff( &global->lastFail, now ) < SERVER_GLOBAL_DUP_TIME ) + return false; + global->lastFail = *now; + if ( --global->fails > 0 ) + return false; + global->blocked = false; + } + // Not blocked, depend on both fail counters + int fails = ( local == NULL ? 0 : local->fails ) + global->fails; + return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN; +} + +int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) +{ + int idx[size]; + int num = altservers_getListForUplink( NULL, idx, size, -1 ); + for ( int i = 0; i < num; ++i ) { + servers[i] = altServers[i].host; + } + return num; +} + /** * Get alt servers. If there are more alt servers than * requested, random servers will be picked. 
* This function is suited for finding uplink servers as * it includes private servers and ignores any "client only" servers + * @param current index of server for current connection, or -1 in panic mode */ -int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency) +static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current) { - if ( size <= 0 ) return 0; - int count = 0, i; - ticks now; - timing_get( &now ); + if ( size <= 0 ) + return 0; + int count = 0; + declare_now; mutex_lock( &altServersLock ); - // Flip first server in list with a random one every time this is called - if ( numAltServers > 1 ) { - const dnbd3_alt_server_t tmp = altServers[0]; - do { - i = rand() % numAltServers; - } while ( i == 0 ); - altServers[0] = altServers[i]; - altServers[i] = tmp; - } - // We iterate over the list twice. First run adds servers with 0 failures only, - // second one also considers those that failed (not too many times) - if ( size > numAltServers ) size = numAltServers; - for (i = 0; i < numAltServers * 2; ++i) { - dnbd3_alt_server_t *srv = &altServers[i % numAltServers]; - if ( srv->host.type == 0 ) continue; // Slot is empty - if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore! - if ( srv->isClientOnly ) continue; - bool first = ( i < numAltServers ); - if ( first ) { - if ( srv->numFails > 0 ) continue; - } else { - if ( srv->numFails == 0 ) continue; // Already added in first iteration - if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row - && timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore! 
- if ( !emergency ) srv->numFails--; + // If we don't have enough servers to randomize, take a shortcut + if ( numAltServers <= size ) { + for ( int i = 0; i < numAltServers; ++i ) { + if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) { + servers[count++] = i; + } + } + } else { + // Plenty of alt servers; randomize + uint8_t state[SERVER_MAX_ALTS] = { 0 }; + if ( current != -1 ) { // Make sure we also test the current server + servers[count++] = current; + state[current] = 2; + } + for ( int tr = size * 10; tr > 0 && count < size; --tr ) { + int idx = rand() % numAltServers; + if ( state[idx] != 0 ) + continue; + if ( isUsableForUplink( uplink, idx, &now ) ) { + servers[count++] = idx; + state[idx] = 2; // Used + } else { + state[idx] = 1; // Potential + } + } + // If panic mode, consider others too + for ( int tr = size * 10; current == -1 && tr > 0 && count < size; --tr ) { + int idx = rand() % numAltServers; + if ( state[idx] == 2 ) + continue; + servers[count++] = idx; + state[idx] = 2; // Used } - // server seems ok, include in output and decrease its fail counter - output[count++] = srv->host; - if ( count >= size ) break; } mutex_unlock( &altServersLock ); return count; @@ -320,7 +275,7 @@ json_t* altservers_toJson() "rtt", rtts, "isPrivate", (int)src[i].isPrivate, "isClientOnly", (int)src[i].isClientOnly, - "numFails", src[i].numFails + "numFails", src[i].fails ); json_array_append_new( list, server ); } @@ -329,32 +284,27 @@ json_t* altservers_toJson() /** * Update rtt history of given server - returns the new average for that server. - * XXX HOLD altServersLock WHEN CALLING THIS! 
*/ -static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt) +static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt) { - unsigned int avg = rtt; - int i; - for (i = 0; i < numAltServers; ++i) { - if ( !isSameAddressPort( host, &altServers[i].host ) ) continue; - altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt; -#if SERVER_RTT_PROBES == 5 - avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2] - + altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES; -#else -#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES - avg = 0; - for (int j = 0; j < SERVER_RTT_PROBES; ++j) { - avg += altServers[i].rtt[j]; + uint32_t avg = 0, j; + dnbd3_alt_local_t *local = &uplink->altData[index]; + mutex_lock( &altServersLock ); + if ( likely( local->initDone ) ) { + local->rtt[++local->rttIndex % SERVER_RTT_PROBES] = rtt; + for ( j = 0; j < SERVER_RTT_PROBES; ++j ) { + avg += local->rtt[j]; } avg /= SERVER_RTT_PROBES; -#endif - // If we got a new rtt value, server must be working - if ( altServers[i].numFails > 0 ) { - altServers[i].numFails--; + } else { // First rtt measurement -- copy to every slot + for ( j = 0; j < SERVER_RTT_PROBES; ++j ) { + local->rtt[j] = rtt; } - break; + avg = rtt; + local->initDone = true; } + altServers[index].rtt[++altServers[index].rttIndex % SERVER_RTT_PROBES] = avg; + mutex_unlock( &altServersLock ); return avg; } @@ -383,40 +333,33 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2) * track of how often servers fail, and consider them disabled for some time if they * fail too many times. 
*/ -void altservers_serverFailed(const dnbd3_host_t * const host) +void altservers_serverFailed(int server) { - int i; - int foundIndex = -1, lastOk = -1; - ticks now; - timing_get( &now ); + declare_now; mutex_lock( &altServersLock ); - for (i = 0; i < numAltServers; ++i) { - if ( foundIndex == -1 ) { - // Looking for the failed server in list - if ( isSameAddressPort( host, &altServers[i].host ) ) { - foundIndex = i; - } - } else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) { - lastOk = i; + if ( timing_diff( &altServers[server].lastFail, &now ) > SERVER_GLOBAL_DUP_TIME ) { + altServers[server].lastFail = now; + if ( altServers[server].fails++ >= SERVER_BAD_UPLINK_MAX ) { + altServers[server].blocked = true; } } - // Do only increase counter if last fail was not too recent. This is - // to prevent the counter from increasing rapidly if many images use the - // same uplink. If there's a network hickup, all uplinks will call this - // function and would increase the counter too quickly, disabling the server. - if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) { - altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE; - altServers[foundIndex].lastFail = now; - if ( lastOk != -1 ) { - // Make sure non-working servers are put at the end of the list, so they're less likely - // to get picked when testing servers for uplink connections. - const dnbd3_alt_server_t tmp = altServers[foundIndex]; - altServers[foundIndex] = altServers[lastOk]; - altServers[lastOk] = tmp; - } + mutex_unlock( &altServersLock ); +} + +/** + * Called from RTT checker if connecting to a server succeeded but + * subsequently selecting the given image failed. Handle this within + * the uplink and don't increase the global fail counter. 
+ */ +static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server) +{ + mutex_lock( &altServersLock ); + if ( uplink->altData[server].fails++ >= SERVER_BAD_UPLINK_MAX ) { + uplink->altData[server].blocked = true; } mutex_unlock( &altServersLock ); } + /** * Mainloop of this module. It will wait for requests by uplinks to find a * suitable uplink server for them. If found, it will tell the uplink about @@ -425,206 +368,213 @@ void altservers_serverFailed(const dnbd3_host_t * const host) * will update quite quickly. Needs to be improved some time, ie. by only * updating the rtt if the last update was at least X seconds ago. */ -static void *altservers_main(void *data UNUSED) +static void *altservers_runCheck(void *data) +{ + dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; + + assert( uplink != NULL ); + setThreadName( "altserver-check" ); + altservers_findUplinkInternal( uplink ); + // Save cache maps of all images if applicable + // TODO: Has nothing to do with alt servers really, maybe move somewhere else? 
+ declare_now; + if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) { + timing_gets( &nextCloseUnusedFd, 900 ); + image_closeUnusedFd(); + } + return NULL; +} + +void altservers_findUplink(dnbd3_uplink_t *uplink) +{ + altservers_findUplinkInternal( uplink ); + while ( uplink->rttTestResult == RTT_INPROGRESS ) { + usleep( 5000 ); + } +} + +int altservers_hostToIndex(dnbd3_host_t *host) +{ + for ( int i = 0; i < numAltServers; ++i ) { + if ( isSameAddressPort( host, &altServers[i].host ) ) + return i; + } + return -1; +} + +const dnbd3_host_t* altservers_indexToHost(int server) +{ + return &altServers[server].host; +} + +// XXX Sync call above must block until async worker has finished XXX +static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) { const int ALTS = 4; - int ret, itLink, itAlt, numAlts; - bool found; - char buffer[DNBD3_BLOCK_SIZE ]; - dnbd3_reply_t reply; - dnbd3_host_t servers[ALTS + 1]; - serialized_buffer_t serialized; + int ret, itAlt, numAlts, current; + bool panic; + int servers[ALTS + 1]; struct timespec start, end; - ticks nextCloseUnusedFd; - setThreadName( "altserver-check" ); - blockNoncriticalSignals(); - timing_gets( &nextCloseUnusedFd, 900 ); - // LOOP - while ( !_shutdown ) { - // Wait 5 seconds max. - ret = signal_wait( runSignal, 5000 ); - if ( _shutdown ) goto cleanup; - if ( ret == SIGNAL_ERROR ) { - if ( errno == EAGAIN || errno == EINTR ) continue; - logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! 
Things will break!", errno ); - usleep( 100000 ); + if ( _shutdown ) + return; + mutex_lock( &uplink->rttLock ); + // Maybe we already have a result, or check is currently running + if ( uplink->better.fd != -1 || uplink->rttTestResult == RTT_INPROGRESS ) { + mutex_unlock( &uplink->rttLock ); + return; + } + assert( uplink->rttTestResult != RTT_DOCHANGE ); + uplink->rttTestResult = RTT_INPROGRESS; + panic = ( uplink->current.fd == -1 ); + current = uplink->current.index; // Current server index (or last one in panic mode) + mutex_unlock( &uplink->rttLock ); + // First, get 4 alt servers + numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current ); + // If we're already connected and only got one server anyways, there isn't much to do + if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) { + uplink->rttTestResult = RTT_DONTCHANGE; + return; + } + dnbd3_image_t * const image = image_lock( uplink->image ); + if ( image == NULL ) { // Check again after locking + uplink->rttTestResult = RTT_NOT_REACHABLE; + logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); + return; + } + LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid ); + assert( uplink->rttTestResult == RTT_INPROGRESS ); + // Test them all + dnbd3_server_connection_t best = { .fd = -1 }; + unsigned long bestRtt = RTT_UNREACHABLE; + unsigned long currentRtt = RTT_UNREACHABLE; + for (itAlt = 0; itAlt < numAlts; ++itAlt) { + int server = servers[itAlt]; + // Connect + clock_gettime( BEST_CLOCK_SOURCE, &start ); + int sock = sock_connect( &altServers[server].host, 750, 1000 ); + if ( sock == -1 ) { // Connection failed means global error + altservers_serverFailed( server ); + continue; } - // Work your way through the queue - atomic_thread_fence( memory_order_acquire ); - for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) { - dnbd3_uplink_t * const uplink = pending[itLink]; - if ( uplink == NULL ) - continue; - // First, 
get 4 alt servers - numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 ); - // If we're already connected and only got one server anyways, there isn't much to do - if ( numAlts <= 1 && uplink->current.fd != -1 ) { - uplink->rttTestResult = RTT_DONTCHANGE; - continue; - } - dnbd3_image_t * const image = image_lock( uplink->image ); - if ( image == NULL ) { // Check again after locking - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_NOT_REACHABLE; - assert( pending[itLink] == uplink ); - pending[itLink] = NULL; - mutex_unlock( &uplink->rttLock ); - logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); - continue; - } - LOG( LOG_DEBUG2, "[%d] Running alt check", itLink ); - assert( uplink->rttTestResult == RTT_INPROGRESS ); - if ( uplink->current.fd != -1 ) { - // Add current server if not already in list - found = false; - for (itAlt = 0; itAlt < numAlts; ++itAlt) { - if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue; - found = true; - break; - } - if ( !found ) servers[numAlts++] = uplink->current.host; - } - // Test them all - int bestSock = -1; - int bestIndex = -1; - int bestProtocolVersion = -1; - unsigned long bestRtt = RTT_UNREACHABLE; - unsigned long currentRtt = RTT_UNREACHABLE; - for (itAlt = 0; itAlt < numAlts; ++itAlt) { - usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...) 
- // Connect - clock_gettime( BEST_CLOCK_SOURCE, &start ); - int sock = sock_connect( &servers[itAlt], 750, 1000 ); - if ( sock < 0 ) continue; - // Select image ++++++++++++++++++++++++++++++ - if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) { - goto server_failed; - } - // See if selecting the image succeeded ++++++++++++++++++++++++++++++ - uint16_t protocolVersion, rid; - uint64_t imageSize; - char *name; - if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) { - goto server_image_not_available; - } - if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed; - if ( name == NULL || strcmp( name, image->name ) != 0 ) { - ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name ); - } - if ( rid != image->rid ) { - ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid ); - } - if ( imageSize != image->virtualFilesize ) { - ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize ); - } - // Request first block (NOT random!) 
++++++++++++++++++++++++++++++ - if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) { - LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink ); - } - // See if requesting the block succeeded ++++++++++++++++++++++ - if ( !dnbd3_get_reply( sock, &reply ) ) { - LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink ); - } - // check reply header - if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) { - ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size ); - } - if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) { - ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink ); - } - clock_gettime( BEST_CLOCK_SOURCE, &end ); - // Measurement done - everything fine so far - mutex_lock( &altServersLock ); - mutex_lock( &uplink->rttLock ); - const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host ); - // Penaltize rtt if this was a cycle; this will treat this server with lower priority - // in the near future too, so we prevent alternating between two servers that are both - // part of a cycle and have the lowest latency. - const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000 - + (end.tv_nsec - start.tv_nsec) / 1000 - + ( (isCurrent && uplink->cycleDetected) ? 
1000000 : 0 )); // µs - unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); - mutex_unlock( &altServersLock ); - // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time - if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000; - mutex_unlock( &uplink->rttLock ); - if ( uplink->current.fd != -1 && isCurrent ) { - // Was measuring current server - currentRtt = avg; - close( sock ); - } else if ( avg < bestRtt ) { - // Was another server, update "best" - if ( bestSock != -1 ) close( bestSock ); - bestSock = sock; - bestRtt = avg; - bestIndex = itAlt; - bestProtocolVersion = protocolVersion; - } else { - // Was too slow, ignore - close( sock ); - } - // We're done, call continue - continue; - // Jump here if anything went wrong - // This will cleanup and continue - server_failed: ; - altservers_serverFailed( &servers[itAlt] ); - server_image_not_available: ; - close( sock ); - } - // Done testing all servers. 
See if we should switch - if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { - // yep - if ( currentRtt > 10000000 || uplink->current.fd == -1 ) { - LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); - } else { - LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); - } - sock_setTimeout( bestSock, _uplinkTimeout ); - mutex_lock( &uplink->rttLock ); - uplink->better.fd = bestSock; - uplink->better.host = servers[bestIndex]; - uplink->better.version = bestProtocolVersion; - uplink->rttTestResult = RTT_DOCHANGE; - mutex_unlock( &uplink->rttLock ); - signal_call( uplink->signal ); - } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) { - // No server was reachable - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_NOT_REACHABLE; - mutex_unlock( &uplink->rttLock ); - } else { - // nope - if ( bestSock != -1 ) close( bestSock ); - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_DONTCHANGE; - uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away - mutex_unlock( &uplink->rttLock ); - if ( !image->working ) { - image->working = true; - LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink ); - } - } - image_release( image ); - // end of loop over all pending uplinks - assert( pending[itLink] == uplink ); - pending[itLink] = NULL; - atomic_thread_fence( memory_order_release ); + // Select image ++++++++++++++++++++++++++++++ + if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) { + goto image_failed; } - // Save cache maps of all images if applicable - declare_now; - // TODO: Has nothing to do with alt servers really, maybe move somewhere else? 
- if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) { - timing_gets( &nextCloseUnusedFd, 900 ); - image_closeUnusedFd(); + // See if selecting the image succeeded ++++++++++++++++++++++++++++++ + uint16_t protocolVersion, rid; + uint64_t imageSize; + char *name; + serialized_buffer_t serialized; + if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) { + goto image_failed; } + if ( protocolVersion < MIN_SUPPORTED_SERVER ) { // Server version unsupported; global fail + goto server_failed; + } + if ( name == NULL || strcmp( name, image->name ) != 0 ) { + ERROR_GOTO( image_failed, "[RTT] Server offers image '%s' instead of '%s'", name, image->name ); + } + if ( rid != image->rid ) { + ERROR_GOTO( image_failed, "[RTT] Server provides rid %d instead of %d", (int)rid, (int)image->rid ); + } + if ( imageSize != image->virtualFilesize ) { + ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize ); + } + // Request first block (NOT random!) 
++++++++++++++++++++++++++++++ + if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) { + LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server ); + } + // See if requesting the block succeeded ++++++++++++++++++++++ + dnbd3_reply_t reply; + if ( !dnbd3_get_reply( sock, &reply ) ) { + LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server ); + } + // check reply header + if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) { + // Sanity check failed; count this as global error (malicious/broken server) + ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size ); + } + // flush payload to include this into measurement + char buffer[DNBD3_BLOCK_SIZE]; + if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) { + ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server ); + } + clock_gettime( BEST_CLOCK_SOURCE, &end ); + // Measurement done - everything fine so far + mutex_lock( &uplink->rttLock ); + const bool isCurrent = ( uplink->current.index == server ); + mutex_unlock( &uplink->rttLock ); + // Penaltize rtt if this was a cycle; this will treat this server with lower priority + // in the near future too, so we prevent alternating between two servers that are both + // part of a cycle and have the lowest latency. 
+ uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_nsec - start.tv_nsec) / 1000); // µs + uint32_t avg = altservers_updateRtt( uplink, server, rtt ); + // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time + if ( ( uplink->cycleDetected || panic ) && isCurrent ) { + avg = (avg * 2) + 50000; + } + if ( !panic && isCurrent ) { + // Was measuring current server + currentRtt = avg; + close( sock ); + } else if ( avg < bestRtt ) { + // Was another server, update "best" + if ( best.fd != -1 ) { + close( best.fd ); + } + best.fd = sock; + bestRtt = avg; + best.index = server; + best.version = protocolVersion; + } else { + // Was too slow, ignore + close( sock ); + } + // We're done, call continue + continue; + // Jump here if anything went wrong + // This will cleanup and continue +image_failed: + altservers_imageFailed( uplink, server ); + goto failed; +server_failed: + altservers_serverFailed( server ); +failed: + close( sock ); } - cleanup: ; - if ( runSignal != NULL ) { - signal_close( runSignal ); + // Done testing all servers. 
See if we should switch + if ( best.fd != -1 && (panic || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { + // yep + if ( currentRtt > 10000000 || panic ) { + LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt ); + } else { + LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt ); + } + sock_setTimeout( best.fd, _uplinkTimeout ); + mutex_lock( &uplink->rttLock ); + uplink->better = best; + uplink->rttTestResult = RTT_DOCHANGE; + mutex_unlock( &uplink->rttLock ); + signal_call( uplink->signal ); + } else if ( best.fd == -1 && currentRtt == RTT_UNREACHABLE ) { + // No server was reachable, including current + uplink->rttTestResult = RTT_NOT_REACHABLE; + } else { + // nope + if ( best.fd != -1 ) { + close( best.fd ); + } + if ( !image->working || uplink->cycleDetected ) { + image->working = true; + LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid ); + } + uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away + mutex_lock( &uplink->rttLock ); + uplink->rttTestResult = RTT_DONTCHANGE; + mutex_unlock( &uplink->rttLock ); } - runSignal = NULL; - return NULL ; + image_release( image ); } diff --git a/src/server/altservers.h b/src/server/altservers.h index e03b900..8e2b964 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -7,23 +7,27 @@ struct json_t; void altservers_init(); -void altservers_shutdown(); - int altservers_load(); bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly); -void altservers_findUplink(dnbd3_uplink_t *uplink); +void altservers_findUplinkAsync(dnbd3_uplink_t *uplink); -void altservers_removeUplink(dnbd3_uplink_t *uplink); +void altservers_findUplink(dnbd3_uplink_t *uplink); int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size); -int altservers_getListForUplink(dnbd3_host_t 
*output, int size, int emergency); +int altservers_getHostListForReplication(dnbd3_host_t *servers, int size); + +bool altservers_toString(int server, char *buffer, size_t len); int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2); -void altservers_serverFailed(const dnbd3_host_t * const host); +void altservers_serverFailed(int server); + +int altservers_hostToIndex(dnbd3_host_t *host); + +const dnbd3_host_t* altservers_indexToHost(int server); struct json_t* altservers_toJson(); diff --git a/src/server/globals.h b/src/server/globals.h index 659e5a2..4d97c6b 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -30,10 +30,31 @@ typedef struct uint8_t hopCount; // How many hops this request has already taken across proxies } dnbd3_queued_request_t; +typedef struct +{ + int fails; // Hard fail: Connection failed + int rttIndex; + uint32_t rtt[SERVER_RTT_PROBES]; + bool isPrivate, isClientOnly; + bool blocked; // If true count down fails until 0 to enable again + ticks lastFail; // Last hard fail + dnbd3_host_t host; + char comment[COMMENT_LENGTH]; +} dnbd3_alt_server_t; + +typedef struct +{ + int fails; // Soft fail: Image not found + int rttIndex; + uint32_t rtt[SERVER_RTT_PROBES]; + bool blocked; // True if server is to be ignored and fails should be counted down + bool initDone; +} dnbd3_alt_local_t; + typedef struct { - int fd; // Socket fd for this connection - int version; // Protocol version of remote server - dnbd3_host_t host; // IP/Port of remote server + int fd; // Socket fd for this connection + int version; // Protocol version of remote server + int index; // Entry in uplinks list } dnbd3_server_connection_t; #define RTT_IDLE 0 // Not in progress @@ -51,7 +72,7 @@ struct _dnbd3_uplink pthread_mutex_t queueLock; // lock for synchronization on request queue etc. 
dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer pthread_mutex_t rttLock; // When accessing rttTestResult, betterFd or betterServer - int rttTestResult; // RTT_* + atomic_int rttTestResult; // RTT_* int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD! uint8_t *recvBuffer; // Buffer for receiving payload uint32_t recvBufferLen; // Len of ^^ @@ -65,19 +86,9 @@ struct _dnbd3_uplink atomic_int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; + dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; }; -typedef struct -{ - char comment[COMMENT_LENGTH]; - dnbd3_host_t host; - unsigned int rtt[SERVER_RTT_PROBES]; - unsigned int rttIndex; - bool isPrivate, isClientOnly; - ticks lastFail; - int numFails; -} dnbd3_alt_server_t; - typedef struct { uint8_t host[16]; diff --git a/src/server/image.c b/src/server/image.c index d250715..1a6e0f8 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1178,7 +1178,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, dnbd3_host_t servers[REP_NUM_SRV]; int uplinkSock = -1; dnbd3_host_t uplinkServer; - const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false ); + const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV ); uint16_t remoteProtocolVersion; uint16_t remoteRid = revision; uint64_t remoteImageSize; @@ -1491,7 +1491,7 @@ json_t* image_getListAsJson() json_t *imagesJson = json_array(); json_t *jsonImage; int i; - char uplinkName[100] = { 0 }; + char uplinkName[100]; uint64_t bytesReceived; int completeness, idleTime; declare_now; @@ -1508,7 +1508,7 @@ json_t* image_getListAsJson() uplinkName[0] = '\0'; } else { bytesReceived = image->uplink->bytesReceived; - if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, 
uplinkName, sizeof(uplinkName) ) ) { + if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } } diff --git a/src/server/net.c b/src/server/net.c index 7f3c1ce..4976eea 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -669,11 +669,19 @@ static void removeFromList(dnbd3_client_t *client) { int i; mutex_lock( &_clients_lock ); - for ( i = _num_clients - 1; i >= 0; --i ) { - if ( _clients[i] == client ) { - _clients[i] = NULL; + if ( _num_clients != 0 ) { + for ( i = _num_clients - 1; i >= 0; --i ) { + if ( _clients[i] == client ) { + _clients[i] = NULL; + break; + } + } + if ( i != 0 && i + 1 == _num_clients ) { + do { + i--; + } while ( _clients[i] == NULL && i > 0 ); + _num_clients = i + 1; } - if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients; } mutex_unlock( &_clients_lock ); } diff --git a/src/server/server.c b/src/server/server.c index 838aec2..640048a 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -121,9 +121,6 @@ void dnbd3_cleanup() // Disable threadpool threadpool_close(); - // Terminate the altserver checking thread - altservers_shutdown(); - // Terminate all uplinks image_killUplinks(); @@ -198,6 +195,11 @@ int main(int argc, char *argv[]) case LONGOPT_CRC4: return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE; case LONGOPT_ASSERT: + printf( "Testing use after free:\n" ); + volatile char * volatile test = malloc( 10 ); + test[0] = 1; + free( test ); + test[1] = 2; printf( "Testing a failing assertion:\n" ); assert( 4 == 5 ); printf( "Assertion 4 == 5 seems to hold. 
;-)\n" ); diff --git a/src/server/uplink.c b/src/server/uplink.c index e21e28c..6c85580 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -96,17 +96,18 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->bytesReceived = 0; uplink->idleTime = 0; uplink->queueLen = 0; - mutex_lock( &uplink->sendMutex ); - uplink->current.fd = -1; - mutex_unlock( &uplink->sendMutex ); uplink->cacheFd = -1; uplink->signal = NULL; uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); + mutex_lock( &uplink->sendMutex ); + uplink->current.fd = -1; + mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; - if ( sock >= 0 ) { + if ( sock != -1 ) { uplink->better.fd = sock; - uplink->better.host = *host; + int index = altservers_hostToIndex( host ); + uplink->better.index = index == -1 ? 0 : index; // Prevent invalid array access uplink->rttTestResult = RTT_DOCHANGE; uplink->better.version = version; } else { @@ -116,7 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_unlock( &uplink->rttLock ); uplink->recvBufferLen = 0; uplink->shutdown = false; - if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)link ) ) { + if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) { logadd( LOG_ERROR, "Could not start thread for new uplink." ); goto failure; } @@ -148,8 +149,8 @@ void uplink_shutdown(dnbd3_image_t *image) } dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); - if ( !uplink->shutdown ) { - uplink->shutdown = true; + bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { signal_call( uplink->signal ); thread = uplink->thread; join = true; @@ -211,13 +212,11 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. 
If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) { - mutex_unlock( &client->image->lock ); - logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); - mutex_lock( &uplink->rttLock ); + if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; - mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); + mutex_unlock( &client->image->lock ); + logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); return false; } @@ -256,12 +255,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } } if ( unlikely( requestLoop ) ) { - mutex_unlock( &uplink->queueLock ); - logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - mutex_lock( &uplink->rttLock ); uplink->cycleDetected = true; - mutex_unlock( &uplink->rttLock ); signal_call( uplink->signal ); + mutex_unlock( &uplink->queueLock ); + logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. 
Incoming hop-count is %" PRIu8 ".", hops ); return false; } if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { @@ -311,6 +308,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( foundExisting != -1 ) return true; // Attached to pending request, do nothing + usleep( 10000 ); + // See if we can fire away the request if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); @@ -342,7 +341,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( state == -1 ) { logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" ); } else if ( state == ULR_NEW ) { - logadd( LOG_DEBUG2, "Succesful direct uplink request" ); + //logadd( LOG_DEBUG2, "Direct uplink request" ); } else { logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); } @@ -352,10 +351,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } } - if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed - if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { - logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); - } + if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { + logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } return true; } @@ -443,7 +440,7 @@ static void* uplink_mainloop(void *data) uplink->image->working = true; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; - if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) { + if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) { logadd( LOG_DEBUG1, "(Uplink %s) Now connected 
to %s\n", uplink->image->name, buffer + 1 ); setThreadName( buffer ); } @@ -525,9 +522,7 @@ static void* uplink_mainloop(void *data) } } // See if we should trigger an RTT measurement - mutex_lock( &uplink->rttLock ); - const int rttTestResult = uplink->rttTestResult; - mutex_unlock( &uplink->rttLock ); + int rttTestResult = uplink->rttTestResult; if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check @@ -538,7 +533,7 @@ static void* uplink_mainloop(void *data) goto cleanup; } else if ( !uplink_connectionShouldShutdown( uplink ) ) { // Not complete - do measurement - altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous) + altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous) if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = 0; } @@ -547,11 +542,9 @@ static void* uplink_mainloop(void *data) timing_set( &nextAltCheck, &now, altCheckInterval ); } } else if ( rttTestResult == RTT_NOT_REACHABLE ) { - mutex_lock( &uplink->rttLock ); - uplink->rttTestResult = RTT_IDLE; - mutex_unlock( &uplink->rttLock ); + atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ); discoverFailCount++; - timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); + timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG if ( uplink->current.fd != -1 && !uplink->shutdown ) { @@ -581,36 +574,38 @@ static void* uplink_mainloop(void *data) #endif } cleanup: ; - if ( !uplink->shutdown ) { - uplink->shutdown = true; + // Detach depends on whether someone is joining this thread... 
+ bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { thread_detach( uplink->thread ); } - altservers_removeUplink( uplink ); uplink_saveCacheMap( uplink ); - mutex_lock( &uplink->image->lock ); - if ( uplink->image->uplink == uplink ) { - uplink->image->uplink = NULL; + dnbd3_image_t *image = uplink->image; + mutex_lock( &image->lock ); + // in the list anymore, but we want to prevent it from being freed in either case + if ( image->uplink == uplink ) { + image->uplink = NULL; } + mutex_unlock( &image->lock ); // Do NOT use image without locking it mutex_lock( &uplink->queueLock ); - const int fd = uplink->current.fd; - const dnbd3_signal_t* signal = uplink->signal; - mutex_lock( &uplink->sendMutex ); - uplink->current.fd = -1; - mutex_unlock( &uplink->sendMutex ); - uplink->signal = NULL; - // Do not access uplink->image after unlocking, since we set - // image->uplink to NULL. Acquire with image_lock first, - // like done below when checking whether to re-init uplink - mutex_unlock( &uplink->image->lock ); - mutex_unlock( &uplink->queueLock ); - if ( fd != -1 ) close( fd ); - if ( signal != NULL ) signal_close( signal ); - // Wait for the RTT check to finish/fail if it's in progress - while ( uplink->rttTestResult == RTT_INPROGRESS ) + // Wait for active RTT measurement to finish + while ( uplink->rttTestResult == RTT_INPROGRESS ) { usleep( 10000 ); + } + signal_close( uplink->signal ); + mutex_lock( &uplink->rttLock ); + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + close( uplink->current.fd ); + uplink->current.fd = -1; + } if ( uplink->better.fd != -1 ) { close( uplink->better.fd ); + uplink->better.fd = -1; } + mutex_unlock( &uplink->sendMutex ); + mutex_unlock( &uplink->rttLock ); + mutex_unlock( &uplink->queueLock ); mutex_destroy( &uplink->queueLock ); mutex_destroy( &uplink->rttLock ); mutex_destroy( &uplink->sendMutex ); @@ -619,9 +614,9 @@ static void* uplink_mainloop(void *data) if ( 
uplink->cacheFd != -1 ) { close( uplink->cacheFd ); } - dnbd3_image_t *image = image_lock( uplink->image ); free( uplink ); // !!! - if ( image != NULL ) { + if ( image_lock( image ) != NULL ) { + // Image is still in list... if ( !_shutdown && image->cache_map != NULL ) { // Ingegrity checker must have found something in the meantime uplink_init( image, -1, NULL, 0 ); @@ -656,7 +651,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) // the thread will re-send this request as soon as the connection // is reestablished. logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( &uplink->current.host ); + altservers_serverFailed( uplink->current.index ); return; } mutex_lock( &uplink->queueLock ); @@ -973,7 +968,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { if ( uplink->current.fd == -1 ) return; - altservers_serverFailed( &uplink->current.host ); + altservers_serverFailed( uplink->current.index ); mutex_lock( &uplink->sendMutex ); close( uplink->current.fd ); uplink->current.fd = -1; @@ -1138,3 +1133,13 @@ static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) ); } +bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) +{ + int current; + mutex_lock( &uplink->rttLock ); + current = uplink->current.fd == -1 ? 
-1 : uplink->current.index; + mutex_unlock( &uplink->rttLock ); + if ( current == -1 ) + return false; + return altservers_toString( current, buffer, len ); +} diff --git a/src/server/uplink.h b/src/server/uplink.h index 4fd41b0..acc8e11 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -16,4 +16,6 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin void uplink_shutdown(dnbd3_image_t *image); +bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len); + #endif /* UPLINK_H_ */ diff --git a/src/serverconfig.h b/src/serverconfig.h index 0cbb320..239f0a2 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -6,10 +6,12 @@ // +++++ Performance/memory related #define SERVER_MAX_CLIENTS 4000 #define SERVER_MAX_IMAGES 5000 -#define SERVER_MAX_ALTS 100 +#define SERVER_MAX_ALTS 50 // +++++ Uplink handling (proxy mode) -#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value -#define SERVER_BAD_UPLINK_THRES 40 // Thresold for numFails at which we ignore a server for the time span below +#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again +#define SERVER_BAD_UPLINK_MIN 10 // Thresold for fails at which we start ignoring the server occasionally +#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times +#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored #define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients @@ -33,7 +35,7 @@ #define SERVER_RTT_PROBES 5 // How many probes to average over #define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes #define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes 
-#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad +#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while #define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times #define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes -- cgit v1.2.3-55-g7522 From e86ee9ba6a0b5299e835a51f62fe5979fc36788c Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 26 Aug 2019 12:00:00 +0200 Subject: [SERVER] Fix warnings, simplify locking --- src/server/server.c | 2 +- src/server/uplink.c | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/server.c b/src/server/server.c index 640048a..922740a 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) printf( "Testing use after free:\n" ); volatile char * volatile test = malloc( 10 ); test[0] = 1; - free( test ); + free( (void*)test ); test[1] = 2; printf( "Testing a failing assertion:\n" ); assert( 4 == 5 ); diff --git a/src/server/uplink.c b/src/server/uplink.c index 6c85580..abfebf0 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -370,6 +370,7 @@ static void* uplink_mainloop(void *data) dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; int numSocks, i, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; + int rttTestResult; uint32_t discoverFailCount = 0; uint32_t unsavedSeconds = 0; ticks nextAltCheck, lastKeepalive; @@ -397,11 +398,9 @@ static void* uplink_mainloop(void *data) events[EV_SOCKET].fd = -1; while ( !_shutdown && !uplink->shutdown ) { // poll() - mutex_lock( &uplink->rttLock ); waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 
0 : -1; - mutex_unlock( &uplink->rttLock ); if ( waitTime == 0 ) { - // Nothing + // 0 means poll, since we're about to change the server } else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { waitTime = 1000; } else { @@ -420,10 +419,9 @@ static void* uplink_mainloop(void *data) continue; } // Check if server switch is in order - mutex_lock( &uplink->rttLock ); - if ( uplink->rttTestResult != RTT_DOCHANGE ) { - mutex_unlock( &uplink->rttLock ); - } else { + if ( unlikely( uplink->rttTestResult == RTT_DOCHANGE ) ) { + mutex_lock( &uplink->rttLock ); + assert( uplink->rttTestResult == RTT_DOCHANGE ); uplink->rttTestResult = RTT_IDLE; // The rttTest worker thread has finished our request. // And says it's better to switch to another server @@ -476,7 +474,7 @@ static void* uplink_mainloop(void *data) // Uplink socket if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { uplink_connectionFailed( uplink, true ); - logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" ); + logadd( LOG_DEBUG1, "Uplink gone away, panic! 
(revents=%d)\n", (int)events[EV_SOCKET].revents ); setThreadName( "panic-uplink" ); } else if ( (events[EV_SOCKET].revents & POLLIN) ) { uplink_handleReceive( uplink ); @@ -509,7 +507,7 @@ static void* uplink_mainloop(void *data) if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { mutex_lock( &uplink->sendMutex ); close( uplink->current.fd ); - uplink->current.fd = events[EV_SOCKET].fd = -1; + uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; if ( uplink->recvBufferLen != 0 ) { @@ -522,7 +520,7 @@ static void* uplink_mainloop(void *data) } } // See if we should trigger an RTT measurement - int rttTestResult = uplink->rttTestResult; + rttTestResult = uplink->rttTestResult; if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { // It seems it's time for a check @@ -964,6 +962,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) uplink_connectionFailed( uplink, true ); } +/** + * Only call from uplink thread + */ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { if ( uplink->current.fd == -1 ) @@ -984,7 +985,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) mutex_unlock( &uplink->rttLock ); if ( bail ) return; - altservers_findUplink( uplink ); + altservers_findUplinkAsync( uplink ); } /** -- cgit v1.2.3-55-g7522 From 69f5bf408b9587a6e2008fba2224c2d506f1a895 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 27 Aug 2019 16:13:07 +0200 Subject: [SERVER] Use reference counting for uplink First step towards less locking for proxy mode --- src/server/altservers.c | 13 ++- src/server/globals.h | 4 +- src/server/image.c | 39 ++++----- src/server/integrity.c | 17 ++-- src/server/net.c | 48 +++++++---- src/server/net.h | 2 + src/server/reference.c | 33 ++++++++ 
src/server/reference.h | 54 ++++++++++++ src/server/reftypes.h | 25 ++++++ src/server/uplink.c | 214 ++++++++++++++++++++++++++++-------------------- src/server/uplink.h | 2 +- 11 files changed, 311 insertions(+), 140 deletions(-) create mode 100644 src/server/reference.c create mode 100644 src/server/reference.h create mode 100644 src/server/reftypes.h (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 493ed9e..7d7fdbe 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -7,6 +7,8 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../serverconfig.h" +#include "reference.h" + #include #include #include @@ -104,7 +106,6 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) return; if ( uplink->current.fd != -1 && numAltServers <= 1 ) return; - int i; // if betterFd != -1 it means the uplink is supposed to switch to another // server. As this function here is called by the uplink thread, it can // never be that the uplink is supposed to switch, but instead calls @@ -112,11 +113,14 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) assert( uplink->better.fd == -1 ); // it is however possible that an RTT measurement is currently in progress, // so check for that case and do nothing if one is in progress - mutex_lock( &uplink->rttLock ); if ( uplink->rttTestResult != RTT_INPROGRESS ) { - threadpool_run( &altservers_runCheck, uplink ); + dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref ); + if ( current == uplink ) { + threadpool_run( &altservers_runCheck, uplink ); + } else if ( current != NULL ) { + ref_put( ¤t->reference ); + } } - mutex_unlock( &uplink->rttLock ); } /** @@ -375,6 +379,7 @@ static void *altservers_runCheck(void *data) assert( uplink != NULL ); setThreadName( "altserver-check" ); altservers_findUplinkInternal( uplink ); + ref_put( &uplink->reference ); // Acquired in findUplinkAsync // Save cache maps of all images if applicable 
// TODO: Has nothing to do with alt servers really, maybe move somewhere else? declare_now; diff --git a/src/server/globals.h b/src/server/globals.h index 4d97c6b..5dd205a 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -8,6 +8,7 @@ #include #include #include +#include "reftypes.h" typedef struct timespec ticks; @@ -64,6 +65,7 @@ typedef struct { #define RTT_NOT_REACHABLE 4 // No uplink was reachable struct _dnbd3_uplink { + ref reference; dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none dnbd3_signal_t* signal; // used to wake up the process @@ -107,7 +109,7 @@ struct _dnbd3_image { char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) - dnbd3_uplink_t *uplink; // pointer to a server connection + weakref uplinkref; // pointer to a server connection uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk diff --git a/src/server/image.c b/src/server/image.c index 1a6e0f8..5b58347 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "reference.h" #include #include @@ -375,9 +376,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) // Check if image is incomplete, handle if ( candidate->cache_map != NULL ) { - if ( candidate->uplink == NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } + uplink_init( candidate, -1, NULL, -1 ); } return candidate; // We did all we can, hopefully it's working @@ -484,17 +483,7 @@ void image_killUplinks() mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( 
_images[i] == NULL ) continue; - mutex_lock( &_images[i]->lock ); - if ( _images[i]->uplink != NULL ) { - mutex_lock( &_images[i]->uplink->queueLock ); - if ( !_images[i]->uplink->shutdown ) { - thread_detach( _images[i]->uplink->thread ); - _images[i]->uplink->shutdown = true; - } - mutex_unlock( &_images[i]->uplink->queueLock ); - signal_call( _images[i]->uplink->signal ); - } - mutex_unlock( &_images[i]->lock ); + uplink_shutdown( _images[i] ); } mutex_unlock( &imageListLock ); } @@ -588,11 +577,15 @@ bool image_tryFreeAll() static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); + assert( image->users == 0 ); if ( !_shutdown ) { logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); } - // - uplink_shutdown( image ); + // uplink_shutdown might return false to tell us + // that the shutdown is in progress. Bail out since + // this will get called again when the uplink is done. + if ( !uplink_shutdown( image ) ) + return NULL; mutex_lock( &image->lock ); free( image->cache_map ); free( image->crc32 ); @@ -860,7 +853,7 @@ static bool image_load(char *base, char *path, int withUplink) image->cache_map = cache_map; image->crc32 = crc32list; image->masterCrc32 = masterCrc; - image->uplink = NULL; + image->uplinkref = NULL; image->realFilesize = realFilesize; image->virtualFilesize = virtualFilesize; image->rid = (uint16_t)revision; @@ -1503,16 +1496,18 @@ json_t* image_getListAsJson() mutex_lock( &image->lock ); idleTime = (int)timing_diff( &image->atime, &now ); completeness = image_getCompletenessEstimate( image ); - if ( image->uplink == NULL ) { + mutex_unlock( &image->lock ); + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL ) { bytesReceived = 0; uplinkName[0] = '\0'; } else { - bytesReceived = image->uplink->bytesReceived; - if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) { + bytesReceived = uplink->bytesReceived; + if ( !uplink_getHostString( uplink, 
uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } + ref_put( &uplink->reference ); } - mutex_unlock( &image->lock ); jsonImage = json_pack( "{sisssisisisisI}", "id", image->id, // id, name, rid never change, so access them without locking @@ -1734,7 +1729,7 @@ void image_closeUnusedFd() if ( image == NULL ) continue; mutex_lock( &image->lock ); - if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) { + if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) { snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid ); fd = image->readFd; image->readFd = -1; diff --git a/src/server/integrity.c b/src/server/integrity.c index 3d1ac9b..f358c46 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -4,6 +4,7 @@ #include "locks.h" #include "image.h" #include "uplink.h" +#include "reference.h" #include #include @@ -238,11 +239,13 @@ static void* integrity_main(void * data UNUSED) if ( i + 1 == queueLen ) queueLen--; // Mark as working again if applicable if ( !foundCorrupted ) { - mutex_lock( &image->lock ); - if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper? - image->working = image->uplink->current.fd != -1 && image->readFd != -1; + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper? + mutex_lock( &image->lock ); + image->working = uplink->current.fd != -1 && image->readFd != -1; + mutex_unlock( &image->lock ); + ref_put( &uplink->reference ); } - mutex_unlock( &image->lock ); } } else { // Still more blocks to go... 
@@ -255,12 +258,8 @@ static void* integrity_main(void * data UNUSED) // Something was fishy, make sure uplink exists mutex_lock( &image->lock ); image->working = false; - bool restart = image->uplink == NULL || image->uplink->shutdown; mutex_unlock( &image->lock ); - if ( restart ) { - uplink_shutdown( image ); - uplink_init( image, -1, NULL, -1 ); - } + uplink_init( image, -1, NULL, -1 ); } // Release :-) image_release( image ); diff --git a/src/server/net.c b/src/server/net.c index 4976eea..e0b516e 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -24,6 +24,7 @@ #include "locks.h" #include "rpc.h" #include "altservers.h" +#include "reference.h" #include "../shared/sockhelper.h" #include "../shared/timing.h" @@ -229,7 +230,7 @@ void* net_handleNewConnection(void *clientPtr) rid = serializer_get_uint16( &payload ); const uint8_t flags = serializer_get_uint8( &payload ); client->isServer = ( flags & FLAGS8_SERVER ); - if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) { + if ( unlikely( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) ) { if ( client_version < MIN_SUPPORTED_CLIENT ) { logadd( LOG_DEBUG1, "Client %s too old", client->hostName ); } else { @@ -257,22 +258,25 @@ void* net_handleNewConnection(void *clientPtr) } client->image = image; atomic_thread_fence( memory_order_release ); - if ( image == NULL ) { + if ( unlikely( image == NULL ) ) { //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); - } else if ( !image->working ) { + } else if ( unlikely( !image->working ) ) { logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n", client->hostName, image_name, (int)rid ); } else { - bool penalty; // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; if ( image->cache_map != NULL ) { - mutex_lock( &image->lock ); - if ( image->uplink == NULL || 
image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { bOk = ( rand() % 4 ) == 1; } - penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1; - mutex_unlock( &image->lock ); + bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 ); + if ( uplink == NULL ) { + uplink_init( image, -1, NULL, 0 ); + } else { + ref_put( &uplink->reference ); + } if ( penalty ) { // Wait 100ms if local caching is not working so this usleep( 100000 ); // server gets a penalty and is less likely to be selected } @@ -300,7 +304,7 @@ void* net_handleNewConnection(void *clientPtr) } } - if ( bOk ) { + if ( likely( bOk ) ) { // add artificial delay if applicable if ( client->isServer && _serverPenalty != 0 ) { usleep( _serverPenalty ); @@ -315,7 +319,7 @@ void* net_handleNewConnection(void *clientPtr) case CMD_GET_BLOCK:; const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking reply.handle = request.handle; - if ( offset >= image->virtualFilesize ) { + if ( unlikely( offset >= image->virtualFilesize ) ) { // Sanity check logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName ); reply.size = 0; @@ -323,7 +327,7 @@ void* net_handleNewConnection(void *clientPtr) send_reply( client->sock, &reply, NULL ); break; } - if ( offset + request.size > image->virtualFilesize ) { + if ( unlikely( offset + request.size > image->virtualFilesize ) ) { // Sanity check logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName ); reply.size = 0; @@ -398,7 +402,7 @@ void* net_handleNewConnection(void *clientPtr) reply.size = request.size; fixup_reply( reply ); - const bool lock = image->uplink != NULL; + const bool lock = image->uplinkref != NULL; if ( lock ) mutex_lock( 
&client->sendMutex ); // Send reply header if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) { @@ -696,9 +700,11 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client) { mutex_lock( &client->lock ); if ( client->image != NULL ) { - mutex_lock( &client->image->lock ); - if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client ); - mutex_unlock( &client->image->lock ); + dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref ); + if ( uplink != NULL ) { + uplink_removeClient( uplink, client ); + ref_put( &uplink->reference ); + } } mutex_lock( &client->sendMutex ); if ( client->sock != -1 ) { @@ -740,3 +746,15 @@ static bool addToList(dnbd3_client_t *client) return true; } +void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle) +{ + dnbd3_reply_t reply; + reply.magic = dnbd3_packet_magic; + reply.cmd = cmd; + reply.handle = handle; + reply.size = 0; + mutex_lock( &client->sendMutex ); + send_reply( client->sock, &reply, NULL ); + mutex_unlock( &client->sendMutex ); +} + diff --git a/src/server/net.h b/src/server/net.h index 6813b49..7719aef 100644 --- a/src/server/net.h +++ b/src/server/net.h @@ -37,4 +37,6 @@ void net_disconnectAll(); void net_waitForAllDisconnected(); +void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle); + #endif /* NET_H_ */ diff --git a/src/server/reference.c b/src/server/reference.c new file mode 100644 index 0000000..468e00b --- /dev/null +++ b/src/server/reference.c @@ -0,0 +1,33 @@ +#ifndef unlikely +#define unlikely(x) (x) +#endif +#include "reference.h" +#include +#include + +void ref_init( ref *reference, void ( *freefun )( ref * ), long count ) +{ + reference->count = count; + reference->free = freefun; +} + +_Noreturn void _ref_error( const char *message ) +{ + fprintf( stderr, "Reference counter overflow\n" ); + abort(); +} + +void ref_setref( weakref *weakref, ref *ref ) 
+{ + union _aligned_ref_ *new_weakref = 0; + if ( ref ) { + ( new_weakref = aligned_ref( ref->_aligned_ref ) )->ref = ref; + ref->count += sizeof( union _aligned_ref_ ) + 1; + } + char *old_weakref = (char *)atomic_exchange( weakref, new_weakref ); + if ( !old_weakref ) + return; + struct _ref_ *old_ref = aligned_ref( old_weakref )->ref; + old_ref->count += old_weakref - (char *)aligned_ref( old_weakref ) - sizeof( union _aligned_ref_ ); + ref_put( old_ref ); +} diff --git a/src/server/reference.h b/src/server/reference.h new file mode 100644 index 0000000..0bc081a --- /dev/null +++ b/src/server/reference.h @@ -0,0 +1,54 @@ +#ifndef _REFERENCE_H_ +#define _REFERENCE_H_ + +#include "reftypes.h" +#include +#include + +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - (char *)&(((type *)NULL)->member))) + +void ref_init( ref *reference, void ( *freefun )( ref * ), long count ); + +void ref_setref( weakref *weakref, ref *ref ); + +_Noreturn void _ref_error( const char *message ); + +static inline ref *ref_get( weakref *weakref ) +{ + char *old_weakref = (char *)*weakref; + do { + if ( old_weakref == NULL ) + return NULL; + if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) { + old_weakref = (char *)*weakref; + continue; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) ); + struct _ref_ *ref = aligned_ref( old_weakref )->ref; + if ( unlikely( ++ref->count == -1 ) ) { + _ref_error( "Reference counter overflow. Aborting.\n" ); + } + char *cur_weakref = ( char * )*weakref; + do { + if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) { + ref->count--; + break; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) ); + return ref; +} + +static inline void ref_put( ref *ref ) +{ + if ( --ref->count == 0 ) { + ref->free( ref ); + } +} + +#define ref_get_uplink(wr) ({ \ + ref* ref = ref_get( wr ); \ + ref == NULL ? 
NULL : container_of(ref, dnbd3_uplink_t, reference); \ +}) + +#endif diff --git a/src/server/reftypes.h b/src/server/reftypes.h new file mode 100644 index 0000000..45c0c20 --- /dev/null +++ b/src/server/reftypes.h @@ -0,0 +1,25 @@ +#ifndef _REFTYPES_H_ +#define _REFTYPES_H_ + +#include + +_Static_assert( sizeof( void * ) == sizeof( _Atomic( void * ) ), "Atomic pointer bad" ); + +typedef _Atomic( void * ) weakref; + +#define aligned_ref(ptr) \ + ((union _aligned_ref_ *)((ptr) - (uintptr_t)(ptr) % sizeof(union _aligned_ref_))) + +union _aligned_ref_ { + struct _ref_ *ref; + void *_padding[( 32 - 1 ) / sizeof( void * ) + 1]; +}; + +typedef struct _ref_ { + _Atomic long count; + void ( *free )( struct _ref_ * ); + char _padding[sizeof( union _aligned_ref_ )]; + char _aligned_ref[sizeof( union _aligned_ref_ )]; +} ref; + +#endif diff --git a/src/server/uplink.c b/src/server/uplink.c index abfebf0..7a39887 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -3,10 +3,12 @@ #include "locks.h" #include "image.h" #include "altservers.h" +#include "net.h" #include "../shared/sockhelper.h" #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "reference.h" #include #include @@ -45,6 +47,8 @@ static const char *const NAMES_ULR[4] = { static atomic_uint_fast64_t totalBytesReceived = 0; +static void cancelAllRequests(dnbd3_uplink_t *uplink); +static void uplink_free(ref *ref); static void* uplink_mainloop(void *data); static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); @@ -76,19 +80,24 @@ uint64_t uplink_getTotalBytesReceived() bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version) { if ( !_isProxy || _shutdown ) return false; - dnbd3_uplink_t *uplink = NULL; assert( image != NULL ); mutex_lock( &image->lock ); - if ( image->uplink != NULL && !image->uplink->shutdown ) { + 
dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { mutex_unlock( &image->lock ); - if ( sock >= 0 ) close( sock ); + if ( sock != -1 ) { + close( sock ); + } + ref_put( &uplink->reference ); return true; // There's already an uplink, so should we consider this success or failure? } if ( image->cache_map == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } - uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); + // Start with one reference for the uplink thread. We'll return it when the thread finishes + ref_init( &uplink->reference, uplink_free, 1 ); mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE ); mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT ); mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); @@ -121,12 +130,13 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_ERROR, "Could not start thread for new uplink." ); goto failure; } + ref_setref( &image->uplinkref, &uplink->reference ); mutex_unlock( &image->lock ); return true; failure: ; if ( uplink != NULL ) { free( uplink ); - uplink = image->uplink = NULL; + uplink = NULL; } mutex_unlock( &image->lock ); return false; @@ -137,34 +147,83 @@ failure: ; * Calling it multiple times, even concurrently, will * not break anything. 
*/ -void uplink_shutdown(dnbd3_image_t *image) +bool uplink_shutdown(dnbd3_image_t *image) { - bool join = false; - pthread_t thread; assert( image != NULL ); mutex_lock( &image->lock ); - if ( image->uplink == NULL ) { + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL ) { mutex_unlock( &image->lock ); - return; + return true; } - dnbd3_uplink_t * const uplink = image->uplink; mutex_lock( &uplink->queueLock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { + image->users++; // Prevent free while uplink shuts down signal_call( uplink->signal ); - thread = uplink->thread; - join = true; + } else { + logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid ); } + cancelAllRequests( uplink ); + ref_setref( &image->uplinkref, NULL ); + ref_put( &uplink->reference ); mutex_unlock( &uplink->queueLock ); - bool wait = image->uplink != NULL; + bool retval = ( exp && image->users == 0 ); mutex_unlock( &image->lock ); - if ( join ) thread_join( thread, NULL ); - while ( wait ) { - usleep( 5000 ); - mutex_lock( &image->lock ); - wait = image->uplink != NULL && image->uplink->shutdown; - mutex_unlock( &image->lock ); + return exp; +} + +/** + * Cancel all requests of this uplink. 
+ * HOLD QUEUE LOCK WHILE CALLING + */ +static void cancelAllRequests(dnbd3_uplink_t *uplink) +{ + for ( int i = 0; i < uplink->queueLen; ++i ) { + if ( uplink->queue[i].status != ULR_FREE ) { + net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle ); + uplink->queue[i].status = ULR_FREE; + } + } + uplink->queueLen = 0; +} + +static void uplink_free(ref *ref) +{ + dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); + logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid ); + assert( uplink->queueLen == 0 ); + signal_close( uplink->signal ); + if ( uplink->current.fd != -1 ) { + close( uplink->current.fd ); + uplink->current.fd = -1; + } + if ( uplink->better.fd != -1 ) { + close( uplink->better.fd ); + uplink->better.fd = -1; + } + mutex_destroy( &uplink->queueLock ); + mutex_destroy( &uplink->rttLock ); + mutex_destroy( &uplink->sendMutex ); + free( uplink->recvBuffer ); + uplink->recvBuffer = NULL; + if ( uplink->cacheFd != -1 ) { + close( uplink->cacheFd ); } + // TODO Requeue any requests + dnbd3_image_t *image = image_lock( uplink->image ); + if ( image != NULL ) { + // != NULL means image is still in list... + if ( !_shutdown && image->cache_map != NULL ) { + // Ingegrity checker must have found something in the meantime + uplink_init( image, -1, NULL, 0 ); + } + image_release( image ); + } + // Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code + // of the uplink thread, depending on who set the uplink->shutdown flag. + image_release( image ); + free( uplink ); // !!! 
} /** @@ -193,31 +252,28 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) */ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) { - if ( client == NULL || client->image == NULL ) return false; + if ( client == NULL || client->image == NULL ) + return false; if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - mutex_lock( &client->image->lock ); - if ( client->image->uplink == NULL ) { - mutex_unlock( &client->image->lock ); + dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref ); + if ( uplink == NULL ) { logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); return false; } - dnbd3_uplink_t * const uplink = client->image->uplink; if ( uplink->shutdown ) { - mutex_unlock( &client->image->lock ); logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" ); - return false; + goto fail_ref; } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); - mutex_unlock( &client->image->lock ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." 
); - return false; + goto fail_ref; } int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise @@ -229,7 +285,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin const uint64_t end = start + length; mutex_lock( &uplink->queueLock ); - mutex_unlock( &client->image->lock ); + if ( uplink->shutdown ) { // Check again after locking to prevent lost requests + goto fail_lock; + } for (i = 0; i < uplink->queueLen; ++i) { // find free slot to place this request into if ( uplink->queue[i].status == ULR_FREE ) { @@ -257,18 +315,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( unlikely( requestLoop ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); - mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - return false; + goto fail_lock; } if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { freeSlot = -1; // Not attaching to existing request, make it use a higher slot } if ( freeSlot == -1 ) { if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { - mutex_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." 
); - return false; + goto fail_lock; } freeSlot = uplink->queueLen++; } @@ -305,16 +361,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin #endif mutex_unlock( &uplink->queueLock ); - if ( foundExisting != -1 ) + if ( foundExisting != -1 ) { + ref_put( &uplink->reference ); return true; // Attached to pending request, do nothing - - usleep( 10000 ); + } // See if we can fire away the request - if ( mutex_trylock( &uplink->sendMutex ) != 0 ) { + if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) { logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); } else { - if ( uplink->current.fd == -1 ) { + if ( unlikely( uplink->current.fd == -1 ) ) { mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { @@ -323,13 +379,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( hops < 200 ) ++hops; const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); mutex_unlock( &uplink->sendMutex ); - if ( !ret ) { + if ( unlikely( !ret ) ) { logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again int state; mutex_lock( &uplink->queueLock ); - if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { + if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { state = uplink->queue[freeSlot].status; if ( uplink->queue[freeSlot].status == ULR_NEW ) { uplink->queue[freeSlot].status = ULR_PENDING; @@ -345,6 +401,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } else { logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", 
NAMES_ULR[uplink->queue[freeSlot].status] ); } + ref_put( &uplink->reference ); return true; } // Fall through to waking up sender thread @@ -354,7 +411,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } + ref_put( &uplink->reference ); return true; +fail_lock: + mutex_unlock( &uplink->queueLock ); +fail_ref: + ref_put( &uplink->reference ); + return false; } /** @@ -381,6 +444,7 @@ static void* uplink_mainloop(void *data) // assert( uplink != NULL ); setThreadName( "idle-uplink" ); + thread_detach( uplink->thread ); blockNoncriticalSignals(); // Make sure file is open for writing if ( !uplink_reopenCacheFd( uplink, false ) ) { @@ -553,7 +617,7 @@ static void* uplink_mainloop(void *data) for (i = 0; i < uplink->queueLen; ++i) { if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name, + "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name, uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); uplink->queue[i].entered = now; #ifdef _DEBUG_RESEND_STARVING @@ -572,55 +636,26 @@ static void* uplink_mainloop(void *data) #endif } cleanup: ; - // Detach depends on whether someone is joining this thread... 
- bool exp = false; - if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { - thread_detach( uplink->thread ); - } uplink_saveCacheMap( uplink ); dnbd3_image_t *image = uplink->image; mutex_lock( &image->lock ); - // in the list anymore, but we want to prevent it from being freed in either case - if ( image->uplink == uplink ) { - image->uplink = NULL; - } - mutex_unlock( &image->lock ); // Do NOT use image without locking it - mutex_lock( &uplink->queueLock ); - // Wait for active RTT measurement to finish - while ( uplink->rttTestResult == RTT_INPROGRESS ) { - usleep( 10000 ); - } - signal_close( uplink->signal ); - mutex_lock( &uplink->rttLock ); - mutex_lock( &uplink->sendMutex ); - if ( uplink->current.fd != -1 ) { - close( uplink->current.fd ); - uplink->current.fd = -1; - } - if ( uplink->better.fd != -1 ) { - close( uplink->better.fd ); - uplink->better.fd = -1; + bool exp = false; + if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { + image->users++; // We set the flag - hold onto image } - mutex_unlock( &uplink->sendMutex ); - mutex_unlock( &uplink->rttLock ); - mutex_unlock( &uplink->queueLock ); - mutex_destroy( &uplink->queueLock ); - mutex_destroy( &uplink->rttLock ); - mutex_destroy( &uplink->sendMutex ); - free( uplink->recvBuffer ); - uplink->recvBuffer = NULL; - if ( uplink->cacheFd != -1 ) { - close( uplink->cacheFd ); + dnbd3_uplink_t *current = ref_get_uplink( &image->uplinkref ); + if ( current == uplink ) { // Set NULL if it's still us... + mutex_lock( &uplink->queueLock ); + cancelAllRequests( uplink ); + mutex_unlock( &uplink->queueLock ); + ref_setref( &image->uplinkref, NULL ); } - free( uplink ); // !!! - if ( image_lock( image ) != NULL ) { - // Image is still in list... 
- if ( !_shutdown && image->cache_map != NULL ) { - // Ingegrity checker must have found something in the meantime - uplink_init( image, -1, NULL, 0 ); - } - image_release( image ); + if ( current != NULL ) { // Decrease ref in any case + ref_put( &current->reference ); + } + mutex_unlock( &image->lock ); + // Finally as the thread is done, decrease our own ref that we initialized with + ref_put( &uplink->reference ); return NULL ; } @@ -637,7 +672,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); /* logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); + (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); */ mutex_unlock( &uplink->queueLock ); if ( hops < 200 ) ++hops; @@ -782,7 +817,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int /** * Receive data from uplink server and process/dispatch - * Locks on: link.lock, images[].lock + * Locks on: uplink.lock, images[].lock */ static void uplink_handleReceive(dnbd3_uplink_t *uplink) { @@ -924,13 +959,16 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } mutex_unlock( &client->sendMutex ); mutex_lock( &uplink->queueLock ); + if ( i > uplink->queueLen ) { + uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests + } } if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG if ( !served && start != uplink->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name, 
start, end ); + logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end ); } #endif if ( start == uplink->replicationHandle ) { diff --git a/src/server/uplink.h b/src/server/uplink.h index acc8e11..49ff0b4 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -14,7 +14,7 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); -void uplink_shutdown(dnbd3_image_t *image); +bool uplink_shutdown(dnbd3_image_t *image); bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len); -- cgit v1.2.3-55-g7522 From b848c60317dcb5193e4541a679dfc82a257f83e9 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 27 Aug 2019 20:58:01 +0200 Subject: [SERVER] Fix swapped assignment --- src/server/uplink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 7a39887..d77be9c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -960,7 +960,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) mutex_unlock( &client->sendMutex ); mutex_lock( &uplink->queueLock ); if ( i > uplink->queueLen ) { - uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests + i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests } } if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; -- cgit v1.2.3-55-g7522 From 88695877f085af475a6ca8a01c2fbb08eb5b15da Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 29 Aug 2019 14:49:18 +0200 Subject: [SERVER] Use weakref for cache maps Gets rid of a bunch of locking, especially the hot path in net.c where clients are requesting data. Many clients using the same incomplete image previously created a bottleneck here. 
--- src/server/globals.h | 10 ++- src/server/image.c | 208 +++++++++++++++++++++++++++++++------------------ src/server/image.h | 2 +- src/server/integrity.c | 10 ++- src/server/net.c | 81 +++++++++---------- src/server/reference.h | 5 ++ src/server/uplink.c | 64 +++++++-------- 7 files changed, 220 insertions(+), 160 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.h b/src/server/globals.h index f940666..221af78 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -99,6 +99,12 @@ typedef struct int permissions; } dnbd3_access_rule_t; +typedef struct +{ + ref reference; + atomic_uint_least8_t map[]; +} dnbd3_cache_map_t; + /** * Image struct. An image path could be something like * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1 @@ -110,7 +116,7 @@ struct _dnbd3_image char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) weakref uplinkref; // pointer to a server connection - uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete + weakref ref_cacheMap; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time @@ -119,7 +125,7 @@ struct _dnbd3_image uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al) - int completenessEstimate; // Completeness estimate in percent + atomic_int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. 
Only unique in the context of this running instance of DNBD3-Server atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected diff --git a/src/server/image.c b/src/server/image.c index 4eab1d2..1972f48 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -51,10 +51,18 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); static bool image_ensureDiskSpace(uint64_t size, bool force); -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map); static void* closeUnusedFds(void*); +static void allocCacheMap(dnbd3_image_t *image, bool complete); + +static void cmfree(ref *ref) +{ + dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference); + logadd( LOG_DEBUG2, "Freeing a cache map" ); + free( cache ); +} // ########################################## @@ -70,7 +78,6 @@ void image_serverStartup() /** * Update cache-map of given image for the given byte range * start (inclusive) - end (exclusive) - * Locks on: images[].lock */ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) { @@ -91,33 +98,55 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co if ( start >= end ) return; bool setNewBlocks = false; - uint64_t pos = start; - mutex_lock( 
&image->lock ); - if ( image->cache_map == NULL ) { + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { // Image seems already complete if ( set ) { // This makes no sense - mutex_unlock( &image->lock ); - logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path ); return; } // Recreate a cache map, set it to all 1 initially as we assume the image was complete - const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - image->cache_map = malloc( byteSize ); - memset( image->cache_map, 0xff, byteSize ); - } - while ( pos < end ) { - const size_t map_y = (int)( pos >> 15 ); - const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 - const int bit_mask = 1 << map_x; - if ( set ) { - if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; - image->cache_map[map_y] |= (uint8_t)bit_mask; - } else { - image->cache_map[map_y] &= (uint8_t)~bit_mask; + allocCacheMap( image, true ); + cache = ref_get_cachemap( image ); + if ( cache == NULL ) { + logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! 
%s", image->path ); + return; } - pos += DNBD3_BLOCK_SIZE; } + // Set/unset + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + // First byte + uint8_t fb = 0, lb = 0; + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + fb |= bit_mask; + } + // Last byte + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + lb |= bit_mask; + } + if ( set ) { + uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed ); + uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed ); + setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] ); + } else { + atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed ); + atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed ); + } + const uint8_t nval = set ? 
0xff : 0; + // Everything in between + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) { + setNewBlocks = true; + } + } + atomic_thread_fence( memory_order_release ); if ( setNewBlocks && image->crc32 != NULL ) { // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks // for checking, even though this might lead to checking some hash block again, if it was @@ -125,19 +154,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co // First set start and end to borders of hash blocks start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); - pos = start; - while ( pos < end ) { - if ( image->cache_map == NULL ) break; + for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) { const int block = (int)( pos / HASH_BLOCK_SIZE ); - if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { - mutex_unlock( &image->lock ); + if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) { integrity_check( image, block ); - mutex_lock( &image->lock ); } - pos += HASH_BLOCK_SIZE; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); } /** @@ -149,20 +173,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co bool image_isComplete(dnbd3_image_t *image) { assert( image != NULL ); - mutex_lock( &image->lock ); if ( image->virtualFilesize == 0 ) { - mutex_unlock( &image->lock ); return false; } - if ( image->cache_map == NULL ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { return true; } bool complete = true; int j; const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); for (j = 0; j < map_len_bytes - 1; ++j) { - if ( image->cache_map[j] != 0xFF ) { + if ( cache->map[j] != 
0xFF ) { complete = false; break; } @@ -177,18 +199,27 @@ bool image_isComplete(dnbd3_image_t *image) for (j = 0; j < blocks_in_last_byte; ++j) last_byte |= (uint8_t)(1 << j); } - complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte); } - if ( !complete ) { - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); + if ( !complete ) return false; + mutex_lock( &image->lock ); + // Lock and make sure current cache map is still the one we saw complete + dnbd3_cache_map_t *current = ref_get_cachemap( image ); + if ( current == cache ) { + // Set cache map NULL as it's complete + ref_setref( &image->ref_cacheMap, NULL ); + } + if ( current != NULL ) { + ref_put( &current->reference ); } - char mapfile[PATHLEN] = ""; - free( image->cache_map ); - image->cache_map = NULL; - snprintf( mapfile, PATHLEN, "%s.map", image->path ); mutex_unlock( &image->lock ); - unlink( mapfile ); + if ( current == cache ) { // Successfully set cache map to NULL above + char mapfile[PATHLEN] = ""; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + unlink( mapfile ); + } return true; } @@ -350,19 +381,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->rid = candidate->rid; img->users = 1; img->working = false; + img->ref_cacheMap = NULL; mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); img->crc32 = malloc( mb ); memcpy( img->crc32, candidate->crc32, mb ); } - mutex_lock( &candidate->lock ); - if ( candidate->cache_map != NULL ) { - const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); - img->cache_map = malloc( mb ); - memcpy( img->cache_map, candidate->cache_map, mb ); + dnbd3_cache_map_t *cache = ref_get_cachemap( candidate ); + if ( cache != NULL ) { + ref_setref( &img->ref_cacheMap, &cache->reference ); + ref_put( 
&cache->reference ); } - mutex_unlock( &candidate->lock ); if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; @@ -377,7 +407,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) } // Check if image is incomplete, handle - if ( candidate->cache_map != NULL ) { + if ( candidate->ref_cacheMap != NULL ) { uplink_init( candidate, -1, NULL, -1 ); } @@ -585,11 +615,10 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) if ( !uplink_shutdown( image ) ) return NULL; mutex_lock( &image->lock ); - free( image->cache_map ); + ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); free( image->path ); free( image->name ); - image->cache_map = NULL; image->crc32 = NULL; image->path = NULL; image->name = NULL; @@ -600,7 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) return NULL ; } -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) { if ( cacheMap == NULL ) return true; const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; @@ -707,7 +736,7 @@ static bool image_load(char *base, char *path, int withUplink) { int revision = -1; struct stat st; - uint8_t *cache_map = NULL; + dnbd3_cache_map_t *cache = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; int fdImage = -1; @@ -790,7 +819,7 @@ static bool image_load(char *base, char *path, int withUplink) } // 1. 
Allocate memory for the cache map if the image is incomplete - cache_map = image_loadCacheMap( path, virtualFilesize ); + cache = image_loadCacheMap( path, virtualFilesize ); // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) @@ -802,7 +831,7 @@ static bool image_load(char *base, char *path, int withUplink) // Check CRC32 if ( crc32list != NULL ) { - if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { + if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) { logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path ); doFullCheck = true; } @@ -826,7 +855,7 @@ static bool image_load(char *base, char *path, int withUplink) crc32list = NULL; function_return = true; goto load_error; // Keep existing - } else if ( existing->cache_map != NULL && cache_map == NULL ) { + } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); function_return = true; @@ -846,7 +875,8 @@ static bool image_load(char *base, char *path, int withUplink) dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); image->path = strdup( path ); image->name = strdup( imgName ); - image->cache_map = cache_map; + image->ref_cacheMap = NULL; + ref_setref( &image->ref_cacheMap, &cache->reference ); image->crc32 = crc32list; image->masterCrc32 = masterCrc; image->uplinkref = NULL; @@ -855,7 +885,7 @@ static bool image_load(char *base, char *path, int withUplink) image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = (image->cache_map == NULL ); + image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); @@ -870,16 +900,16 @@ static bool 
image_load(char *base, char *path, int withUplink) timing_gets( &image->atime, offset ); // Prevent freeing in cleanup - cache_map = NULL; + cache = NULL; crc32list = NULL; // Get rid of cache map if image is complete - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image_isComplete( image ); } // Image is definitely incomplete, initialize uplink worker - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image->working = false; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); @@ -910,21 +940,22 @@ static bool image_load(char *base, char *path, int withUplink) load_error: ; if ( existing != NULL ) existing = image_release( existing ); if ( crc32list != NULL ) free( crc32list ); - if ( cache_map != NULL ) free( cache_map ); + if ( cache != NULL ) free( cache ); if ( fdImage != -1 ) close( fdImage ); return function_return; } -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) { - uint8_t *retval = NULL; + dnbd3_cache_map_t *retval = NULL; char mapFile[strlen( imagePath ) + 10 + 1]; sprintf( mapFile, "%s.map", imagePath ); int fdMap = open( mapFile, O_RDONLY ); - if ( fdMap >= 0 ) { + if ( fdMap != -1 ) { const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); - retval = calloc( 1, map_size ); - const ssize_t rd = read( fdMap, retval, map_size ); + retval = calloc( 1, sizeof(*retval) + map_size ); + ref_init( &retval->reference, cmfree, 0 ); + const ssize_t rd = read( fdMap, retval->map, map_size ); if ( map_size != rd ) { logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); // Could not read complete map, that means the rest of the image file will be considered incomplete @@ -985,7 +1016,7 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static bool 
image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map) { // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. This is very sloppy and is merely supposed to detect @@ -1529,30 +1560,37 @@ json_t* image_getListAsJson() /** * Get completeness of an image in percent. Only estimated, not exact. * Returns: 0-100 - * DOES NOT LOCK, so make sure to do so before calling */ int image_getCompletenessEstimate(dnbd3_image_t * const image) { assert( image != NULL ); - if ( image->cache_map == NULL ) return image->working ? 100 : 0; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return image->working ? 100 : 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( unlikely( len == 0 ) ) { + ref_put( &cache->reference ); + return 0; + } declare_now; if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { // Since this operation is relatively expensive, we cache the result for a while + ref_put( &cache->reference ); return image->completenessEstimate; } int i; int percent = 0; - const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - if ( len == 0 ) return 0; for ( i = 0; i < len; ++i ) { - if ( image->cache_map[i] == 0xff ) { + const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + if ( v == 0xff ) { percent += 100; - } else if ( image->cache_map[i] != 0 ) { + } else if ( v != 0 ) { percent += 50; } } + ref_put( &cache->reference ); image->completenessEstimate = percent / len; - timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 ); return image->completenessEstimate; } @@ -1744,3 +1782,21 @@ static 
void* closeUnusedFds(void* nix UNUSED) } return NULL; } + +static void allocCacheMap(dnbd3_image_t *image, bool complete) +{ + const uint8_t val = complete ? 0xff : 0; + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize ); + ref_init( &cache->reference, cmfree, 0 ); + memset( cache->map, val, byteSize ); + mutex_lock( &image->lock ); + if ( image->ref_cacheMap != NULL ) { + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + free( cache ); + } else { + ref_setref( &image->ref_cacheMap, &cache->reference ); + } + mutex_unlock( &image->lock ); +} + diff --git a/src/server/image.h b/src/server/image.h index 4668eff..cd87f03 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -9,7 +9,7 @@ void image_serverStartup(); bool image_isComplete(dnbd3_image_t *image); -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set); diff --git a/src/server/integrity.c b/src/server/integrity.c index 1fcb558..a9fbae6 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -181,10 +181,12 @@ static void* integrity_main(void * data UNUSED) const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); bool complete = true; if ( qCount == CHECK_ALL ) { - // When checking full image, skip incomplete blocks, otherwise assume block is complete - mutex_lock( &image->lock ); - complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize ); - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + // When checking full image, skip incomplete blocks, 
otherwise assume block is complete + complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); + ref_put( &cache->reference ); + } } #if defined(linux) || defined(__linux) while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) diff --git a/src/server/net.c b/src/server/net.c index 9c855e4..12bcdad 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -246,7 +246,7 @@ void* net_handleNewConnection(void *clientPtr) // We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does... // Reject, as this would basically force this proxy to do BGR too. image = image_get( image_name, rid, true ); - if ( image != NULL && image->cache_map != NULL ) { + if ( image != NULL && image->ref_cacheMap != NULL ) { // Only exception is if the image is complete locally image = image_release( image ); } @@ -268,7 +268,7 @@ void* net_handleNewConnection(void *clientPtr) } else { // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { bOk = ( rand() % 4 ) == 1; @@ -338,57 +338,52 @@ void* net_handleNewConnection(void *clientPtr) break; } - if ( request.size != 0 && image->cache_map != NULL ) { + dnbd3_cache_map_t *cache; + if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) { // This is a proxyed image, check if we need to relay the request... 
start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); bool isCached = true; - mutex_lock( &image->lock ); - // Check again as we only aquired the lock just now - if ( image->cache_map != NULL ) { - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - uint64_t pos; - // Middle - quick checking - if ( isCached ) { - pos = firstByteInMap + 1; - while ( pos < lastByteInMap ) { - if ( image->cache_map[pos] != 0xff ) { - isCached = false; - break; - } - ++pos; + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + uint8_t b; + atomic_thread_fence( memory_order_acquire ); + // Middle - quick checking + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + isCached = false; + break; } } - // First byte - if ( isCached ) { - pos = start; - do { - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; - } while ( firstByteInMap == (pos >> 15) && pos < end ); + } + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; + } } - // Last byte - only check if request spans multiple bytes in cache map - if ( isCached && firstByteInMap != lastByteInMap ) { - pos = lastByteInMap << 15; - while ( pos < end ) { - assert( lastByteInMap == (pos >> 15) ); - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t 
bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; + } + // Last byte - only check if request spans multiple bytes in cache map + if ( isCached && firstByteInMap != lastByteInMap ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + assert( lastByteInMap == (pos >> 15) ); + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; } } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( !isCached ) { if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", diff --git a/src/server/reference.h b/src/server/reference.h index 8883eb1..2a80955 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -51,4 +51,9 @@ static inline void ref_put( ref *ref ) ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \ }) +#define ref_get_cachemap(image) ({ \ + ref* ref = ref_get( &(image)->ref_cacheMap ); \ + ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \ +}) + #endif diff --git a/src/server/uplink.c b/src/server/uplink.c index d77be9c..0a6bd11 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -91,7 +91,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version ref_put( &uplink->reference ); return true; // There's already an uplink, so should we consider this success or failure? 
} - if ( image->cache_map == NULL ) { + if ( image->ref_cacheMap == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } @@ -170,7 +170,7 @@ bool uplink_shutdown(dnbd3_image_t *image) mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); mutex_unlock( &image->lock ); - return exp; + return retval; } /** @@ -214,7 +214,7 @@ static void uplink_free(ref *ref) dnbd3_image_t *image = image_lock( uplink->image ); if ( image != NULL ) { // != NULL means image is still in list... - if ( !_shutdown && image->cache_map != NULL ) { + if ( !_shutdown && image->ref_cacheMap != NULL ) { // Ingegrity checker must have found something in the meantime uplink_init( image, -1, NULL, 0 ); } @@ -707,13 +707,14 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( uplink == NULL || uplink->current.fd == -1 ) return; if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return; + return; // Already a replication request on the wire, or no more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; - mutex_lock( &image->lock ); - if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { - // No cache map (=image complete), or replication pending, or not enough users, do nothing - mutex_unlock( &image->lock ); + if ( image->users < _bgrMinClients ) return; // Not enough active users + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL || image->users < _bgrMinClients ) { + // No cache map (=image complete) + ref_put( &cache->reference ); return; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); @@ -727,16 +728,18 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) endByte = 
mapBytes; } } + atomic_thread_fence( memory_order_acquire ); int replicationIndex = -1; for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { // Found incomplete one replicationIndex = i; break; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); @@ -768,23 +771,24 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) } /** - * find next index into cache_map that corresponds to the beginning + * find next index into cache map that corresponds to the beginning * of a hash block which is neither completely empty nor completely * replicated yet. Returns -1 if no match. 
*/ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; - mutex_lock( &uplink->image->lock ); - const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); - const uint8_t *cache_map = uplink->image->cache_map; - if ( cache_map != NULL ) { - int j; + dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image ); + if ( cache != NULL ) { + const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); + atomic_thread_fence( memory_order_acquire ); + int j; for (j = 0; j < mapBytes; ++j) { const int i = ( start + j ) % mapBytes; - const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); - const bool isEmpty = cache_map[i] == 0; + const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); + const bool isEmpty = b == 0; if ( !isEmpty && !isFull ) { // Neither full nor empty, replicate if ( retval == -1 ) { @@ -811,7 +815,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int retval = -1; } } - mutex_unlock( &uplink->image->lock ); + ref_put( &cache->reference ); return retval; } @@ -1107,7 +1111,7 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) if ( fsync( uplink->cacheFd ) == -1 ) { // A failing fsync means we have no guarantee that any data // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache_map from the last successful fsync + // from keeping the cache map from the last successful fsync // around and restoring it there isn't much we can do to recover // a consistent state. Bail out. 
logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); @@ -1116,21 +1120,13 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } } - if ( image->cache_map == NULL ) return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - mutex_lock( &image->lock ); - // Lock and get a copy of the cache map, as it could be freed by another thread that is just about to - // figure out that this image's cache copy is complete - if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) return true; - } + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - uint8_t *map = malloc( size ); - memcpy( map, image->cache_map, size ); // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image, // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O - mutex_unlock( &image->lock ); assert( image->path != NULL ); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); @@ -1139,14 +1135,14 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); if ( fd == -1 ) { const int err = errno; - free( map ); + ref_put( &cache->reference ); logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); return false; } size_t done = 0; while ( done < size ) { - const ssize_t ret = write( fd, map, size - done ); + const ssize_t ret = write( fd, cache->map + done, size - done ); if ( ret == -1 ) { if ( errno == EINTR ) continue; logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); @@ -1158,11 +1154,11 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } done += 
(size_t)ret; } + ref_put( &cache->reference ); if ( fsync( fd ) == -1 ) { logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); } close( fd ); - free( map ); return true; } -- cgit v1.2.3-55-g7522 From 9d2d9c6de358b2cf1a602c999d2e0a7a664610f7 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 29 Aug 2019 23:05:26 +0200 Subject: [SERVER] Tear down whole uplink on idle timeout Keeping the uplink thread around forever even though we disconnected from the upstream server seems wasteful. Get rid of this and tear down the uplink entirely. --- src/server/net.c | 13 +++++-------- src/server/uplink.c | 40 +++++++++++++++++++--------------------- 2 files changed, 24 insertions(+), 29 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/net.c b/src/server/net.c index 12bcdad..00c9a8d 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -270,18 +270,15 @@ void* net_handleNewConnection(void *clientPtr) bOk = true; if ( image->ref_cacheMap != NULL ) { dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) { bOk = ( rand() % 4 ) == 1; } - bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 ); - if ( uplink == NULL ) { - uplink_init( image, -1, NULL, 0 ); - } else { - ref_put( &uplink->reference ); - } - if ( penalty ) { // Wait 100ms if local caching is not working so this + if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this usleep( 100000 ); // server gets a penalty and is less likely to be selected } + if ( uplink != NULL ) { + ref_put( &uplink->reference ); + } } if ( bOk ) { mutex_lock( &image->lock ); diff --git a/src/server/uplink.c b/src/server/uplink.c index 0a6bd11..58f8ea5 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c
@@ -258,10 +258,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref ); - if ( uplink == NULL ) { - logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); - return false; + dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref ); + if ( unlikely( uplink == NULL ) ) { + uplink_init( client->image, -1, NULL, -1 ); + uplink = ref_get_uplink( &client->image->uplinkref ); + if ( uplink == NULL ) { + logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); + return false; + } } if ( uplink->shutdown ) { logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" ); @@ -460,12 +464,15 @@ static void* uplink_mainloop(void *data) events[EV_SIGNAL].events = POLLIN; events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal ); events[EV_SOCKET].fd = -1; + if ( uplink->rttTestResult != RTT_DOCHANGE ) { + altservers_findUplink( uplink ); // In case we didn't kickstart + } while ( !_shutdown && !uplink->shutdown ) { // poll() waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 
0 : -1; if ( waitTime == 0 ) { // 0 means poll, since we're about to change the server - } else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) { + } else if ( uplink->current.fd == -1 ) { waitTime = 1000; } else { declare_now; @@ -568,32 +575,22 @@ static void* uplink_mainloop(void *data) } } // Don't keep uplink established if we're idle for too much - if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) { - mutex_lock( &uplink->sendMutex ); - close( uplink->current.fd ); - uplink->current.fd = -1; - mutex_unlock( &uplink->sendMutex ); - uplink->cycleDetected = false; - if ( uplink->recvBufferLen != 0 ) { - uplink->recvBufferLen = 0; - free( uplink->recvBuffer ); - uplink->recvBuffer = NULL; - } + if ( uplink_connectionShouldShutdown( uplink ) ) { logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid ); - setThreadName( "idle-uplink" ); + goto cleanup; } } // See if we should trigger an RTT measurement rttTestResult = uplink->rttTestResult; if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || uplink->current.fd == -1 || uplink->cycleDetected ) { // It seems it's time for a check if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name ); setThreadName( "finished-uplink" ); goto cleanup; - } else if ( !uplink_connectionShouldShutdown( uplink ) ) { + } else { // Not complete - do measurement altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous) if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { @@ -606,6 +603,9 @@ static void* uplink_mainloop(void *data) } else if ( rttTestResult == RTT_NOT_REACHABLE ) 
{ atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ); discoverFailCount++; + if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { + uplink->image->working = false; + } timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); } #ifdef _DEBUG @@ -1125,8 +1125,6 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) return true; logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image, - // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O assert( image->path != NULL ); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); -- cgit v1.2.3-55-g7522 From ebde623c2cdb84eb36e06bbf944efa54aef0e461 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 30 Aug 2019 09:25:28 +0200 Subject: [SERVER] No uplink_init when checking working state; improve logging --- src/server/image.c | 18 ++++++++++-------- src/server/uplink.c | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/image.c b/src/server/image.c index 1972f48..b349058 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -237,7 +237,9 @@ bool image_ensureOpen(dnbd3_image_t *image) { if ( image->readFd != -1 ) return image; int newFd = open( image->path, O_RDONLY ); - if ( newFd != -1 ) { + if ( newFd == -1 ) { + logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + } else { // Check size const off_t flen = lseek( newFd, 0, SEEK_END ); if ( flen == -1 ) { @@ -349,14 +351,14 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText 
); reload = true; } else if ( (uint64_t)len != candidate->realFilesize ) { - logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 + logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 ". Try sending SIGHUP to server if you know what you're doing.", candidate->path, candidate->realFilesize, (uint64_t)len ); } else { // Seek worked, file size is same, now see if we can read from file char buffer[100]; if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { - logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.", + logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.", (int)sizeof(buffer), candidate->path, errno, removingText ); reload = true; } else if ( !candidate->working ) { @@ -370,6 +372,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) // Could not access the image with exising fd - mark for reload which will re-open the file. // make a copy of the image struct but keep the old one around. If/When it's not being used // anymore, it will be freed automatically. + logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path ); dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); img->path = strdup( candidate->path ); img->name = strdup( candidate->name ); @@ -400,17 +403,16 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->users = 0; image_free( img ); } + // Check if image is incomplete, initialize uplink + if ( candidate->ref_cacheMap != NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } // readFd == -1 and working == FALSE at this point, // this function needs some splitting up for handling as we need to run most // of the above code again. for now we know that the next call for this // name:rid will get ne newly inserted "img" and try to re-open the file. 
} - // Check if image is incomplete, handle - if ( candidate->ref_cacheMap != NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } - return candidate; // We did all we can, hopefully it's working } diff --git a/src/server/uplink.c b/src/server/uplink.c index 58f8ea5..52cf417 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -604,6 +604,7 @@ static void* uplink_mainloop(void *data) atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ); discoverFailCount++; if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { + logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); uplink->image->working = false; } timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); -- cgit v1.2.3-55-g7522 From 645bb4b91b06c0eb23867aab1511b080ce122d96 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 30 Aug 2019 09:46:53 +0200 Subject: [SERVER] Introduce debug spam --- src/server/uplink.c | 16 ++++++++-------- src/shared/timing.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 52cf417..4cea7e2 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -472,11 +472,10 @@ static void* uplink_mainloop(void *data) waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 
0 : -1; if ( waitTime == 0 ) { // 0 means poll, since we're about to change the server - } else if ( uplink->current.fd == -1 ) { - waitTime = 1000; } else { declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); + logadd( LOG_DEBUG1, "Next %d for %s", waitTime / 1000, uplink->image->name ); if ( waitTime < 100 ) waitTime = 100; if ( waitTime > 5000 ) waitTime = 5000; } @@ -601,13 +600,14 @@ static void* uplink_mainloop(void *data) timing_set( &nextAltCheck, &now, altCheckInterval ); } } else if ( rttTestResult == RTT_NOT_REACHABLE ) { - atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ); - discoverFailCount++; - if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { - logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); - uplink->image->working = false; + if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) { + discoverFailCount++; + if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { + logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); + uplink->image->working = false; + } } - timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) ); + timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED ); } #ifdef _DEBUG if ( uplink->current.fd != -1 && !uplink->shutdown ) { diff --git a/src/shared/timing.h b/src/shared/timing.h index f3d8802..f23bfeb 100644 --- a/src/shared/timing.h +++ b/src/shared/timing.h @@ -22,7 +22,7 @@ extern struct timespec basetime; /** * Assign src to dst while adding secs seconds. 
*/ -#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + secs; (dst)->tv_nsec = (src)->tv_nsec; } while (0) +#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + (secs); (dst)->tv_nsec = (src)->tv_nsec; } while (0) /** * Define variable now, initialize to timing_get. -- cgit v1.2.3-55-g7522 From 5613ed8bf1f05c38af163c1303ab20be6b20090e Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 30 Aug 2019 09:55:41 +0200 Subject: [SERVER] Less debug spam, fix RTT interval calculation --- src/server/uplink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 4cea7e2..d1cd2e8 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -475,9 +475,8 @@ static void* uplink_mainloop(void *data) } else { declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); - logadd( LOG_DEBUG1, "Next %d for %s", waitTime / 1000, uplink->image->name ); if ( waitTime < 100 ) waitTime = 100; - if ( waitTime > 5000 ) waitTime = 5000; + if ( waitTime > 10000 ) waitTime = 10000; } events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); @@ -582,7 +581,7 @@ static void* uplink_mainloop(void *data) // See if we should trigger an RTT measurement rttTestResult = uplink->rttTestResult; if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) { - if ( timing_reached( &nextAltCheck, &now ) || uplink->current.fd == -1 || uplink->cycleDetected ) { + if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && discoverFailCount == 0 ) || uplink->cycleDetected ) { // It seems it's time for a check if ( image_isComplete( uplink->image ) ) { // Quit work if image is complete @@ -606,6 +605,9 @@ static void* uplink_mainloop(void *data) logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); uplink->image->working = false; } + if ( 
uplink->current.fd == -1 ) { + uplink->cycleDetected = false; + } } timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED ); } -- cgit v1.2.3-55-g7522 From 23210df3faf44521942be607e0afc7bf63742297 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 2 Sep 2019 13:19:07 +0200 Subject: [SERVER] uplink: Don't reinit uplink when freeing it --- src/server/uplink.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index d1cd2e8..14b9013 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -89,7 +89,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version close( sock ); } ref_put( &uplink->reference ); - return true; // There's already an uplink, so should we consider this success or failure? + return true; // There's already an uplink } if ( image->ref_cacheMap == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); @@ -210,19 +210,9 @@ static void uplink_free(ref *ref) if ( uplink->cacheFd != -1 ) { close( uplink->cacheFd ); } - // TODO Requeue any requests - dnbd3_image_t *image = image_lock( uplink->image ); - if ( image != NULL ) { - // != NULL means image is still in list... - if ( !_shutdown && image->ref_cacheMap != NULL ) { - // Ingegrity checker must have found something in the meantime - uplink_init( image, -1, NULL, 0 ); - } - image_release( image ); - } // Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code // of the uplink thread, depending on who set the uplink->shutdown flag. - image_release( image ); + image_release( uplink->image ); free( uplink ); // !!! } @@ -536,7 +526,7 @@ static void* uplink_mainloop(void *data) if ( uplink->current.fd != -1 ) { // Uplink seems fine, relay requests to it... 
uplink_sendRequests( uplink, true ); - } else { // No uplink; maybe it was shutdown since it was idle for too long + } else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long uplink->idleTime = 0; } } -- cgit v1.2.3-55-g7522 From be2e1135c7fcf3850535932b70c0d0891d095d12 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 2 Sep 2019 13:37:48 +0200 Subject: [SERVER] uplink: Don't disable already disabled image --- src/server/uplink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 14b9013..49e726d 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -591,7 +591,7 @@ static void* uplink_mainloop(void *data) } else if ( rttTestResult == RTT_NOT_REACHABLE ) { if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) { discoverFailCount++; - if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { + if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); uplink->image->working = false; } -- cgit v1.2.3-55-g7522 From e83d45b1decd892dfd0a30d4f3db00f5e68c38ae Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 2 Sep 2019 17:30:19 +0200 Subject: [SERVER] Move signal init to uplink_init Initializing the signal in the thread led to a race where we would init the uplink and queue a request for it before the thread actually initialized it. This was not harmful but led to spurious warnings in the server's log.
--- src/server/uplink.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 49e726d..8a0b06b 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -106,7 +106,11 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->idleTime = 0; uplink->queueLen = 0; uplink->cacheFd = -1; - uplink->signal = NULL; + uplink->signal = signal_new(); + if ( uplink->signal == NULL ) { + logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." ); + goto failure; + } uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); mutex_lock( &uplink->sendMutex ); @@ -135,8 +139,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version return true; failure: ; if ( uplink != NULL ) { - free( uplink ); - uplink = NULL; + image->users++; // Expected by uplink_free() + ref_put( &uplink->reference ); // The ref for the uplink thread that never was } mutex_unlock( &image->lock ); return false; @@ -193,7 +197,9 @@ static void uplink_free(ref *ref) dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid ); assert( uplink->queueLen == 0 ); - signal_close( uplink->signal ); + if ( uplink->signal != NULL ) { + signal_close( uplink->signal ); + } if ( uplink->current.fd != -1 ) { close( uplink->current.fd ); uplink->current.fd = -1; @@ -211,7 +217,7 @@ static void uplink_free(ref *ref) close( uplink->cacheFd ); } // Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code - // of the uplink thread, depending on who set the uplink->shutdown flag. + // of the uplink thread, depending on who set the uplink->shutdown flag. (Or uplink_init if that failed) image_release( uplink->image ); free( uplink ); // !!! 
} @@ -446,11 +452,6 @@ static void* uplink_mainloop(void *data) logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno ); } // - uplink->signal = signal_new(); - if ( uplink->signal == NULL ) { - logadd( LOG_WARNING, "error creating signal. Uplink unavailable." ); - goto cleanup; - } events[EV_SIGNAL].events = POLLIN; events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal ); events[EV_SOCKET].fd = -1; -- cgit v1.2.3-55-g7522 From 5765ce49f5e1e26505fd6b162db73a732603d1a8 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 5 Sep 2019 16:52:31 +0200 Subject: [SERVER] integrity checker: Improve flushing logic --- src/server/integrity.c | 199 +++++++++++++++++++++++++++---------------------- src/server/uplink.c | 2 +- 2 files changed, 111 insertions(+), 90 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/integrity.c b/src/server/integrity.c index a9fbae6..fddb755 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -35,6 +35,7 @@ static int queueLen = -1; static atomic_bool bRunning = false; static void* integrity_main(void *data); +static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end); /** * Initialize the integrity check thread @@ -88,14 +89,17 @@ void integrity_check(dnbd3_image_t *image, int block) for (i = 0; i < queueLen; ++i) { if ( freeSlot == -1 && checkQueue[i].image == NULL ) { freeSlot = i; - } else if ( checkQueue[i].image == image - && checkQueue[i].block <= block && checkQueue[i].block + checkQueue[i].count >= block ) { - // Already queued check dominates this one, or at least lies directly before this block - if ( checkQueue[i].block + checkQueue[i].count == block ) { - // It's directly before this one; expand range + } else if ( checkQueue[i].image == image && checkQueue[i].block <= block ) { + if ( checkQueue[i].count == CHECK_ALL ) { + logadd( LOG_DEBUG2, "Dominated by full image scan request 
(%d/%d) (at %d)", i, queueLen, checkQueue[i].block ); + } else if ( checkQueue[i].block + checkQueue[i].count == block ) { checkQueue[i].count += 1; + logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count ); + } else if ( checkQueue[i].block + checkQueue[i].count > block ) { + logadd( LOG_DEBUG2, "Dominated by existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count ); + } else { + continue; } - logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (%d +%d)", i, queueLen, checkQueue[i].block, checkQueue[i].count ); mutex_unlock( &integrityQueueLock ); return; } @@ -123,8 +127,6 @@ void integrity_check(dnbd3_image_t *image, int block) static void* integrity_main(void * data UNUSED) { int i; - uint8_t *buffer = NULL; - size_t bufferSize = 0; setThreadName( "image-check" ); blockNoncriticalSignals(); #if defined(linux) || defined(__linux) @@ -150,88 +152,70 @@ static void* integrity_main(void * data UNUSED) // We have the image. 
Call image_release() some time const int qCount = checkQueue[i].count; bool foundCorrupted = false; - mutex_lock( &image->lock ); if ( image->crc32 != NULL && image->realFilesize != 0 ) { int blocks[2] = { checkQueue[i].block, -1 }; mutex_unlock( &integrityQueueLock ); - // Make copy of crc32 list as it might go away const uint64_t fileSize = image->realFilesize; const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS(fileSize); - const size_t required = numHashBlocks * sizeof(uint32_t); - if ( buffer == NULL || required > bufferSize ) { - bufferSize = required; - if ( buffer != NULL ) free( buffer ); - buffer = malloc( bufferSize ); - } - memcpy( buffer, image->crc32, required ); - mutex_unlock( &image->lock ); - // Open for direct I/O if possible; this prevents polluting the fs cache - int fd = open( image->path, O_RDONLY | O_DIRECT ); - bool direct = fd != -1; - if ( unlikely( !direct ) ) { - // Try unbuffered; flush to disk for that - logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path ); - image_ensureOpen( image ); - fd = image->readFd; - } int checkCount = MIN( qCount, 5 ); - if ( fd != -1 ) { - while ( blocks[0] < numHashBlocks && !_shutdown ) { - const uint64_t start = blocks[0] * HASH_BLOCK_SIZE; - const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); - bool complete = true; - if ( qCount == CHECK_ALL ) { - dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache != NULL ) { - // When checking full image, skip incomplete blocks, otherwise assume block is complete - complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); - ref_put( &cache->reference ); - } - } -#if defined(linux) || defined(__linux) - while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) -#else - while ( fsync( fd ) == -1 ) -#endif - { - if ( _shutdown ) - break; - if ( errno == EINTR ) - continue; - logadd( LOG_ERROR, "Cannot flush %s 
for integrity check (errno=%d)", image->path, errno ); - exit( 1 ); + int readFd = -1, directFd = -1; + while ( blocks[0] < numHashBlocks && !_shutdown ) { + const uint64_t start = blocks[0] * HASH_BLOCK_SIZE; + const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); + bool complete = true; + if ( qCount == CHECK_ALL ) { + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + // When checking full image, skip incomplete blocks, otherwise assume block is complete + complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); + ref_put( &cache->reference ); } - if ( _shutdown ) - break; + } + // Flush to disk if there's an uplink, as that means the block might have been written recently + if ( image->uplinkref != NULL ) { + flushFileRange( image, start, end ); + } + if ( _shutdown ) + break; + // Open for direct I/O if possible; this prevents polluting the fs cache + if ( directFd == -1 && ( end % DNBD3_BLOCK_SIZE ) == 0 ) { // Use direct I/O only if read length is multiple of 4096 to be on the safe side - int tfd; - if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) { - // Suitable for direct io - tfd = fd; - } else if ( !image_ensureOpen( image ) ) { - logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); - break; + directFd = open( image->path, O_RDONLY | O_DIRECT ); + if ( directFd == -1 ) { + logadd( LOG_DEBUG2, "O_DIRECT failed for %s (errno=%d)", image->path, errno ); + directFd = -2; } else { - tfd = image->readFd; - // Evict from cache so we have to re-read, making sure data was properly stored - posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED ); + readFd = directFd; } - if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) { - logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name ); - image_updateCachemap( image, start, end, false ); - // If this is not a full check, queue one - if ( qCount != 
CHECK_ALL ) { - logadd( LOG_INFO, "Queueing full check for %s", image->name ); - integrity_check( image, -1 ); - } - foundCorrupted = true; - } - blocks[0]++; // Increase before break, so it always points to the next block to check after loop - if ( complete && --checkCount == 0 ) break; } - if ( direct ) { - close( fd ); + if ( readFd == -1 ) { // Try buffered; flush to disk for that + image_ensureOpen( image ); + readFd = image->readFd; + } + if ( readFd == -1 ) { + logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path ); + } else if ( complete && !image_checkBlocksCrc32( readFd, image->crc32, blocks, fileSize ) ) { + bool iscomplete = true; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + iscomplete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); + ref_put( &cache->reference ); + } + logadd( LOG_WARNING, "Hash check for block %d of %s failed (complete: was: %d, is: %d)", blocks[0], image->name, (int)complete, (int)iscomplete ); + image_updateCachemap( image, start, end, false ); + // If this is not a full check, queue one + if ( qCount != CHECK_ALL ) { + logadd( LOG_INFO, "Queueing full check for %s", image->name ); + integrity_check( image, -1 ); + } + foundCorrupted = true; } + blocks[0]++; // Increase before break, so it always points to the next block to check after loop + if ( complete && --checkCount == 0 ) + break; + } + if ( directFd != -1 && directFd != -2 ) { + close( directFd ); } mutex_lock( &integrityQueueLock ); assert( checkQueue[i].image == image ); @@ -242,11 +226,8 @@ static void* integrity_main(void * data UNUSED) logadd( LOG_WARNING, "BUG! 
checkQueue counter ran negative" ); } } - if ( checkCount > 0 || checkQueue[i].count <= 0 || fd == -1 ) { - // Done with this task as nothing left, OR we don't have an fd to read from - if ( fd == -1 ) { - logadd( LOG_WARNING, "Cannot hash check %s: bad fd", image->path ); - } + if ( checkCount > 0 || checkQueue[i].count <= 0 ) { + // Done with this task as nothing left checkQueue[i].image = NULL; if ( i + 1 == queueLen ) queueLen--; // Mark as working again if applicable @@ -263,10 +244,8 @@ static void* integrity_main(void * data UNUSED) // Still more blocks to go... checkQueue[i].block = blocks[0]; } - } else { - mutex_unlock( &image->lock ); } - if ( foundCorrupted ) { + if ( foundCorrupted && !_shutdown ) { // Something was fishy, make sure uplink exists mutex_lock( &image->lock ); image->working = false; @@ -278,10 +257,52 @@ static void* integrity_main(void * data UNUSED) } } mutex_unlock( &integrityQueueLock ); - if ( buffer != NULL ) { - free( buffer ); - } bRunning = false; return NULL; } +static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end) +{ + int flushFd; + int writableFd = -1; + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { // Try to steal uplink's writable fd + if ( uplink->cacheFd != -1 ) { + writableFd = dup( uplink->cacheFd ); + } + ref_put( &uplink->reference ); + } + if ( writableFd == -1 ) { // Open file as writable + writableFd = open( image->path, O_WRONLY ); + } + if ( writableFd == -1 ) { // Fallback to readFd (should work on Linux and BSD...) + logadd( LOG_WARNING, "flushFileRange: Cannot open %s for writing. 
Trying readFd.", image->path ); + image_ensureOpen( image ); + flushFd = image->readFd; + } else { + flushFd = writableFd; + } + if ( flushFd == -1 ) + return; +#if defined(linux) || defined(__linux) + while ( sync_file_range( flushFd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) +#else + while ( fsync( flushFd ) == -1 ) // TODO: fdatasync() should be available since FreeBSD 12.0 ... Might be a tad bit faster +#endif + { + if ( _shutdown ) + break; + int e = errno; + if ( e == EINTR ) + continue; + logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, e ); + if ( e == EIO ) { + exit( 1 ); + } + } + // Evict from cache too so we have to re-read, making sure data was properly stored + posix_fadvise( flushFd, start, end - start, POSIX_FADV_DONTNEED ); + if ( writableFd != -1 ) { + close( writableFd ); + } +} diff --git a/src/server/uplink.c b/src/server/uplink.c index 8a0b06b..dab5c27 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -876,7 +876,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done ); if ( unlikely( ret == -1 ) ) { err = errno; - if ( err == EINTR ) continue; + if ( err == EINTR && !_shutdown ) continue; if ( err == ENOSPC || err == EDQUOT ) { // try to free 256MiB if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break; -- cgit v1.2.3-55-g7522 From ebe7d990aa6e2c42ddc8475be5ea65ce2a96605a Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sat, 7 Sep 2019 10:09:11 +0200 Subject: [SERVER] Fix warning on clang --- src/server/uplink.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index dab5c27..f39e633 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -1029,12 +1029,7 @@ static void 
uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) */ static int uplink_sendKeepalive(const int fd) { - static dnbd3_request_t request = { 0 }; - if ( request.magic == 0 ) { - request.magic = dnbd3_packet_magic; - request.cmd = CMD_KEEPALIVE; - fixup_request( request ); - } + static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) }; return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); } -- cgit v1.2.3-55-g7522 From 26c1ad7af0f5749c5343a5823b9c8cece885ce84 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 3 Mar 2020 12:21:01 +0100 Subject: [SERVER] Remove "working" flag, introduce fine-grained flags Tracking the "working" state of images using one boolean is insufficient regarding the different ways in which providing an image can fail. Introduce separate flags for different conditions, like "file not readable", "file not writable", "no uplink server available", "file content has changed". --- src/server/altservers.c | 4 - src/server/globals.h | 7 +- src/server/image.c | 193 +++++++++++++++++++++++++----------------------- src/server/integrity.c | 20 +---- src/server/net.c | 17 +++-- src/server/uplink.c | 114 ++++++++++++++++++---------- 6 files changed, 197 insertions(+), 158 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 3fdbe0d..a6ad235 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -628,10 +628,6 @@ failed: if ( best.fd != -1 ) { close( best.fd ); } - if ( !image->working || uplink->cycleDetected ) { - image->working = true; - LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... 
:-(", image->name, (int)image->rid ); - } uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away mutex_lock( &uplink->rttLock ); uplink->rttTestResult = RTT_DONTCHANGE; diff --git a/src/server/globals.h b/src/server/globals.h index b1336dc..31fbce5 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -136,7 +136,12 @@ struct _dnbd3_image atomic_int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server - atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected + struct { + atomic_bool uplink; // No uplink connected + atomic_bool write; // Error writing to file + atomic_bool read; // Error reading from file + atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back + } problem; uint16_t rid; // revision of image pthread_mutex_t lock; }; diff --git a/src/server/image.c b/src/server/image.c index 6017e59..1ce1574 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -53,7 +53,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force); static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static void image_checkRandomBlocks(dnbd3_image_t *image, const int count); +static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); static void allocCacheMap(dnbd3_image_t *image, bool complete); @@ -239,35 +239,76 @@ bool image_isComplete(dnbd3_image_t *image) */ bool image_ensureOpen(dnbd3_image_t 
*image) { - if ( image->readFd != -1 ) return image; - int newFd = open( image->path, O_RDONLY ); + bool sizeChanged = false; + if ( image->readFd != -1 && !image->problem.changed ) + return true; + int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd ); if ( newFd == -1 ) { - logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + image->problem.read = true; + } } else { - // Check size + // Check size + read access + char buffer[100]; const off_t flen = lseek( newFd, 0, SEEK_END ); if ( flen == -1 ) { - logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + image->problem.read = true; + } close( newFd ); newFd = -1; } else if ( (uint64_t)flen != image->realFilesize ) { - logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen ); + if ( !image->problem.changed ) { + logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, + image->realFilesize, (uint64_t)flen ); + } + sizeChanged = true; + } else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) { + if ( !image->problem.read ) { + logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)", + (int)sizeof(buffer), image->path, errno ); + image->problem.read = true; + } close( newFd ); newFd = -1; } } if ( newFd == -1 ) { - mutex_lock( &image->lock ); - image->working = false; - mutex_unlock( &image->lock ); + if ( sizeChanged ) { + image->problem.changed = true; + } return false; } + + // Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32, + // but only if the size we just got above is correct. 
+ if ( image->problem.changed && !sizeChanged ) { + if ( image->crc32 == NULL ) { + // Cannot verify further, hope for the best + image->problem.changed = false; + logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", + image->name, (int)image->rid ); + } else if ( image_checkRandomBlocks( image, 1, newFd ) ) { + // This should have checked the first block (if complete) -> All is well again + image->problem.changed = false; + logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", + image->name, (int)image->rid ); + } + } else { + image->problem.changed = sizeChanged; + } + mutex_lock( &image->lock ); if ( image->readFd == -1 ) { image->readFd = newFd; + image->problem.read = false; mutex_unlock( &image->lock ); } else { - // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed + // There was a race while opening the file (happens cause not locked cause blocking), + // we lost the race so close new fd and proceed. + // *OR* we dup()'ed above for cheating when the image changed before. mutex_unlock( &image->lock ); close( newFd ); } @@ -296,7 +337,7 @@ dnbd3_image_t* image_byId(int imgId) * point... * Locks on: imageListLock, _images[].lock */ -dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) +dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen) { int i; const char *removingText = _removeMissingImages ? 
", removing from list" : ""; @@ -326,84 +367,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) candidate->users++; mutex_unlock( &imageListLock ); - // Found, see if it works - // TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list - // TODO: But remember size-changed images forever - if ( candidate->working || checkIfWorking ) { - // Is marked working, but might not have an fd open - if ( !image_ensureOpen( candidate ) ) { - mutex_lock( &candidate->lock ); - timing_get( &candidate->lastWorkCheck ); - mutex_unlock( &candidate->lock ); - if ( _removeMissingImages ) { - candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller - } - return candidate; - } - } - - if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state - - // ...not working... - - // Don't re-check too often - mutex_lock( &candidate->lock ); - bool check; - declare_now; - check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS; - if ( check ) { - candidate->lastWorkCheck = now; - } - mutex_unlock( &candidate->lock ); - if ( !check ) { + if ( !ensureFdOpen ) // Don't want to re-check return candidate; - } - // reaching this point means: - // 1) We should check if the image is working, it might or might not be in working state right now - // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.) 
- // 3) We made sure not to re-check this image too often - - // Common for ro and rw images: Size check, read check - const off_t len = lseek( candidate->readFd, 0, SEEK_END ); - bool reload = false; - if ( len == -1 ) { - logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText ); - reload = true; - } else if ( (uint64_t)len != candidate->realFilesize ) { - logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 - ". Try sending SIGHUP to server if you know what you're doing.", - candidate->path, candidate->realFilesize, (uint64_t)len ); - } else { - // Seek worked, file size is same, now see if we can read from file - char buffer[100]; - if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { - logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.", - (int)sizeof(buffer), candidate->path, errno, removingText ); - reload = true; - } else if ( !candidate->working ) { - // Seems everything is fine again \o/ - candidate->working = true; - logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid ); - } - } + if ( image_ensureOpen( candidate ) && !candidate->problem.read ) + return candidate; // We have a read fd and no read or changed problems - if ( reload ) { + // -- image could not be opened again, or is open but has problem -- + + if ( _removeMissingImages && !file_isReadable( candidate->path ) ) { + candidate = image_remove( candidate ); + // No image_release here, the image is still returned and should be released by caller + } else if ( candidate->readFd != -1 ) { + // We cannot just close the fd as it might be in use. Make a copy and remove old entry. + candidate = image_remove( candidate ); // Could not access the image with exising fd - mark for reload which will re-open the file. // make a copy of the image struct but keep the old one around. 
If/When it's not being used // anymore, it will be freed automatically. - logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path ); + logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path ); dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); img->path = strdup( candidate->path ); img->name = strdup( candidate->name ); img->virtualFilesize = candidate->virtualFilesize; img->realFilesize = candidate->realFilesize; - img->atime = now; + timing_get( &img->atime ); img->masterCrc32 = candidate->masterCrc32; img->readFd = -1; img->rid = candidate->rid; img->users = 1; - img->working = false; + img->problem.read = true; + img->problem.changed = candidate->problem.changed; img->ref_cacheMap = NULL; mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { @@ -419,18 +412,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; + // Check if image is incomplete, initialize uplink + if ( candidate->ref_cacheMap != NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } + // Try again with new instance + image_ensureOpen( candidate ); } else { img->users = 0; image_free( img ); } - // Check if image is incomplete, initialize uplink - if ( candidate->ref_cacheMap != NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } - // readFd == -1 and working == FALSE at this point, - // this function needs some splitting up for handling as we need to run most - // of the above code again. for now we know that the next call for this - // name:rid will get ne newly inserted "img" and try to re-open the file. 
+ // readFd == -1 and problem.read == true } return candidate; // We did all we can, hopefully it's working @@ -900,7 +892,6 @@ static bool image_load(char *base, char *path, int withUplink) image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); @@ -925,7 +916,7 @@ static bool image_load(char *base, char *path, int withUplink) // Image is definitely incomplete, initialize uplink worker if ( image->ref_cacheMap != NULL ) { - image->working = false; + image->problem.uplink = true; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); } @@ -937,7 +928,7 @@ static bool image_load(char *base, char *path, int withUplink) // Keep fd for reading fdImage = -1; // Check CRC32 - image_checkRandomBlocks( image, 4 ); + image_checkRandomBlocks( image, 4, -1 ); } else { logadd( LOG_ERROR, "Image list full: Could not add image %s", path ); image->readFd = -1; // Keep fdImage instead, will be closed below @@ -1027,10 +1018,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static void image_checkRandomBlocks(dnbd3_image_t *image, const int count) +/** + * Check up to count random blocks from given image. If fromFd is -1, the check will + * be run asynchronously using the integrity checker. Otherwise, the check will + * happen in the function and return the result of the check. + * @param image image to check + * @param count number of blocks to check (max) + * @param fromFd, check synchronously and use this fd for reading, -1 = async + * @return true = OK, false = error. Meaningless if fromFd == -1 + */ +static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd) { if ( image->crc32 == NULL ) - return; + return true; // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. 
This is very sloppy and is merely supposed to detect // accidental corruption due to broken dnbd3-proxy functionality or file system @@ -1038,7 +1038,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count) assert( count > 0 ); dnbd3_cache_map_t *cache = ref_get_cachemap( image ); const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize ); - int blocks[count]; + int blocks[count+1]; // +1 for "-1" in sync case int index = 0, j; int block; if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) { @@ -1062,9 +1062,16 @@ while_end: ; if ( cache != NULL ) { ref_put( &cache->reference ); } - for ( int i = 0; i < index; ++i ) { - integrity_check( image, blocks[i], true ); + if ( fromFd == -1 ) { + // Async + for ( int i = 0; i < index; ++i ) { + integrity_check( image, blocks[i], true ); + } + return true; } + // Sync + blocks[index] = -1; + return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize ); } /** @@ -1306,7 +1313,7 @@ server_fail: ; } else { // Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism? int i = 0; - while ( !image->working && ++i < 100 ) + while ( image->problem.uplink && ++i < 100 ) usleep( 2000 ); } } else if ( uplinkSock != -1 ) { @@ -1599,7 +1606,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image) assert( image != NULL ); dnbd3_cache_map_t *cache = ref_get_cachemap( image ); if ( cache == NULL ) - return image->working ? 
100 : 0; + return 100; const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); if ( unlikely( len == 0 ) ) { ref_put( &cache->reference ); diff --git a/src/server/integrity.c b/src/server/integrity.c index 4006dfc..91e53b8 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED) readFd = directFd; } } - if ( readFd == -1 ) { // Try buffered; flush to disk for that - image_ensureOpen( image ); - readFd = image->readFd; + if ( readFd == -1 ) { // Try buffered as fallback + if ( image_ensureOpen( image ) && !image->problem.read ) { + readFd = image->readFd; + } } if ( readFd == -1 ) { logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path ); @@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED) // Done with this task as nothing left checkQueue[i].image = NULL; if ( i + 1 == queueLen ) queueLen--; - // Mark as working again if applicable - if ( !foundCorrupted ) { - dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper? - mutex_lock( &image->lock ); - image->working = uplink->current.fd != -1 && image->readFd != -1; - mutex_unlock( &image->lock ); - ref_put( &uplink->reference ); - } - } } else { // Still more blocks to go... 
checkQueue[i].block = blocks[0]; @@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED) } if ( foundCorrupted && !_shutdown ) { // Something was fishy, make sure uplink exists - mutex_lock( &image->lock ); - image->working = false; - mutex_unlock( &image->lock ); uplink_init( image, -1, NULL, -1 ); } // Release :-) diff --git a/src/server/net.c b/src/server/net.c index aba4e7d..29147be 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -262,7 +262,7 @@ void* net_handleNewConnection(void *clientPtr) atomic_thread_fence( memory_order_release ); if ( unlikely( image == NULL ) ) { //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); - } else if ( unlikely( !image->working ) ) { + } else if ( unlikely( image->problem.read || image->problem.changed ) ) { logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n", client->hostName, image_name, (int)rid ); } else { @@ -273,8 +273,14 @@ void* net_handleNewConnection(void *clientPtr) if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) { bOk = ( rand() % 4 ) == 1; } - if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this - usleep( 100000 ); // server gets a penalty and is less likely to be selected + if ( bOk && uplink != NULL ) { + if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this + usleep( 100000 ); // server gets a penalty and is less likely to be selected + } + if ( image->problem.uplink ) { + // Penalize depending on completeness, if no uplink is available + usleep( ( 100 - image->completenessEstimate ) * 100 ); + } } if ( uplink != NULL ) { ref_put( &uplink->reference ); @@ -383,9 +389,8 @@ void* net_handleNewConnection(void *clientPtr) ref_put( &cache->reference ); if ( !isCached ) { if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { - logadd( 
LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", + logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d", client->hostName, image->name, image->rid ); - image->working = false; goto exit_client_cleanup; } break; // DONE, exit request.cmd switch @@ -456,7 +461,7 @@ void* net_handleNewConnection(void *clientPtr) } if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) { logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid ); - image->working = false; + image->problem.read = true; } } goto exit_client_cleanup; diff --git a/src/server/uplink.c b/src/server/uplink.c index f39e633..aba53ba 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -53,9 +53,9 @@ static void* uplink_mainloop(void *data); static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); static void uplink_handleReceive(dnbd3_uplink_t *uplink); -static int uplink_sendKeepalive(const int fd); +static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); static void uplink_addCrc32(dnbd3_uplink_t *uplink); -static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); +static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); @@ -117,6 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; + image->problem.uplink = true; if ( sock != -1 ) { uplink->better.fd = sock; int index = altservers_hostToIndex( host ); @@ -371,6 +372,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin logadd( LOG_DEBUG2, "Could not 
trylock send mutex, queueing uplink request" ); } else { if ( unlikely( uplink->current.fd == -1 ) ) { + uplink->image->problem.uplink = true; mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { @@ -378,12 +380,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); if ( hops < 200 ) ++hops; const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - mutex_unlock( &uplink->sendMutex ); if ( unlikely( !ret ) ) { + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); } else { // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again int state; + mutex_unlock( &uplink->sendMutex ); mutex_lock( &uplink->queueLock ); if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { state = uplink->queue[freeSlot].status; @@ -460,9 +464,9 @@ static void* uplink_mainloop(void *data) } while ( !_shutdown && !uplink->shutdown ) { // poll() - waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 
0 : -1; - if ( waitTime == 0 ) { + if ( uplink->rttTestResult == RTT_DOCHANGE ) { // 0 means poll, since we're about to change the server + waitTime = 0; } else { declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); @@ -495,7 +499,7 @@ static void* uplink_mainloop(void *data) discoverFailCount = 0; if ( fd != -1 ) close( fd ); uplink->replicationHandle = REP_NONE; - uplink->image->working = true; + uplink->image->problem.uplink = false; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) { @@ -510,6 +514,11 @@ static void* uplink_mainloop(void *data) uplink_sendRequests( uplink, false ); uplink_sendReplicationRequest( uplink ); events[EV_SOCKET].events = POLLIN | POLLRDHUP; + if ( uplink->image->problem.uplink ) { + // Some of the requests above must have failed again already :-( + logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" ); + uplink_connectionFailed( uplink, true ); + } timing_gets( &nextAltCheck, altCheckInterval ); // The rtt worker already did the handshake for our image, so there's nothing // more to do here @@ -517,6 +526,7 @@ static void* uplink_mainloop(void *data) // Check events // Signal if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { + uplink->image->problem.uplink = true; logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" 
); goto cleanup; } else if ( (events[EV_SIGNAL].revents & POLLIN) ) { @@ -553,14 +563,10 @@ static void* uplink_mainloop(void *data) } // Keep-alive if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { - // Send keep-alive if nothing is happening - if ( uplink_sendKeepalive( uplink->current.fd ) ) { - // Re-trigger periodically, in case it requires a minimum user count - uplink_sendReplicationRequest( uplink ); - } else { + // Send keep-alive if nothing is happening, and try to trigger background rep. + if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { uplink_connectionFailed( uplink, true ); - logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" ); - setThreadName( "panic-uplink" ); + logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" ); } } // Don't keep uplink established if we're idle for too much @@ -578,6 +584,7 @@ static void* uplink_mainloop(void *data) // Quit work if image is complete logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name ); setThreadName( "finished-uplink" ); + uplink->image->problem.uplink = false; goto cleanup; } else { // Not complete - do measurement @@ -592,10 +599,6 @@ static void* uplink_mainloop(void *data) } else if ( rttTestResult == RTT_NOT_REACHABLE ) { if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) { discoverFailCount++; - if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) { - logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid ); - uplink->image->working = false; - } if ( uplink->current.fd == -1 ) { uplink->cycleDetected = false; } @@ -624,8 +627,9 @@ static void* uplink_mainloop(void *data) } } mutex_unlock( &uplink->queueLock ); - if ( resend ) + if ( resend ) { uplink_sendRequests( uplink, true ); + } } #endif } @@ -653,6 +657,9 @@ static void* uplink_mainloop(void *data) 
return NULL ; } +/** + * Only called from uplink thread. + */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { // Scan for new requests @@ -672,13 +679,15 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) if ( hops < 200 ) ++hops; mutex_lock( &uplink->sendMutex ); const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - mutex_unlock( &uplink->sendMutex ); - if ( !ret ) { + if ( likely( ret ) ) { + mutex_unlock( &uplink->sendMutex ); + } else { // Non-critical - if the connection dropped or the server was changed // the thread will re-send this request as soon as the connection // is reestablished. + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - altservers_serverFailed( uplink->current.index ); return; } mutex_lock( &uplink->queueLock ); @@ -695,21 +704,27 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) * server. This means we might request data we already have, but it makes * the code simpler. Worst case would be only one bit is zero, which means * 4kb are missing, but we will request 32kb. + * + * Only called form uplink thread, so current.fd is assumed to be valid. + * + * @return false if sending request failed, true otherwise (i.e. 
not necessary/disabled) */ -static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) +static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { - if ( uplink == NULL || uplink->current.fd == -1 ) return; - if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication + if ( uplink->current.fd == -1 ) + return false; // Should never be called in this state, consider send error + if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) + return true; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return; // Already a replication request on the wire, or no more blocks to replicate + return true; // Already a replication request on the wire, or no more blocks to replicate dnbd3_image_t * const image = uplink->image; - if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; - if ( image->users < _bgrMinClients ) return; // Not enough active users + if ( image->users < _bgrMinClients ) + return true; // Not enough active users dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL || image->users < _bgrMinClients ) { + if ( cache == NULL || image->users ) { // No cache map (=image complete) ref_put( &cache->reference ); - return; + return true; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); const int lastBlockIndex = mapBytes - 1; @@ -741,17 +756,20 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( replicationIndex == -1 ) { // Replication might be complete, uplink_mainloop should take care.... 
uplink->nextReplicationIndex = -1; - return; + return true; } const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; uplink->replicationHandle = offset; const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); mutex_lock( &uplink->sendMutex ); bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); - mutex_unlock( &uplink->sendMutex ); - if ( !sendOk ) { + if ( likely( sendOk ) ) { + mutex_unlock( &uplink->sendMutex ); + } else { + uplink->image->problem.uplink = true; + mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); - return; + return false; } if ( replicationIndex == lastBlockIndex ) { uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks @@ -762,6 +780,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) // Just crossed a hash block boundary, look for new candidate starting at this very index uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); } + return true; } /** @@ -816,6 +835,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int /** * Receive data from uplink server and process/dispatch * Locks on: uplink.lock, images[].lock + * Only called from uplink thread, so current.fd is assumed to be valid. 
*/ static void uplink_handleReceive(dnbd3_uplink_t *uplink) { @@ -990,11 +1010,14 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) mutex_lock( &uplink->queueLock ); const bool rep = ( uplink->queueLen == 0 ); mutex_unlock( &uplink->queueLock ); - if ( rep ) uplink_sendReplicationRequest( uplink ); + if ( rep ) { + if ( !uplink_sendReplicationRequest( uplink ) ) + goto error_cleanup; + } } return; // Error handling from failed receive or message parsing - error_cleanup: ; +error_cleanup: ; uplink_connectionFailed( uplink, true ); } @@ -1005,8 +1028,10 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { if ( uplink->current.fd == -1 ) return; + setThreadName( "panic-uplink" ); altservers_serverFailed( uplink->current.index ); mutex_lock( &uplink->sendMutex ); + uplink->image->problem.uplink = true; close( uplink->current.fd ); uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); @@ -1025,14 +1050,24 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) } /** - * Send keep alive request to server + * Send keep alive request to server. + * Called from uplink thread, current.fd must be valid. */ -static int uplink_sendKeepalive(const int fd) +static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink) { static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) }; - return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); + mutex_lock( &uplink->sendMutex ); + bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); + mutex_unlock( &uplink->sendMutex ); + return sendOk; } +/** + * Request crclist from uplink. + * Called from uplink thread, current.fd must be valid. + * FIXME This is broken as it could happen that another message arrives after sending + * the request. Refactor, split and move receive into general receive handler. 
+ */ static void uplink_addCrc32(dnbd3_uplink_t *uplink) { dnbd3_image_t *image = uplink->image; @@ -1042,6 +1077,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) uint32_t *buffer = malloc( bytes ); mutex_lock( &uplink->sendMutex ); bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes ); + if ( !sendOk ) { + uplink->image->problem.uplink = true; + } mutex_unlock( &uplink->sendMutex ); if ( !sendOk || bytes == 0 ) { free( buffer ); -- cgit v1.2.3-55-g7522 From 5bc3badd013b88201da64dc970600d19451daaec Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 3 Mar 2020 14:55:01 +0100 Subject: [SERVER] Also add a flag for uplink queue overload --- src/server/globals.h | 3 ++- src/server/net.c | 10 +++------- src/server/uplink.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.h b/src/server/globals.h index 31fbce5..0bd6e47 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -93,7 +93,7 @@ struct _dnbd3_uplink // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
- atomic_int queueLen; // length of queue + int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; @@ -141,6 +141,7 @@ struct _dnbd3_image atomic_bool write; // Error writing to file atomic_bool read; // Error reading from file atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back + atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image pthread_mutex_t lock; diff --git a/src/server/net.c b/src/server/net.c index 29147be..a478e0c 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -269,12 +269,11 @@ void* net_handleNewConnection(void *clientPtr) // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; if ( image->ref_cacheMap != NULL ) { - dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) { + if ( image->problem.queue || image->problem.write ) { bOk = ( rand() % 4 ) == 1; } - if ( bOk && uplink != NULL ) { - if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this + if ( bOk ) { + if ( image->problem.write ) { // Wait 100ms if local caching is not working so this usleep( 100000 ); // server gets a penalty and is less likely to be selected } if ( image->problem.uplink ) { @@ -282,9 +281,6 @@ void* net_handleNewConnection(void *clientPtr) usleep( ( 100 - image->completenessEstimate ) * 100 ); } } - if ( uplink != NULL ) { - ref_put( &uplink->reference ); - } } if ( bOk ) { mutex_lock( &image->lock ); diff --git a/src/server/uplink.c b/src/server/uplink.c index aba53ba..97cb2a9 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -118,6 +118,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, 
dnbd3_host_t *host, int version mutex_unlock( &uplink->sendMutex ); uplink->cycleDetected = false; image->problem.uplink = true; + image->problem.write = true; + image->problem.queue = false; if ( sock != -1 ) { uplink->better.fd = sock; int index = altservers_hostToIndex( host ); @@ -191,6 +193,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink) } } uplink->queueLen = 0; + uplink->image->problem.queue = false; } static void uplink_free(ref *ref) @@ -328,6 +331,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin goto fail_lock; } freeSlot = uplink->queueLen++; + if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = true; + } } // Do not send request to uplink server if we have a matching pending request AND the request either has the // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise @@ -904,6 +910,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) continue; // Success, retry write } if ( err == EBADF || err == EINVAL || err == EIO ) { + uplink->image->problem.write = true; if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) ) break; tryAgain = false; @@ -983,6 +990,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } + if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = false; + } mutex_unlock( &uplink->queueLock ); #ifdef _DEBUG if ( !served && start != uplink->replicationHandle ) { @@ -1121,6 +1131,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) close( uplink->cacheFd ); } uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 ); + uplink->image->problem.write = uplink->cacheFd == -1; return uplink->cacheFd != -1; } -- cgit v1.2.3-55-g7522 From 930b65f26cb39687a113641f56711a2d58f886ca Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 4 Mar 2020 
17:49:50 +0100 Subject: [SERVER] Add timer task for saving cache maps Cache maps will now be saved periodically, but only if either they have a "dirty" bit set, which happens if any bits in the map get cleared again (due to corruption), or if new data has been replicated from an uplink server. This either means at least one byte received and 5 minutes have passed, or at least 500MB have been downloaded. The timer currently runs every 20 seconds. --- src/server/altservers.c | 20 +++++++ src/server/altservers.h | 2 + src/server/globals.h | 3 +- src/server/image.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++- src/server/image.h | 2 + src/server/uplink.c | 76 ++------------------------- src/serverconfig.h | 5 +- 7 files changed, 168 insertions(+), 76 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index a6ad235..380737c 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -273,6 +273,26 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server return num; } +/** + * Returns true if there is at least one alt-server the + * given image name would be allowed to be cloned from. + */ +bool altservers_imageHasAltServers(const char *image) +{ + bool ret = false; + mutex_lock( &altServersLock ); + for ( int i = 0; i < numAltServers; ++i ) { + if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) ) + continue; + if ( !isImageAllowed( &altServers[i], image ) ) + continue; + ret = true; + break; + } + mutex_unlock( &altServersLock ); + return ret; +} + /** * Get alt servers. If there are more alt servers than * requested, random servers will be picked. 
diff --git a/src/server/altservers.h b/src/server/altservers.h index 8e29aaa..78f6fcc 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size); +bool altservers_imageHasAltServers(const char *image); + bool altservers_toString(int server, char *buffer, size_t len); int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2); diff --git a/src/server/globals.h b/src/server/globals.h index 5de4180..10d3ee3 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -93,6 +93,7 @@ struct _dnbd3_uplink // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. + atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; @@ -128,7 +129,6 @@ struct _dnbd3_image uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time - ticks lastWorkCheck; // last time a non-working image has been checked ticks nextCompletenessEstimate; // next time the completeness estimate should be updated uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list @@ -144,6 +144,7 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image + atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now) pthread_mutex_t 
lock; }; diff --git a/src/server/image.c b/src/server/image.c index 3583f86..5a9e15b 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,6 +55,8 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); +static void* saveAllCacheMaps(void*); +static bool saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); static void cmfree(ref *ref) @@ -73,6 +75,7 @@ void image_serverStartup() mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); mutex_init( &reloadLock, LOCK_RELOAD ); server_addJob( &closeUnusedFds, NULL, 10, 900 ); + server_addJob( &saveAllCacheMaps, NULL, 9, 20 ); } /** @@ -160,6 +163,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co integrity_check( image, block, false ); } } + } else if ( !set ) { + image->mapDirty = true; } ref_put( &cache->reference ); } @@ -624,6 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. 
if ( !uplink_shutdown( image ) ) return NULL; + saveCacheMap( image ); mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); @@ -1830,6 +1836,135 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } +#define IMGCOUNT 5 +static void* saveAllCacheMaps(void* nix UNUSED) +{ + static ticks nextSave; + dnbd3_image_t *list[IMGCOUNT]; + int count = 0; + declare_now; + bool full = timing_reached( &nextSave, &now ); + mutex_lock( &imageListLock ); + for ( int i = 0; i < _num_images; ++i ) { + dnbd3_image_t * const image = _images[i]; + if ( image->mapDirty ) { + // Flag is set if integrity checker found a problem - save out + image->users++; + list[count++] = image; + image->mapDirty = false; + } else { + // Otherwise, consider longer timeout and byte count limits of uplink + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES + || ( full && diff != 0 ) ) { + image->users++; + list[count++] = image; + uplink->bytesReceivedLastSave = uplink->bytesReceived; + } + ref_put( &uplink->reference ); + } + } + if ( count == IMGCOUNT ) + break; + } + mutex_unlock( &imageListLock ); + if ( full && count < IMGCOUNT ) { + // Only update nextSave once we handled all images in the list + timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY ); + } + for ( int i = 0; i < count; ++i ) { + saveCacheMap( list[i] ); + image_release( list[i] ); + } + return NULL; +} +#undef IMGCOUNT + +/** + * Saves the cache map of the given image. + * Return true on success. 
+ * @param image the image + */ +static bool saveCacheMap(dnbd3_image_t *image) +{ + if ( !_isProxy ) + return true; // Nothing to do + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return true; // Nothing to do + // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) + // for which we have any upstream servers configured. If there's none, don't touch + // the cache map on disk. + if ( !altservers_imageHasAltServers( image->name ) ) { + ref_put( &cache->reference ); + return true; // Nothing to do + } + + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); + char mapfile[strlen( image->path ) + 4 + 1]; + strcpy( mapfile, image->path ); + strcat( mapfile, ".map" ); + + int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); + if ( fd == -1 ) { + const int err = errno; + ref_put( &cache->reference ); + logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); + return false; + } + + // On Linux we could use readFd, but in general it's not guaranteed to work + int imgFd = open( image->path, O_WRONLY ); + if ( imgFd == -1 ) { + logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno ); + } else { + if ( fsync( imgFd ) == -1 ) { + logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno ); + dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize); + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( old == NULL ) { + // Could not load old map. FS might be toast. + logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." 
); + memset( cache->map, 0, mapSize ); + } else { + // AND the maps together to be safe + for ( int i = 0; i < mapSize; ++i ) { + cache->map[i] &= old->map[i]; + } + old->reference.free( &old->reference ); + } + } + close( imgFd ); + } + + // Write current map to file + size_t done = 0; + while ( done < size ) { + const ssize_t ret = write( fd, cache->map + done, size - done ); + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); + break; + } + if ( ret <= 0 ) { + logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); + break; + } + done += (size_t)ret; + } + ref_put( &cache->reference ); + if ( fsync( fd ) == -1 ) { + logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); + } + close( fd ); + // TODO fsync on parent directory + return true; +} + static void allocCacheMap(dnbd3_image_t *image, bool complete) { const uint8_t val = complete ? 
0xff : 0; @@ -1846,4 +1981,3 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) } mutex_unlock( &image->lock ); } - diff --git a/src/server/image.h b/src/server/image.h index 89791fc..4614c74 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -49,6 +49,8 @@ void image_closeUnusedFd(); bool image_ensureDiskSpaceLocked(uint64_t size, bool force); +bool image_saveCacheMap(dnbd3_image_t *image); + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/uplink.c b/src/server/uplink.c index 97cb2a9..e5ab9c0 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -57,7 +57,6 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); static void uplink_addCrc32(dnbd3_uplink_t *uplink); static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); @@ -103,6 +102,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); uplink->image = image; uplink->bytesReceived = 0; + uplink->bytesReceivedLastSave = 0; uplink->idleTime = 0; uplink->queueLen = 0; uplink->cacheFd = -1; @@ -445,7 +445,6 @@ static void* uplink_mainloop(void *data) int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; - uint32_t unsavedSeconds = 0; ticks nextAltCheck, lastKeepalive; char buffer[200]; memset( events, 0, sizeof(events) ); @@ -561,12 +560,6 @@ static void* uplink_mainloop(void *data) if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { lastKeepalive = now; uplink->idleTime += timepassed; - unsavedSeconds += timepassed; - if ( 
unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) { - // fsync/save every 4 minutes, or every 60 seconds if uplink is idle - unsavedSeconds = 0; - uplink_saveCacheMap( uplink ); - } // Keep-alive if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening, and try to trigger background rep. @@ -639,9 +632,9 @@ static void* uplink_mainloop(void *data) } #endif } - cleanup: ; - uplink_saveCacheMap( uplink ); +cleanup: ; dnbd3_image_t *image = uplink->image; + image->mapDirty = true; // Force writeout of cache map mutex_lock( &image->lock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { @@ -1135,69 +1128,6 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) return uplink->cacheFd != -1; } -/** - * Saves the cache map of the given image. - * Return true on success. - * Locks on: imageListLock, image.lock - */ -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) -{ - dnbd3_image_t *image = uplink->image; - assert( image != NULL ); - - if ( uplink->cacheFd != -1 ) { - if ( fsync( uplink->cacheFd ) == -1 ) { - // A failing fsync means we have no guarantee that any data - // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache map from the last successful fsync - // around and restoring it there isn't much we can do to recover - // a consistent state. Bail out. 
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); - logadd( LOG_ERROR, "Bailing out immediately" ); - exit( 1 ); - } - } - - dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL ) - return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - assert( image->path != NULL ); - char mapfile[strlen( image->path ) + 4 + 1]; - strcpy( mapfile, image->path ); - strcat( mapfile, ".map" ); - - int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); - if ( fd == -1 ) { - const int err = errno; - ref_put( &cache->reference ); - logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); - return false; - } - - size_t done = 0; - while ( done < size ) { - const ssize_t ret = write( fd, cache->map + done, size - done ); - if ( ret == -1 ) { - if ( errno == EINTR ) continue; - logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); - break; - } - if ( ret <= 0 ) { - logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); - break; - } - done += (size_t)ret; - } - ref_put( &cache->reference ); - if ( fsync( fd ) == -1 ) { - logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); - } - close( fd ); - return true; -} - static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT diff --git a/src/serverconfig.h b/src/serverconfig.h index 239f0a2..5c7301d 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -17,7 +17,10 @@ #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -#define SERVER_CACHE_MAP_SAVE_INTERVAL 90 +// Wait a maximum of 5 minutes before saving cache 
map (if data was received at all) +#define CACHE_MAP_MAX_SAVE_DELAY 300 +// If more than 500MB have been received from uplink without saving cache map, do so +#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000) // Time in ms to wait for a read/write call to complete on an uplink connection #define SOCKET_TIMEOUT_UPLINK 5000 -- cgit v1.2.3-55-g7522 From 080a06ab22c8ac0841c06fe52ab4dbc982beafc1 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Mar 2020 11:34:58 +0100 Subject: [SERVER] Reload cache maps periodically for local images If an image is incomplete, but has no upstream server that can be used for replication, reload the cache map from disk periodically, in case some other server instance is writing to the image. --- src/server/globals.h | 3 +- src/server/image.c | 129 +++++++++++++++++++++++++++++++++------------------ src/server/uplink.c | 10 +++- 3 files changed, 93 insertions(+), 49 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.h b/src/server/globals.h index 10d3ee3..211fe2d 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -111,6 +111,8 @@ typedef struct typedef struct { ref reference; + atomic_bool dirty; // Cache map has been modified outside uplink (only integrity checker for now) + bool unchanged; // How many times in a row a reloaded cache map went unchanged _Atomic uint8_t map[]; } dnbd3_cache_map_t; @@ -144,7 +146,6 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image - atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now) pthread_mutex_t lock; }; diff --git a/src/server/image.c b/src/server/image.c index 5a9e15b..7ffe041 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,8 +55,9 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t 
fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); -static void* saveAllCacheMaps(void*); -static bool saveCacheMap(dnbd3_image_t *image); +static bool imageShouldSaveCacheMap(dnbd3_image_t *image); +static void* saveLoadAllCacheMaps(void*); +static void saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); static void cmfree(ref *ref) @@ -75,7 +76,7 @@ void image_serverStartup() mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); mutex_init( &reloadLock, LOCK_RELOAD ); server_addJob( &closeUnusedFds, NULL, 10, 900 ); - server_addJob( &saveAllCacheMaps, NULL, 9, 20 ); + server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 ); } /** @@ -164,7 +165,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co } } } else if ( !set ) { - image->mapDirty = true; + cache->dirty = true; } ref_put( &cache->reference ); } @@ -629,7 +630,9 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. if ( !uplink_shutdown( image ) ) return NULL; - saveCacheMap( image ); + if ( imageShouldSaveCacheMap( image ) ) { + saveCacheMap( image ); + } mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); @@ -1836,72 +1839,107 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } -#define IMGCOUNT 5 -static void* saveAllCacheMaps(void* nix UNUSED) +static bool imageShouldSaveCacheMap(dnbd3_image_t *image) +{ + if ( !_isProxy ) + return false; // Nothing to do + if ( image->ref_cacheMap == NULL ) + return false; // Nothing to do + // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) + // for which we have any upstream servers configured. If there's none, don't touch + // the cache map on disk. 
+ if ( !altservers_imageHasAltServers( image->name ) ) + return false; // Nothing to do + return true; +} + +static void* saveLoadAllCacheMaps(void* nix UNUSED) { static ticks nextSave; - dnbd3_image_t *list[IMGCOUNT]; - int count = 0; declare_now; bool full = timing_reached( &nextSave, &now ); mutex_lock( &imageListLock ); for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; - if ( image->mapDirty ) { - // Flag is set if integrity checker found a problem - save out - image->users++; - list[count++] = image; - image->mapDirty = false; - } else { - // Otherwise, consider longer timeout and byte count limits of uplink + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + continue; // No users++ or mutex_unlock yet -> safe + image->users++; + mutex_unlock( &imageListLock ); + if ( imageShouldSaveCacheMap( image ) ) { + // Replicated image, we're responsible for updating the map, so save it + // Save if dirty bit is set, blocks were invalidated + bool save = cache->dirty; dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); - if ( uplink != NULL ) { - assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); - uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; - if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES - || ( full && diff != 0 ) ) { - image->users++; - list[count++] = image; + if ( !save ) { + // Otherwise, consider longer timeout and byte count limits of uplink + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) { + save = true; + } + } + } + if ( save ) { + cache->dirty = false; + if ( uplink != NULL ) { uplink->bytesReceivedLastSave = uplink->bytesReceived; } + saveCacheMap( image ); + } + if ( uplink != NULL ) { ref_put( &uplink->reference ); } + } else { + // We're not replicating this 
image, if there's a cache map, reload + // it periodically, since we might read from a shared storage that + // another server instance is writing to. + if ( full || !cache->unchanged && !image->problem.read ) { + logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid ); + dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); + if ( onDisk == NULL ) { + // Should be complete now + logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid ); + ref_setref( &image->ref_cacheMap, NULL ); + } else { + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) { + // Unchanged + cache->unchanged = true; + onDisk->reference.free( &onDisk->reference ); + } else { + // Replace + ref_setref( &image->ref_cacheMap, &onDisk->reference ); + logadd( LOG_DEBUG2, "Map changed" ); + } + } + } } - if ( count == IMGCOUNT ) - break; + ref_put( &cache->reference ); + image_release( image ); // Always do this instead of users-- to handle freeing + mutex_lock( &imageListLock ); } mutex_unlock( &imageListLock ); - if ( full && count < IMGCOUNT ) { - // Only update nextSave once we handled all images in the list + if ( full ) { timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY ); } - for ( int i = 0; i < count; ++i ) { - saveCacheMap( list[i] ); - image_release( list[i] ); - } return NULL; } -#undef IMGCOUNT /** * Saves the cache map of the given image. - * Return true on success. + * Return false if this image doesn't have a cache map, or if the image + * doesn't have any uplink to replicate from. In this case the image might + * still have a cache map that was loaded from disk, and should be reloaded + * periodically. 
* @param image the image */ -static bool saveCacheMap(dnbd3_image_t *image) +static void saveCacheMap(dnbd3_image_t *image) { - if ( !_isProxy ) - return true; // Nothing to do dnbd3_cache_map_t *cache = ref_get_cachemap( image ); if ( cache == NULL ) - return true; // Nothing to do - // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) - // for which we have any upstream servers configured. If there's none, don't touch - // the cache map on disk. - if ( !altservers_imageHasAltServers( image->name ) ) { - ref_put( &cache->reference ); - return true; // Nothing to do - } + return; // Race - wasn't NULL in function call above... logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); @@ -1914,7 +1952,7 @@ static bool saveCacheMap(dnbd3_image_t *image) const int err = errno; ref_put( &cache->reference ); logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); - return false; + return; } // On Linux we could use readFd, but in general it's not guaranteed to work @@ -1962,7 +2000,6 @@ static bool saveCacheMap(dnbd3_image_t *image) } close( fd ); // TODO fsync on parent directory - return true; } static void allocCacheMap(dnbd3_image_t *image, bool complete) diff --git a/src/server/uplink.c b/src/server/uplink.c index e5ab9c0..e644e56 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -80,6 +80,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version { if ( !_isProxy || _shutdown ) return false; assert( image != NULL ); + if ( sock == -1 && !altservers_imageHasAltServers( image->name ) ) + return false; // Nothing to do mutex_lock( &image->lock ); dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); if ( uplink != NULL ) { @@ -103,7 +105,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->image = image; 
uplink->bytesReceived = 0; uplink->bytesReceivedLastSave = 0; - uplink->idleTime = 0; + uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90; uplink->queueLen = 0; uplink->cacheFd = -1; uplink->signal = signal_new(); @@ -634,7 +636,11 @@ static void* uplink_mainloop(void *data) } cleanup: ; dnbd3_image_t *image = uplink->image; - image->mapDirty = true; // Force writeout of cache map + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + cache->dirty = true; // Force writeout of cache map + ref_put( &cache->reference ); + } mutex_lock( &image->lock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { -- cgit v1.2.3-55-g7522 From ff4e770e645c05da48baddb30a77b9dc15ca76fd Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Mar 2020 15:00:46 +0100 Subject: [SERVER] Handle "warn unused result" cases --- src/server/fileutil.c | 2 +- src/server/globals.c | 5 ++++- src/server/image.c | 8 ++++++-- src/server/rpc.c | 2 +- src/server/server.c | 5 ++++- src/server/uplink.c | 14 ++++++++++---- 6 files changed, 26 insertions(+), 10 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/fileutil.c b/src/server/fileutil.c index 336ab68..9a9f066 100644 --- a/src/server/fileutil.c +++ b/src/server/fileutil.c @@ -68,7 +68,7 @@ bool file_setSize(int fd, uint64_t size) // Try really hard... 
image loading logic relies on the file // having the proper apparent size uint8_t byte = 0; - pread( fd, &byte, 1, size - 1 ); + (void)!pread( fd, &byte, 1, size - 1 ); if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true; return false; } diff --git a/src/server/globals.c b/src/server/globals.c index 2e87400..ac079b1 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -113,7 +113,10 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key void globals_loadConfig() { char *name = NULL; - asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ); + if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 ) { + logadd( LOG_ERROR, "Memory allocation error for config filename" ); + exit( 1 ); + } if ( name == NULL ) return; if ( initialLoad ) { mutex_init( &loadLock, LOCK_LOAD_CONFIG ); diff --git a/src/server/image.c b/src/server/image.c index 7ffe041..32c9efe 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1446,9 +1446,13 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. 
(%s)", name ); } else { int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 ); - write( fd, &masterCrc, sizeof(uint32_t) ); - write( fd, crc32list, crc32len ); + ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) ); + ret += write( fd, crc32list, crc32len ); close( fd ); + if ( (size_t)ret != crc32len + sizeof(masterCrc) ) { + logadd( LOG_WARNING, "Could not save freshly received crc32 list for %s:%d", name, (int)revision ); + unlink( crcFile ); + } } } free( crc32list ); diff --git a/src/server/rpc.c b/src/server/rpc.c index a454d6d..b66b8fe 100644 --- a/src/server/rpc.c +++ b/src/server/rpc.c @@ -101,7 +101,7 @@ void rpc_init() int fd = open( "/dev/urandom", O_RDONLY ); if ( fd != -1 ) { uint32_t bla = 1; - read( fd, &bla, 4 ); + (void)!read( fd, &bla, 4 ); randomRunId = (randomRunId << 32) | bla; } close( fd ); diff --git a/src/server/server.c b/src/server/server.c index 0dddea7..c9edc05 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -315,7 +315,10 @@ int main(int argc, char *argv[]) // No one-shot detected, normal server operation or errormsg serving if ( demonize ) { logadd( LOG_INFO, "Forking into background, see log file for further information" ); - daemon( 1, 0 ); + if ( daemon( 0, 0 ) == -1 ) { + logadd( LOG_ERROR, "Could not daemon(): errno=%d", errno ); + exit( 1 ); + } } if ( errorMsg != NULL ) { setupNetwork( bindAddress ); diff --git a/src/server/uplink.c b/src/server/uplink.c index e644e56..71d9f94 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -1098,7 +1098,8 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes ); lists_crc = net_order_32( lists_crc ); if ( lists_crc != masterCrc ) { - logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name ); + logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", + uplink->image->name, (int)uplink->image->rid ); free( buffer ); return; } @@ -1108,10 +1109,15 
@@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) char path[len]; snprintf( path, len, "%s.crc", uplink->image->path ); const int fd = open( path, O_WRONLY | O_CREAT, 0644 ); - if ( fd >= 0 ) { - write( fd, &masterCrc, sizeof(uint32_t) ); - write( fd, buffer, bytes ); + if ( fd != -1 ) { + ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) ); + ret += write( fd, buffer, bytes ); close( fd ); + if ( (size_t)ret != sizeof(masterCrc) + bytes ) { + unlink( path ); + logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", + uplink->image->name, (int)uplink->image->rid ); + } } } -- cgit v1.2.3-55-g7522 From 9f11c67b291b50e0f1c98d2e85db22a33d2e2d11 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 6 Mar 2020 16:02:54 +0100 Subject: [SERVER] Add printf macro for image (name:rid as %s:%d) --- src/server/altservers.c | 4 ++-- src/server/globals.h | 1 + src/server/image.c | 32 ++++++++++++++------------------ src/server/uplink.c | 20 ++++++++++---------- 4 files changed, 27 insertions(+), 30 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 380737c..35da3a2 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -14,7 +14,7 @@ #include #include -#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid) +#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image)) #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); #define ERROR_GOTO(jumplabel, ...) 
LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) @@ -524,7 +524,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" ); return; } - LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) ); assert( uplink->rttTestResult == RTT_INPROGRESS ); // Test them all dnbd3_server_connection_t best = { .fd = -1 }; diff --git a/src/server/globals.h b/src/server/globals.h index 211fe2d..1bb6857 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -148,6 +148,7 @@ struct _dnbd3_image uint16_t rid; // revision of image pthread_mutex_t lock; }; +#define PIMG(x) (x)->name, (int)(x)->rid struct _dnbd3_client { diff --git a/src/server/image.c b/src/server/image.c index 32c9efe..18e91d9 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -290,13 +290,11 @@ bool image_ensureOpen(dnbd3_image_t *image) if ( image->crc32 == NULL ) { // Cannot verify further, hope for the best image->problem.changed = false; - logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", - image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) ); } else if ( image_checkRandomBlocks( image, 1, newFd ) ) { // This should have checked the first block (if complete) -> All is well again image->problem.changed = false; - logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", - image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) ); } } else { image->problem.changed = sizeChanged; @@ -624,7 +622,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); assert( image->users == 0 ); - logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid ); + logadd( ( _shutdown ? 
LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) ); // uplink_shutdown might return false to tell us // that the shutdown is in progress. Bail out since // this will get called again when the uplink is done. @@ -852,16 +850,16 @@ static bool image_load(char *base, char *path, int withUplink) // Compare data just loaded to identical image we apparently already loaded if ( existing != NULL ) { if ( existing->realFilesize != realFilesize ) { - logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) ); // Image will be replaced below } else if ( existing->crc32 != NULL && crc32list != NULL && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) { - logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) ); logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." ); logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" 
); // Image will be replaced below } else if ( existing->crc32 == NULL && crc32list != NULL ) { - logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid ); + logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) ); existing->crc32 = crc32list; existing->masterCrc32 = masterCrc; crc32list = NULL; @@ -869,7 +867,7 @@ static bool image_load(char *base, char *path, int withUplink) goto load_error; // Keep existing } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways - logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); + logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) ); function_return = true; goto load_error; // Keep existing } else { @@ -940,7 +938,7 @@ static bool image_load(char *base, char *path, int withUplink) image = image_free( image ); goto load_error; } - logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) ); function_return = true; // Clean exit: @@ -1790,7 +1788,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force) image_release( oldest ); // We did users++ above; image might have to be freed entirely return false; } - logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid ); + logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) ); char *filename = strdup( oldest->path ); // Copy name as we remove the image first oldest = image_remove( oldest ); // Remove from list first... 
oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed @@ -1825,10 +1823,8 @@ static void* closeUnusedFds(void* nix UNUSED) dnbd3_image_t * const image = _images[i]; if ( image == NULL || image->readFd == -1 ) continue; - // TODO: Also close for idle uplinks (uplink_connectionShouldShutdown) - // TODO: And close writeFd for idle uplinks.... if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) { - logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) ); fds[fdindex++] = image->readFd; image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock if ( fdindex == FDCOUNT ) @@ -1900,11 +1896,11 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) // it periodically, since we might read from a shared storage that // another server instance is writing to. if ( full || !cache->unchanged && !image->problem.read ) { - logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); if ( onDisk == NULL ) { // Should be complete now - logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid ); + logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) ); ref_setref( &image->ref_cacheMap, NULL ); } else { const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); @@ -1945,7 +1941,7 @@ static void saveCacheMap(dnbd3_image_t *image) if ( cache == NULL ) return; // Race - wasn't NULL in function call above... 
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); @@ -2015,7 +2011,7 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) memset( cache->map, val, byteSize ); mutex_lock( &image->lock ); if ( image->ref_cacheMap != NULL ) { - logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) ); free( cache ); } else { ref_setref( &image->ref_cacheMap, &cache->reference ); diff --git a/src/server/uplink.c b/src/server/uplink.c index 71d9f94..7c7cd1c 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -171,7 +171,7 @@ bool uplink_shutdown(dnbd3_image_t *image) image->users++; // Prevent free while uplink shuts down signal_call( uplink->signal ); } else { - logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid ); + logadd( LOG_ERROR, "This will never happen. 
'%s:%d'", PIMG(image) ); } cancelAllRequests( uplink ); ref_setref( &image->uplinkref, NULL ); @@ -201,7 +201,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink) static void uplink_free(ref *ref) { dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); - logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) ); assert( uplink->queueLen == 0 ); if ( uplink->signal != NULL ) { signal_close( uplink->signal ); @@ -572,7 +572,7 @@ static void* uplink_mainloop(void *data) } // Don't keep uplink established if we're idle for too much if ( uplink_connectionShouldShutdown( uplink ) ) { - logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) ); goto cleanup; } } @@ -915,11 +915,13 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) tryAgain = false; continue; // Write handle to image successfully re-opened, try again } - logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err ); + logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", + PIMG(uplink->image), err ); break; } if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) { - logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", + ret, PIMG(uplink->image) ); break; } done += (uint32_t)ret; @@ -929,7 +931,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) { logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.", - uplink->image->name, (int)uplink->image->rid, err ); + 
PIMG(uplink->image), err ); } } // 2) Figure out which clients are interested in it @@ -1098,8 +1100,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes ); lists_crc = net_order_32( lists_crc ); if ( lists_crc != masterCrc ) { - logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", - uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) ); free( buffer ); return; } @@ -1115,8 +1116,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) close( fd ); if ( (size_t)ret != sizeof(masterCrc) + bytes ) { unlink( path ); - logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", - uplink->image->name, (int)uplink->image->rid ); + logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) ); } } } -- cgit v1.2.3-55-g7522 From 290d3478f245bb7d2112bb781286a9fbae42b983 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 13 Mar 2020 16:03:29 +0100 Subject: [SERVER] Rewrite uplink queue handling - Now uses linked lists instead of huge array - Does prefetch data on client requests - Can have multiple replication requests in-flight --- src/server/globals.c | 6 + src/server/globals.h | 35 ++- src/server/image.c | 3 +- src/server/image.h | 44 +++ src/server/net.c | 44 +-- src/server/reference.h | 5 + src/server/uplink.c | 771 +++++++++++++++++++++++++++---------------------- src/server/uplink.h | 2 +- src/serverconfig.h | 3 +- 9 files changed, 518 insertions(+), 395 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.c b/src/server/globals.c index ac079b1..98e0ddb 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -19,6 +19,7 @@ atomic_int _clientPenalty = 0; atomic_bool _isProxy = false; atomic_int _backgroundReplication = BGR_FULL; atomic_int _bgrMinClients = 0; +atomic_int _bgrWindowSize = 1; atomic_bool 
_lookupMissingForProxy = true; atomic_bool _sparseFiles = false; atomic_bool _ignoreAllocErrors = false; @@ -74,6 +75,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key SAVE_TO_VAR_BOOL( dnbd3, isProxy ); SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly ); SAVE_TO_VAR_INT( dnbd3, bgrMinClients ); + SAVE_TO_VAR_INT( dnbd3, bgrWindowSize ); SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy ); SAVE_TO_VAR_BOOL( dnbd3, sparseFiles ); SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors ); @@ -134,6 +136,9 @@ void globals_loadConfig() logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" ); _sparseFiles = false; } + if ( _bgrWindowSize < 1 ) { + _bgrWindowSize = 1; + } // Dump config as interpreted char buffer[2000]; globals_dumpConfig( buffer, sizeof(buffer) ); @@ -325,6 +330,7 @@ size_t globals_dumpConfig(char *buffer, size_t size) PBOOL(backgroundReplication); } PINT(bgrMinClients); + PINT(bgrWindowSize); PBOOL(lookupMissingForProxy); PBOOL(sparseFiles); PBOOL(ignoreAllocErrors); diff --git a/src/server/globals.h b/src/server/globals.h index 1bb6857..5cee92a 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -18,18 +18,27 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; -typedef struct +typedef struct _dnbd3_queue_client { - uint64_t handle; // Client defined handle to pass back in reply - uint64_t from; // First byte offset of requested block (ie. 4096) - uint64_t to; // Last byte + 1 of requested block (ie. 
8192, if request len is 4096, resulting in bytes 4096-8191) + struct _dnbd3_queue_client *next; + uint64_t handle; // Handle used by client + uint64_t from, to; // Client range dnbd3_client_t * client; // Client to send reply to - int status; // status of this entry: ULR_* +} dnbd3_queue_client_t; + +typedef struct _dnbd3_queue_entry +{ + struct _dnbd3_queue_entry *next; + uint64_t handle; // Our handle for this entry + uint64_t from; // First byte offset of requested block (ie. 4096) + uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191) + dnbd3_queue_client_t *clients; #ifdef _DEBUG - ticks entered; // When this request entered the queue (for debugging) + ticks entered; // When this request entered the queue (for debugging) #endif - uint8_t hopCount; // How many hops this request has already taken across proxies -} dnbd3_queued_request_t; + uint8_t hopCount; // How many hops this request has already taken across proxies + bool sent; // Already sent to uplink? +} dnbd3_queue_entry_t; typedef struct _ns { @@ -91,12 +100,12 @@ struct _dnbd3_uplink bool cycleDetected; // connection cycle between proxies detected for current remote server int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" - uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) - dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; + dnbd3_queue_entry_t *queue; + atomic_uint_fast32_t queueId; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; }; @@ -156,6 +165,7 @@ struct _dnbd3_client atomic_uint_fast64_t bytesSent; // Byte counter for this client. dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake int sock; + _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server bool isServer; // true if a server in proxy mode, false if real client dnbd3_host_t host; char hostName[HOSTNAMELEN]; // inet_ntop version of host @@ -242,6 +252,11 @@ extern atomic_int _backgroundReplication; */ extern atomic_int _bgrMinClients; +/** + * How many in-flight replication requests we should target (per uplink) + */ +extern atomic_int _bgrWindowSize; + /** * (In proxy mode): If connecting client is a proxy, and the requested image * is not known locally, should we ask our known alt servers for it? diff --git a/src/server/image.c b/src/server/image.c index 86b6374..81ec479 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -340,7 +340,6 @@ dnbd3_image_t* image_byId(int imgId) dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen) { int i; - const char *removingText = _removeMissingImages ? ", removing from list" : ""; dnbd3_image_t *candidate = NULL; // Simple sanity check const size_t slen = strlen( name ); @@ -1895,7 +1894,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) // We're not replicating this image, if there's a cache map, reload // it periodically, since we might read from a shared storage that // another server instance is writing to. 
- if ( full || !cache->unchanged && !image->problem.read ) { + if ( full || ( !cache->unchanged && !image->problem.read ) ) { logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); if ( onDisk == NULL ) { diff --git a/src/server/image.h b/src/server/image.h index 4614c74..b23711b 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -51,6 +51,50 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force); bool image_saveCacheMap(dnbd3_image_t *image); +/** + * Check if given range is cached. Be careful when using this function because: + * 1) you need to hold a reference to the cache map + * 2) start and end are assumed to be 4k aligned + * 3) start and end are not checked to be in bounds (we don't know the image in this context) + */ +static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end) +{ + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); + const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); + uint64_t pos; + uint8_t b; + bool isCached; + if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler + b = cache->map[firstByteInMap]; + isCached = ( b & ( fb & lb ) ) == ( fb & lb ); + } else { + isCached = true; + atomic_thread_fence( memory_order_acquire ); + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + isCached = ( ( b & fb ) == fb ); + } + // Last byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + isCached = ( ( b & lb ) == lb ); + } + // Middle, must be all bits set (0xff) + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + 
isCached = false; + break; + } + } + } + } + return isCached; +} + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/net.c b/src/server/net.c index 954cb8a..9ba9dbc 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -197,6 +197,7 @@ void* net_handleNewConnection(void *clientPtr) client->hostName[HOSTNAMELEN-1] = '\0'; mutex_unlock( &client->lock ); client->bytesSent = 0; + client->relayedCount = 0; if ( !addToList( client ) ) { freeClientStruct( client ); @@ -344,41 +345,18 @@ void* net_handleNewConnection(void *clientPtr) // This is a proxyed image, check if we need to relay the request... const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); - const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); - uint64_t pos; - uint8_t b; - bool isCached; - if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler - b = cache->map[firstByteInMap]; - isCached = ( b & ( fb & lb ) ) == ( fb & lb ); - } else { - isCached = true; - atomic_thread_fence( memory_order_acquire ); - // First byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); - isCached = ( ( b & fb ) == fb ); - } - // Last byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); - isCached = ( ( b & lb ) == lb ); - } - // Middle, must be all bits set (0xff) - if ( isCached ) { - for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { - if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { - isCached = false; - break; - } + if ( 
!image_isRangeCachedUnsafe( cache, start, end ) ) { + if ( unlikely( client->relayedCount > 250 ) ) { + logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" ); + for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount > 250 ) { + logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" ); + goto exit_client_cleanup; } } - } - if ( !isCached ) { - if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { + if ( !uplink_request( NULL, client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d", client->hostName, image->name, image->rid ); goto exit_client_cleanup; diff --git a/src/server/reference.h b/src/server/reference.h index 4eda546..75a681f 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref ) return ref; } +static inline void ref_inc( ref *ref ) +{ + ++ref->count; +} + static inline void ref_put( ref *ref ) { if ( --ref->count == 0 ) { diff --git a/src/server/uplink.c b/src/server/uplink.c index 7c7cd1c..188bf06 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "threadpool.h" #include "reference.h" #include @@ -21,30 +22,6 @@ #define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE ) #define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) ) -#define REP_NONE ( (uint64_t)0xffffffffffffffff ) - -// Status of request in queue - -// Slot is free, can be used. -// Must only be set in uplink_handle_receive() or uplink_remove_client() -#define ULR_FREE 0 -// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. 
-// Must only be set in uplink_request() -#define ULR_NEW 1 -// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. -// Must only be set in uplink_mainloop() or uplink_request() -#define ULR_PENDING 2 -// Slot is being processed, do not consider for hop on. -// Must only be set in uplink_handle_receive() -#define ULR_PROCESSING 3 - -static const char *const NAMES_ULR[4] = { - [ULR_FREE] = "ULR_FREE", - [ULR_NEW] = "ULR_NEW", - [ULR_PENDING] = "ULR_PENDING", - [ULR_PROCESSING] = "ULR_PROCESSING", -}; - static atomic_uint_fast64_t totalBytesReceived = 0; static void cancelAllRequests(dnbd3_uplink_t *uplink); @@ -59,6 +36,15 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink); +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle); +static void *prefetchForClient(void *data); + +typedef struct { + dnbd3_uplink_t *uplink; + uint64_t start; + uint32_t length; +} prefetch_request_t; // ############ uplink connection handling @@ -106,6 +92,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->bytesReceived = 0; uplink->bytesReceivedLastSave = 0; uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90; + uplink->queue = NULL; uplink->queueLen = 0; uplink->cacheFd = -1; uplink->signal = signal_new(); @@ -113,7 +100,6 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." 
); goto failure; } - uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); mutex_lock( &uplink->sendMutex ); uplink->current.fd = -1; @@ -175,9 +161,9 @@ bool uplink_shutdown(dnbd3_image_t *image) } cancelAllRequests( uplink ); ref_setref( &image->uplinkref, NULL ); - ref_put( &uplink->reference ); mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); + ref_put( &uplink->reference ); mutex_unlock( &image->lock ); return retval; } @@ -188,12 +174,21 @@ bool uplink_shutdown(dnbd3_image_t *image) */ static void cancelAllRequests(dnbd3_uplink_t *uplink) { - for ( int i = 0; i < uplink->queueLen; ++i ) { - if ( uplink->queue[i].status != ULR_FREE ) { - net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle ); - uplink->queue[i].status = ULR_FREE; + dnbd3_queue_entry_t *it = uplink->queue; + while ( it != NULL ) { + dnbd3_queue_client_t *cit = it->clients; + while ( cit != NULL ) { + net_sendReply( cit->client, CMD_ERROR, cit->handle ); + cit->client->relayedCount--; + dnbd3_queue_client_t *next = cit->next; + free( cit ); + cit = next; } + dnbd3_queue_entry_t *next = it->next; + free( it ); + it = next; } + uplink->queue = NULL; uplink->queueLen = 0; uplink->image->problem.queue = false; } @@ -234,39 +229,54 @@ static void uplink_free(ref *ref) */ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) { + if ( client->relayedCount == 0 ) + return; mutex_lock( &uplink->queueLock ); - for (int i = uplink->queueLen - 1; i >= 0; --i) { - if ( uplink->queue[i].client == client ) { - // Make sure client doesn't get destroyed while we're sending it data - mutex_lock( &client->sendMutex ); - mutex_unlock( &client->sendMutex ); - uplink->queue[i].client = NULL; - uplink->queue[i].status = ULR_FREE; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) { + if ( (**cit).client == client ) { + 
--client->relayedCount; + dnbd3_queue_client_t *entry = *cit; + *cit = (**cit).next; + free( entry ); + } } - if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--; } mutex_unlock( &uplink->queueLock ); + if ( unlikely( client->relayedCount != 0 ) ) { + logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount ); + int i; + for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount != 0 ) { + logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount ); + } + } } /** - * Request a chunk of data through an uplink server - * Locks on: image.lock, uplink.queueLock + * Request a chunk of data through an uplink server. Either uplink or client has to be non-NULL. + * If client is NULL, this is assumed to be a background replication request. + * Locks on: uplink.queueLock, uplink.sendMutex */ -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) { - if ( client == NULL || client->image == NULL ) - return false; + bool getUplink = ( uplink == NULL ); + assert( client != NULL || uplink != NULL ); if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref ); - if ( unlikely( uplink == NULL ) ) { - uplink_init( client->image, -1, NULL, -1 ); + if ( getUplink ) { uplink = ref_get_uplink( &client->image->uplinkref ); - if ( uplink == NULL ) { - logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); - return false; + if ( unlikely( uplink == NULL ) ) { + uplink_init( client->image, -1, NULL, -1 ); + uplink = ref_get_uplink( &client->image->uplinkref ); 
+ if ( uplink == NULL ) { + logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); + return false; + } } } if ( uplink->shutdown ) { @@ -275,163 +285,179 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { + if ( client != NULL && hops != 0 + && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); goto fail_ref; } - int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise - int existingType = -1; // ULR_* type of existing request - int i; - int freeSlot = -1; - int firstUsedSlot = -1; - bool requestLoop = false; - const uint64_t end = start + length; - - mutex_lock( &uplink->queueLock ); - if ( uplink->shutdown ) { // Check again after locking to prevent lost requests - goto fail_lock; - } - for (i = 0; i < uplink->queueLen; ++i) { - // find free slot to place this request into - if ( uplink->queue[i].status == ULR_FREE ) { - if ( freeSlot == -1 || existingType != ULR_PROCESSING ) { - freeSlot = i; - } - continue; - } - if ( firstUsedSlot == -1 ) { - firstUsedSlot = i; - } - // find existing request to attach to - if ( uplink->queue[i].from > start || uplink->queue[i].to < end ) - continue; // Range not suitable - // Detect potential proxy cycle. 
New request hopcount is greater, range is same, old request has already been sent -> suspicious - if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) { - requestLoop = true; - break; - } - if ( foundExisting == -1 || existingType == ULR_PROCESSING ) { - foundExisting = i; - existingType = uplink->queue[i].status; - } - } - if ( unlikely( requestLoop ) ) { - uplink->cycleDetected = true; - signal_call( uplink->signal ); - logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - goto fail_lock; - } - if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { - freeSlot = -1; // Not attaching to existing request, make it use a higher slot - } - if ( freeSlot == -1 ) { - if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { - logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." ); + struct { + uint64_t handle, start, end; + } req; + do { + const uint64_t end = start + length; + dnbd3_queue_entry_t *request = NULL, *last = NULL; + bool isNew; + mutex_lock( &uplink->queueLock ); + if ( uplink->shutdown ) { // Check again after locking to prevent lost requests goto fail_lock; } - freeSlot = uplink->queueLen++; - if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) { - uplink->image->problem.queue = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->from <= start && it->to >= end ) { + // Matching range, attach + request = it; + break; + } + if ( it->next == NULL ) { + // Not matching, last in list, remember + last = it; + break; + } } - } - // Do not send request to uplink server if we have a matching pending request AND the request either has the - // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. 
Otherwise - // explicitly send this request to the uplink server. The second condition mentioned here is to prevent - // a race condition where the reply for the outstanding request already arrived and the uplink thread - // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might - // already have passed the index of the free slot we determined, but not reached the existing request we just found above. - if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) { - foundExisting = -1; // -1 means "send request" - } -#ifdef _DEBUG - if ( foundExisting != -1 ) { - logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot ); - logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n" - "New %" PRIu64 "-%" PRIu64 " (%p)\n", - uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client, - start, end, (void*)client ); - } -#endif - // Fill structure - uplink->queue[freeSlot].from = start; - uplink->queue[freeSlot].to = end; - uplink->queue[freeSlot].handle = handle; - uplink->queue[freeSlot].client = client; - //int old = uplink->queue[freeSlot].status; - uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW : - ( existingType == ULR_NEW ? ULR_PENDING : existingType ) ); - uplink->queue[freeSlot].hopCount = hops; + dnbd3_queue_client_t **c; + if ( request == NULL ) { + // No existing request to attach to + if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) { + logadd( LOG_WARNING, "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." 
); + goto fail_lock; + } + uplink->queueLen++; + if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = true; + } + request = malloc( sizeof(*request) ); + if ( last == NULL ) { + uplink->queue = request; + } else { + last->next = request; + } + request->next = NULL; + request->handle = ++uplink->queueId; + request->from = start & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + request->to = (end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); #ifdef _DEBUG - timing_get( &uplink->queue[freeSlot].entered ); - //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end ); + timing_get( &request->entered ); #endif - mutex_unlock( &uplink->queueLock ); + request->hopCount = hops; + request->sent = true; // Optimistic; would be set to false on failure + if ( client == NULL ) { + // BGR + request->clients = NULL; + } else { + c = &request->clients; + } + isNew = true; + } else if ( client == NULL ) { + // Replication request that maches existing request. Do nothing + isNew = false; + } else { + // Existing request. 
Check if potential cycle + if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) { + logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) ); + goto fail_lock; + } + // Count number if clients, get tail of list + int count = 0; + c = &request->clients; + while ( *c != NULL ) { + c = &(**c).next; + if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) { + logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count ); + goto fail_lock; + } + } + isNew = false; + } + req.handle = request->handle; + req.start = request->from; + req.end = request->to; + if ( client != NULL ) { + *c = malloc( sizeof( *request->clients ) ); + (**c).next = NULL; + (**c).handle = handle; + (**c).from = start; + (**c).to = end; + (**c).client = client; + client->relayedCount++; + } + mutex_unlock( &uplink->queueLock ); - if ( foundExisting != -1 ) { - ref_put( &uplink->reference ); - return true; // Attached to pending request, do nothing - } + if ( !isNew ) { + goto success_ref; // Attached to pending request, do nothing + } + } while (0); - // See if we can fire away the request - if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) { - logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); + // Fire away the request + mutex_lock( &uplink->sendMutex ); + if ( unlikely( uplink->current.fd == -1 ) ) { + uplink->image->problem.uplink = true; + markRequestUnsent( uplink, req.handle ); + mutex_unlock( &uplink->sendMutex ); + logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - if ( unlikely( uplink->current.fd == -1 ) ) { + if ( hops < 200 ) ++hops; + const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, + req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); + if ( unlikely( !ret ) ) { + markRequestUnsent( uplink, req.handle ); uplink->image->problem.uplink = true; mutex_unlock( &uplink->sendMutex ); - 
logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); + logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle ); } else { - const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - if ( hops < 200 ) ++hops; - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( unlikely( !ret ) ) { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); - } else { - // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again - int state; - mutex_unlock( &uplink->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { - state = uplink->queue[freeSlot].status; - if ( uplink->queue[freeSlot].status == ULR_NEW ) { - uplink->queue[freeSlot].status = ULR_PENDING; - } - } else { - state = -1; - } - mutex_unlock( &uplink->queueLock ); - if ( state == -1 ) { - logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. 
*shrug*" ); - } else if ( state == ULR_NEW ) { - //logadd( LOG_DEBUG2, "Direct uplink request" ); - } else { - logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); - } - ref_put( &uplink->reference ); - return true; - } - // Fall through to waking up sender thread + // OK + mutex_unlock( &uplink->sendMutex ); + goto success_ref; } + // Fall through to waking up sender thread } if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } - ref_put( &uplink->reference ); + +success_ref: + if ( client != NULL ) { + // Was from client -- potential prefetch + uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); + if ( len > 0 ) { + prefetch_request_t *job = malloc( sizeof( *job ) ); + job->start = req.end; + job->length = len; + job->uplink = uplink; + ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it + threadpool_run( &prefetchForClient, (void*)job ); + } + } + if ( getUplink ) { + ref_put( &uplink->reference ); + } return true; fail_lock: mutex_unlock( &uplink->queueLock ); fail_ref: - ref_put( &uplink->reference ); + if ( getUplink ) { + ref_put( &uplink->reference ); + } return false; } +static void *prefetchForClient(void *data) +{ + prefetch_request_t *job = (prefetch_request_t*)data; + dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image ); + if ( cache != NULL ) { + if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) { + uplink_request( job->uplink, NULL, ++job->uplink->queueId, job->start, job->length, 0 ); + } + ref_put( &cache->reference ); + } + ref_put( &job->uplink->reference ); + free( job ); + return NULL; +} + /** * Uplink thread. 
* Locks are irrelevant as this is never called from another function @@ -443,7 +469,7 @@ static void* uplink_mainloop(void *data) #define EV_COUNT (2) struct pollfd events[EV_COUNT]; dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; - int numSocks, i, waitTime; + int numSocks, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; @@ -478,7 +504,7 @@ static void* uplink_mainloop(void *data) declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); if ( waitTime < 100 ) waitTime = 100; - if ( waitTime > 10000 ) waitTime = 10000; + else if ( waitTime > 10000 ) waitTime = 10000; } events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); @@ -505,7 +531,6 @@ static void* uplink_mainloop(void *data) mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); - uplink->replicationHandle = REP_NONE; uplink->image->problem.uplink = false; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; @@ -559,11 +584,11 @@ static void* uplink_mainloop(void *data) } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); - if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { + if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { lastKeepalive = now; uplink->idleTime += timepassed; // Keep-alive - if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { + if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) { // Send keep-alive if nothing is happening, and try to trigger background rep. 
if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { uplink_connectionFailed( uplink, true ); @@ -612,19 +637,16 @@ static void* uplink_mainloop(void *data) ticks deadline; timing_set( &deadline, &now, -10 ); mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { - snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name, - uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); - uplink->queue[i].entered = now; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( timing_reached( &it->entered, &deadline ) ) { + logadd( LOG_WARNING, "Starving request detected:" + " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d", + it->from, it->to, (int)it->sent, PIMG(uplink->image) ); + it->entered = now; #ifdef _DEBUG_RESEND_STARVING - uplink->queue[i].status = ULR_NEW; + it->sent = false; resend = true; #endif - mutex_unlock( &uplink->queueLock ); - logadd( LOG_WARNING, "%s", buffer ); - mutex_lock( &uplink->queueLock ); } } mutex_unlock( &uplink->queueLock ); @@ -667,37 +689,54 @@ cleanup: ; */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { - // Scan for new requests - int j; + // Scan for new requests, or optionally, (re)send all + // Build a buffer, so if there aren't too many requests, we can send them after + // unlocking the queue again. Otherwise we need flushes during iteration, which + // is no ideal, but in that case the uplink is probably overwhelmed anyways. + // Try 125 as that's exactly 300bytes, usually 2*MTU. 
+#define MAX_RESEND_BATCH 125 + dnbd3_request_t reqs[MAX_RESEND_BATCH]; + int count = 0; mutex_lock( &uplink->queueLock ); - for (j = 0; j < uplink->queueLen; ++j) { - if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue; - uplink->queue[j].status = ULR_PENDING; - uint8_t hops = uplink->queue[j].hopCount; - const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - /* - logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); - */ - mutex_unlock( &uplink->queueLock ); - if ( hops < 200 ) ++hops; - mutex_lock( &uplink->sendMutex ); - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( likely( ret ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - // Non-critical - if the connection dropped or the server was changed - // the thread will re-send this request as soon as the connection - // is reestablished. 
- uplink->image->problem.uplink = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( newOnly && it->sent ) + continue; + it->sent = true; + dnbd3_request_t *hdr = &reqs[count++]; + hdr->magic = dnbd3_packet_magic; + hdr->cmd = CMD_GET_BLOCK; + hdr->size = it->to - it->from; + hdr->offset_small = it->from; + hdr->hops = it->hopCount + 1; + hdr->handle = it->handle; + fixup_request( *hdr ); + if ( count == MAX_RESEND_BATCH ) { + bool ok = false; + logadd( LOG_DEBUG2, "BLOCKING resend of %d", count ); + count = 0; + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 ) + == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH ); + } mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - return; + if ( !ok ) { + uplink->image->problem.uplink = true; + break; + } } - mutex_lock( &uplink->queueLock ); } mutex_unlock( &uplink->queueLock ); + if ( count != 0 ) { + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + uplink->image->problem.uplink = + ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 ) + != DNBD3_REQUEST_SIZE * count ); + } + mutex_unlock( &uplink->sendMutex ); + } +#undef MAX_RESEND_BATCH } /** @@ -720,71 +759,73 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) return false; // Should never be called in this state, consider send error if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return true; // Don't do background replication - if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return true; // Already a replication request on the wire, or no more blocks to replicate + if ( uplink->nextReplicationIndex == -1 ) + return true; // No more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->users < _bgrMinClients ) return true; // 
Not enough active users + const int numNewRequests = numWantedReplicationRequests( uplink ); + if ( numNewRequests <= 0 ) + return true; // Already sufficient amount of requests on the wire dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL || image->users ) { + if ( cache == NULL ) { // No cache map (=image complete) - ref_put( &cache->reference ); return true; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); const int lastBlockIndex = mapBytes - 1; - int endByte; - if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks - endByte = uplink->nextReplicationIndex + mapBytes; - } else { // Hashblock based: Only look for match in current hash block - endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; - if ( endByte > mapBytes ) { - endByte = mapBytes; + for ( int bc = 0; bc < numNewRequests; ++bc ) { + int endByte; + if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks + endByte = uplink->nextReplicationIndex + mapBytes; + } else { // Hashblock based: Only look for match in current hash block + endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; + if ( endByte > mapBytes ) { + endByte = mapBytes; + } } - } - atomic_thread_fence( memory_order_acquire ); - int replicationIndex = -1; - for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { - const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff - && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { - // Found incomplete one - replicationIndex = i; + atomic_thread_fence( memory_order_acquire ); + int replicationIndex = -1; + for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { + const int i = j % ( mapBytes ); // Wrap around for BGR_FULL + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != 
lastBlockIndex || !uplink->replicatedLastBlock ) ) { + // Found incomplete one + replicationIndex = i; + break; + } + } + if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { + // Nothing left in current block, find next one + replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); + } + if ( replicationIndex == -1 ) { + // Replication might be complete, uplink_mainloop should take care.... + uplink->nextReplicationIndex = -1; break; } + const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; + const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); + const uint64_t handle = ++uplink->queueId; + if ( !uplink_request( uplink, NULL, handle, offset, size, 0 ) ) { + logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)", + PIMG(uplink->image) ); + ref_put( &cache->reference ); + return false; + } + if ( replicationIndex == lastBlockIndex ) { + uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + } + uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + if ( _backgroundReplication == BGR_HASHBLOCK + && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + // Just crossed a hash block boundary, look for new candidate starting at this very index + uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); + if ( uplink->nextReplicationIndex == -1 ) + break; + } } ref_put( &cache->reference ); - if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { - // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); - } - if ( replicationIndex == -1 ) { - // Replication might be complete, uplink_mainloop should take care.... 
- uplink->nextReplicationIndex = -1; - return true; - } - const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; - uplink->replicationHandle = offset; - const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); - mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); - if ( likely( sendOk ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); - return false; - } - if ( replicationIndex == lastBlockIndex ) { - uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks - } - uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter - if ( _backgroundReplication == BGR_HASHBLOCK - && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { - // Just crossed a hash block boundary, look for new candidate starting at this very index - uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); - } return true; } @@ -845,7 +886,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; - int ret, i; + int ret; for (;;) { ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; @@ -881,13 +922,34 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } // Payload read completely // Bail out if we're not interested - if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue; + if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) + 
continue; // Is a legit block reply - struct iovec iov[2]; - const uint64_t start = inReply.handle; - const uint64_t end = inReply.handle + inReply.size; totalBytesReceived += inReply.size; uplink->bytesReceived += inReply.size; + // Get entry from queue + dnbd3_queue_entry_t *entry; + mutex_lock( &uplink->queueLock ); + for ( entry = uplink->queue; entry != NULL; entry = entry->next ) { + if ( entry->handle == inReply.handle ) + break; + } + if ( entry == NULL ) { + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)", + inReply.handle, PIMG(uplink->image) ); + continue; + } + const uint64_t start = entry->from; + const uint64_t end = entry->to; + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + // We don't remove the entry from the list here yet, to slightly increase the chance of other + // clients attaching to this request while we write the data to disk + if ( end - start != inReply.size ) { + logadd( LOG_WARNING, "Received payload length does not match! 
(is: %"PRIu32", expect: %u, %s:%d)", + inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) ); + } + struct iovec iov[2]; // 1) Write to cache file if ( unlikely( uplink->cacheFd == -1 ) ) { uplink_reopenCacheFd( uplink, false ); @@ -934,98 +996,76 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) PIMG(uplink->image), err ); } } - // 2) Figure out which clients are interested in it - // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop - // below; this prevents uplink_request() from attaching to this request - // by populating a slot with index greater than the highest matching - // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW - // where it's fine if the index is greater) + bool found = false; + dnbd3_queue_entry_t **it; mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - assert( req->status != ULR_PROCESSING ); - if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue; - assert( req->client != NULL ); - if ( req->from >= start && req->to <= end ) { // Match :-) - req->status = ULR_PROCESSING; - } - } - // 3) Send to interested clients - iterate backwards so request collaboration works, and - // so we can decrease queueLen on the fly while iterating. 
Should you ever change this to start - // from 0, you also need to change the "attach to existing request"-logic in uplink_request() - outReply.magic = dnbd3_packet_magic; - bool served = false; - for ( i = uplink->queueLen - 1; i >= 0; --i ) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - if ( req->status == ULR_PROCESSING ) { - size_t bytesSent = 0; - assert( req->from >= start && req->to <= end ); - dnbd3_client_t * const client = req->client; - outReply.cmd = CMD_GET_BLOCK; - outReply.handle = req->handle; - outReply.size = (uint32_t)( req->to - req->from ); - iov[0].iov_base = &outReply; - iov[0].iov_len = sizeof outReply; - iov[1].iov_base = uplink->recvBuffer + (req->from - start); - iov[1].iov_len = outReply.size; - fixup_reply( outReply ); - req->status = ULR_FREE; - req->client = NULL; - served = true; - mutex_lock( &client->sendMutex ); - mutex_unlock( &uplink->queueLock ); - if ( client->sock != -1 ) { - ssize_t sent = writev( client->sock, iov, 2 ); - if ( sent > (ssize_t)sizeof outReply ) { - bytesSent = (size_t)sent - sizeof outReply; - } - } - if ( bytesSent != 0 ) { - client->bytesSent += bytesSent; - } - mutex_unlock( &client->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( i > uplink->queueLen ) { - i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests - } + for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) { + if ( *it == entry && entry->handle == inReply.handle ) { // ABA check + assert( found == false ); + *it = (**it).next; + found = true; + uplink->queueLen--; + break; } - if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) { uplink->image->problem.queue = false; } mutex_unlock( &uplink->queueLock ); -#ifdef _DEBUG - if ( !served && start != uplink->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end ); + if ( !found ) 
{ + logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)", + PIMG(uplink->image) ); + continue; } -#endif - if ( start == uplink->replicationHandle ) { - // Was our background replication - uplink->replicationHandle = REP_NONE; - // Try to remove from fs cache if no client was interested in this data - if ( !served && uplink->cacheFd != -1 ) { - posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + outReply.magic = dnbd3_packet_magic; + dnbd3_queue_client_t *next; + for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) { + size_t bytesSent = 0; + assert( c->from >= start && c->to <= end ); + dnbd3_client_t * const client = c->client; + outReply.cmd = CMD_GET_BLOCK; + outReply.handle = c->handle; + outReply.size = (uint32_t)( c->to - c->from ); + iov[0].iov_base = &outReply; + iov[0].iov_len = sizeof outReply; + iov[1].iov_base = uplink->recvBuffer + (c->from - start); + iov[1].iov_len = outReply.size; + fixup_reply( outReply ); + mutex_lock( &client->sendMutex ); + if ( client->sock != -1 ) { + ssize_t sent = writev( client->sock, iov, 2 ); + if ( sent > (ssize_t)sizeof outReply ) { + bytesSent = (size_t)sent - sizeof outReply; + } + if ( bytesSent != 0 ) { + client->bytesSent += bytesSent; + } } + mutex_unlock( &client->sendMutex ); + client->relayedCount--; + next = c->next; + free( c ); } - if ( served ) { + if ( entry->clients != NULL ) { // Was some client -- reset idle counter uplink->idleTime = 0; // Re-enable replication if disabled if ( uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; } + } else { + if ( uplink->cacheFd != -1 ) { + // Try to remove from fs cache if no client was interested in this data + posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + } } + free( entry ); + } // main receive loop + // Trigger background replication if applicable + if ( 
!uplink_sendReplicationRequest( uplink ) ) { + goto error_cleanup; } - if ( uplink->replicationHandle == REP_NONE ) { - mutex_lock( &uplink->queueLock ); - const bool rep = ( uplink->queueLen == 0 ); - mutex_unlock( &uplink->queueLock ); - if ( rep ) { - if ( !uplink_sendReplicationRequest( uplink ) ) - goto error_cleanup; - } - } + // Normal end return; // Error handling from failed receive or message parsing error_cleanup: ; @@ -1046,7 +1086,6 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) close( uplink->current.fd ); uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); - uplink->replicationHandle = REP_NONE; if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = 0; } @@ -1156,3 +1195,39 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) return false; return altservers_toString( current, buffer, len ); } + +/** + * Get number of replication requests that should be sent right now to + * meet the configured bgrWindowSize. 
Returns 0 if any client requests + * are pending + */ +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink) +{ + int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 ); + if ( uplink->queueLen == 0 ) + return ret; + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->clients == NULL ) { + ret--; + } else { + ret = 0; // Do not allow BGR if client requests are being handled + break; + } + } + mutex_unlock( &uplink->queueLock ); + return ret; +} + +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle) +{ + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->handle == handle ) { + it->sent = false; + break; + } + } + mutex_unlock( &uplink->queueLock ); +} + diff --git a/src/server/uplink.h b/src/server/uplink.h index 49ff0b4..8f69b05 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -12,7 +12,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops); bool uplink_shutdown(dnbd3_image_t *image); diff --git a/src/serverconfig.h b/src/serverconfig.h index 5c7301d..31708de 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -13,7 +13,8 @@ #define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times #define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored -#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink +#define UPLINK_MAX_QUEUE 500 // 
Maximum number of queued requests per uplink +#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -- cgit v1.2.3-55-g7522 From 03a2ac45f217793f532af16fd75a163e42e6f18d Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 13 Mar 2020 22:28:11 +0100 Subject: [SERVER] Check and increase hopCount when adding uplink request --- src/server/uplink.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index 188bf06..e7bbe70 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -264,6 +264,10 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han { bool getUplink = ( uplink == NULL ); assert( client != NULL || uplink != NULL ); + if ( hops++ > 200 ) { // This is just silly + logadd( LOG_WARNING, "Refusing to relay a request that has > 200 hops" ); + return false; + } if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; @@ -285,7 +289,7 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( client != NULL && hops != 0 + if ( client != NULL && hops > 1 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); @@ -354,7 +358,7 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han isNew = false; } else { // Existing request. 
Check if potential cycle - if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) { + if ( hops > request->hopCount && request->from == start && request->to == end ) { logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) ); goto fail_lock; } @@ -397,7 +401,6 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - if ( hops < 200 ) ++hops; const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); if ( unlikely( !ret ) ) { @@ -707,7 +710,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) hdr->cmd = CMD_GET_BLOCK; hdr->size = it->to - it->from; hdr->offset_small = it->from; - hdr->hops = it->hopCount + 1; + hdr->hops = it->hopCount; hdr->handle = it->handle; fixup_request( *hdr ); if ( count == MAX_RESEND_BATCH ) { -- cgit v1.2.3-55-g7522 From 8e0115f6c9ffbf9d9773f8c625c5e353c4b38583 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 13 Mar 2020 22:40:45 +0100 Subject: [SERVER] Check server version before setting hopCount field --- src/server/uplink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index e7bbe70..b01df58 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -710,7 +710,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) hdr->cmd = CMD_GET_BLOCK; hdr->size = it->to - it->from; hdr->offset_small = it->from; - hdr->hops = it->hopCount; + hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount ); hdr->handle = it->handle; fixup_request( *hdr ); if ( count == MAX_RESEND_BATCH ) { -- cgit v1.2.3-55-g7522 From eddfdc8482b8d28c263d1b1f85e6d5e4badc49ed Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: 
Sat, 14 Mar 2020 15:09:11 +0100 Subject: [SERVER] Use image:rid in log messages --- src/server/uplink.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index b01df58..efe7fa0 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -895,19 +895,19 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; if ( ret == REPLY_AGAIN ) break; if ( unlikely( ret == REPLY_CLOSED ) ) { - logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path ); + logadd( LOG_INFO, "Uplink: Remote host hung up (%s:%d)", PIMG(uplink->image) ); goto error_cleanup; } if ( unlikely( ret == REPLY_WRONGMAGIC ) ) { - logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path ); + logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s:%d)", PIMG(uplink->image) ); goto error_cleanup; } if ( unlikely( ret != REPLY_OK ) ) { - logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path ); + logadd( LOG_INFO, "Uplink: Connection error %d (%s:%d)", ret, PIMG(uplink->image) ); goto error_cleanup; } if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) { - logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path ); + logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s:%d", inReply.size, PIMG(uplink->image) ); goto error_cleanup; } @@ -920,7 +920,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } } if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) { - logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path ); + logadd( LOG_INFO, "Lost connection to uplink server of %s:%d 
(payload)", PIMG(uplink->image) ); goto error_cleanup; } // Payload read completely -- cgit v1.2.3-55-g7522 From 023145f531c54bdfa9e329a5caf38a3061dc42c5 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sat, 14 Mar 2020 15:41:50 +0100 Subject: [SERVER] Add comments, assert for uplink thread --- src/server/uplink.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index efe7fa0..df2f082 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -46,6 +46,8 @@ typedef struct { uint32_t length; } prefetch_request_t; +#define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) ) + // ############ uplink connection handling void uplink_globalsInit() @@ -692,6 +694,7 @@ cleanup: ; */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { + assert_uplink_thread(); // Scan for new requests, or optionally, (re)send all // Build a buffer, so if there aren't too many requests, we can send them after // unlocking the queue again. 
Otherwise we need flushes during iteration, which @@ -758,6 +761,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) */ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) { + assert_uplink_thread(); if ( uplink->current.fd == -1 ) return false; // Should never be called in this state, consider send error if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) @@ -890,6 +894,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; int ret; + assert_uplink_thread(); for (;;) { ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; @@ -1023,7 +1028,6 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) outReply.magic = dnbd3_packet_magic; dnbd3_queue_client_t *next; for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) { - size_t bytesSent = 0; assert( c->from >= start && c->to <= end ); dnbd3_client_t * const client = c->client; outReply.cmd = CMD_GET_BLOCK; @@ -1038,10 +1042,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) if ( client->sock != -1 ) { ssize_t sent = writev( client->sock, iov, 2 ); if ( sent > (ssize_t)sizeof outReply ) { - bytesSent = (size_t)sent - sizeof outReply; - } - if ( bytesSent != 0 ) { - client->bytesSent += bytesSent; + client->bytesSent += (size_t)sent - sizeof outReply; } } mutex_unlock( &client->sendMutex ); @@ -1080,6 +1081,7 @@ error_cleanup: ; */ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { + assert_uplink_thread(); if ( uplink->current.fd == -1 ) return; setThreadName( "panic-uplink" ); @@ -1109,6 +1111,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink) { static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) }; + assert_uplink_thread(); 
mutex_lock( &uplink->sendMutex ); bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); mutex_unlock( &uplink->sendMutex ); @@ -1182,6 +1185,12 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) return uplink->cacheFd != -1; } +/** + * Returns true if the uplink has been idle for some time (apart from + * background replication, if it is set to hashblock, or if it has + * a minimum number of active clients configured that is not currently + * reached) + */ static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT @@ -1202,7 +1211,12 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) /** * Get number of replication requests that should be sent right now to * meet the configured bgrWindowSize. Returns 0 if any client requests - * are pending + * are pending. + * This applies a sort of "slow start" in case the uplink was recently + * dealing with actual client requests, in that the uplink's idle time + * (in seconds) is an upper bound for the number returned, so we don't + * saturate the uplink with loads of requests right away, in case that + * client triggers more requests to the uplink server. 
*/ static int numWantedReplicationRequests(dnbd3_uplink_t *uplink) { -- cgit v1.2.3-55-g7522 From 3680e4819cbd7edbe632372e69533d254f1ae2c2 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Sat, 14 Mar 2020 15:51:52 +0100 Subject: [SERVER] Remove uplink_ prefix from static (private) functions --- src/server/uplink.c | 80 ++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index df2f082..d6b319b 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -25,17 +25,17 @@ static atomic_uint_fast64_t totalBytesReceived = 0; static void cancelAllRequests(dnbd3_uplink_t *uplink); -static void uplink_free(ref *ref); +static void freeUplinkStruct(ref *ref); static void* uplink_mainloop(void *data); -static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly); -static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); -static void uplink_handleReceive(dnbd3_uplink_t *uplink); -static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); -static void uplink_addCrc32(dnbd3_uplink_t *uplink); -static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); -static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); -static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); -static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); +static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly); +static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex); +static void handleReceive(dnbd3_uplink_t *uplink); +static bool sendKeepalive(dnbd3_uplink_t *uplink); +static void requestCrc32List(dnbd3_uplink_t *uplink); +static bool sendReplicationRequest(dnbd3_uplink_t *uplink); +static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); +static bool connectionShouldShutdown(dnbd3_uplink_t *uplink); 
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew); static int numWantedReplicationRequests(dnbd3_uplink_t *uplink); static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle); static void *prefetchForClient(void *data); @@ -86,7 +86,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version } uplink = calloc( 1, sizeof(dnbd3_uplink_t) ); // Start with one reference for the uplink thread. We'll return it when the thread finishes - ref_init( &uplink->reference, uplink_free, 1 ); + ref_init( &uplink->reference, freeUplinkStruct, 1 ); mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE ); mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT ); mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); @@ -132,7 +132,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version return true; failure: ; if ( uplink != NULL ) { - image->users++; // Expected by uplink_free() + image->users++; // Expected by freeUplinkStruct() ref_put( &uplink->reference ); // The ref for the uplink thread that never was } mutex_unlock( &image->lock ); @@ -195,7 +195,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink) uplink->image->problem.queue = false; } -static void uplink_free(ref *ref) +static void freeUplinkStruct(ref *ref) { dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) ); @@ -489,7 +489,7 @@ static void* uplink_mainloop(void *data) thread_detach( uplink->thread ); blockNoncriticalSignals(); // Make sure file is open for writing - if ( !uplink_reopenCacheFd( uplink, false ) ) { + if ( !reopenCacheFd( uplink, false ) ) { // It might have failed - still offer proxy mode, we just can't cache logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno ); } @@ -545,16 +545,16 @@ static void* uplink_mainloop(void *data) } // If we don't have a 
crc32 list yet, see if the new server has one if ( uplink->image->crc32 == NULL ) { - uplink_addCrc32( uplink ); + requestCrc32List( uplink ); } // Re-send all pending requests - uplink_sendRequests( uplink, false ); - uplink_sendReplicationRequest( uplink ); + sendQueuedRequests( uplink, false ); + sendReplicationRequest( uplink ); events[EV_SOCKET].events = POLLIN | POLLRDHUP; if ( uplink->image->problem.uplink ) { // Some of the requests above must have failed again already :-( logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" ); - uplink_connectionFailed( uplink, true ); + connectionFailed( uplink, true ); } timing_gets( &nextAltCheck, altCheckInterval ); // The rtt worker already did the handshake for our image, so there's nothing @@ -573,18 +573,18 @@ static void* uplink_mainloop(void *data) } if ( uplink->current.fd != -1 ) { // Uplink seems fine, relay requests to it... - uplink_sendRequests( uplink, true ); + sendQueuedRequests( uplink, true ); } else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long uplink->idleTime = 0; } } // Uplink socket if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) { - uplink_connectionFailed( uplink, true ); + connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents ); setThreadName( "panic-uplink" ); } else if ( (events[EV_SOCKET].revents & POLLIN) ) { - uplink_handleReceive( uplink ); + handleReceive( uplink ); if ( _shutdown || uplink->shutdown ) goto cleanup; } declare_now; @@ -595,13 +595,13 @@ static void* uplink_mainloop(void *data) // Keep-alive if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) { // Send keep-alive if nothing is happening, and try to trigger background rep. 
- if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { - uplink_connectionFailed( uplink, true ); + if ( !sendKeepalive( uplink ) || !sendReplicationRequest( uplink ) ) { + connectionFailed( uplink, true ); logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" ); } } // Don't keep uplink established if we're idle for too much - if ( uplink_connectionShouldShutdown( uplink ) ) { + if ( connectionShouldShutdown( uplink ) ) { logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) ); goto cleanup; } @@ -656,7 +656,7 @@ static void* uplink_mainloop(void *data) } mutex_unlock( &uplink->queueLock ); if ( resend ) { - uplink_sendRequests( uplink, true ); + sendQueuedRequests( uplink, true ); } } #endif @@ -692,7 +692,7 @@ cleanup: ; /** * Only called from uplink thread. */ -static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) +static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly) { assert_uplink_thread(); // Scan for new requests, or optionally, (re)send all @@ -759,7 +759,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) * * @return false if sending request failed, true otherwise (i.e. not necessary/disabled) */ -static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) +static bool sendReplicationRequest(dnbd3_uplink_t *uplink) { assert_uplink_thread(); if ( uplink->current.fd == -1 ) @@ -804,7 +804,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) } if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); + replicationIndex = findNextIncompleteHashBlock( uplink, endByte ); } if ( replicationIndex == -1 ) { // Replication might be complete, uplink_mainloop should take care.... 
@@ -827,7 +827,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( _backgroundReplication == BGR_HASHBLOCK && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { // Just crossed a hash block boundary, look for new candidate starting at this very index - uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); + uplink->nextReplicationIndex = findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); if ( uplink->nextReplicationIndex == -1 ) break; } @@ -841,7 +841,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) * of a hash block which is neither completely empty nor completely * replicated yet. Returns -1 if no match. */ -static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) +static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image ); @@ -890,7 +890,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int * Locks on: uplink.lock, images[].lock * Only called from uplink thread, so current.fd is assumed to be valid. 
*/ -static void uplink_handleReceive(dnbd3_uplink_t *uplink) +static void handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; int ret; @@ -960,7 +960,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) struct iovec iov[2]; // 1) Write to cache file if ( unlikely( uplink->cacheFd == -1 ) ) { - uplink_reopenCacheFd( uplink, false ); + reopenCacheFd( uplink, false ); } if ( likely( uplink->cacheFd != -1 ) ) { int err = 0; @@ -980,7 +980,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } if ( err == EBADF || err == EINVAL || err == EIO ) { uplink->image->problem.write = true; - if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) ) + if ( !tryAgain || !reopenCacheFd( uplink, true ) ) break; tryAgain = false; continue; // Write handle to image successfully re-opened, try again @@ -1066,20 +1066,20 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) free( entry ); } // main receive loop // Trigger background replication if applicable - if ( !uplink_sendReplicationRequest( uplink ) ) { + if ( !sendReplicationRequest( uplink ) ) { goto error_cleanup; } // Normal end return; // Error handling from failed receive or message parsing error_cleanup: ; - uplink_connectionFailed( uplink, true ); + connectionFailed( uplink, true ); } /** * Only call from uplink thread */ -static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) +static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew) { assert_uplink_thread(); if ( uplink->current.fd == -1 ) @@ -1108,7 +1108,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) * Send keep alive request to server. * Called from uplink thread, current.fd must be valid. 
*/ -static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink) +static bool sendKeepalive(dnbd3_uplink_t *uplink) { static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) }; assert_uplink_thread(); @@ -1124,7 +1124,7 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink) * FIXME This is broken as it could happen that another message arrives after sending * the request. Refactor, split and move receive into general receive handler. */ -static void uplink_addCrc32(dnbd3_uplink_t *uplink) +static void requestCrc32List(dnbd3_uplink_t *uplink) { dnbd3_image_t *image = uplink->image; if ( image == NULL || image->virtualFilesize == 0 ) return; @@ -1174,7 +1174,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink) * it will be closed first. Otherwise, nothing will happen and true will be returned * immediately. */ -static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) +static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) { if ( uplink->cacheFd != -1 ) { if ( !force ) return true; @@ -1191,7 +1191,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) * a minimum number of active clients configured that is not currently * reached) */ -static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) +static bool connectionShouldShutdown(dnbd3_uplink_t *uplink) { return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) ); -- cgit v1.2.3-55-g7522 From a2cbfba828bd8fcd5803d9786a3b3050823b27fc Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 17 Mar 2020 13:00:39 +0100 Subject: [SERVER] Don't prefetch across hash blocks in BGR_HASHBLOCK mode --- src/server/uplink.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index d6b319b..9bf48d3 100644 ---
a/src/server/uplink.c +++ b/src/server/uplink.c @@ -44,7 +44,7 @@ typedef struct { dnbd3_uplink_t *uplink; uint64_t start; uint32_t length; -} prefetch_request_t; +} prefetch_job_t; #define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) ) @@ -425,9 +425,12 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han success_ref: if ( client != NULL ) { // Was from client -- potential prefetch + // Same size as this request, but consider end of image... uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); - if ( len > 0 ) { - prefetch_request_t *job = malloc( sizeof( *job ) ); + // Also don't prefetch if we cross a hash block border and BGR mode == hashblock + if ( len > 0 && ( _backgroundReplication != BGR_HASHBLOCK + || req.start % HASH_BLOCK_SIZE == (req.end-1) % HASH_BLOCK_SIZE ) ) { + prefetch_job_t *job = malloc( sizeof( *job ) ); job->start = req.end; job->length = len; job->uplink = uplink; @@ -450,7 +453,7 @@ fail_ref: static void *prefetchForClient(void *data) { - prefetch_request_t *job = (prefetch_request_t*)data; + prefetch_job_t *job = (prefetch_job_t*)data; dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image ); if ( cache != NULL ) { if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) { @@ -458,7 +461,7 @@ static void *prefetchForClient(void *data) } ref_put( &cache->reference ); } - ref_put( &job->uplink->reference ); + ref_put( &job->uplink->reference ); // Acquired in uplink_request free( job ); return NULL; } -- cgit v1.2.3-55-g7522 From 79d36aa260f49716ede72cd6bea5cf10aa688651 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 17 Mar 2020 13:26:06 +0100 Subject: [SERVER] Make sure bgrWindowSize doesn't overwhelm uplink queue --- src/server/globals.c | 4 ++++ src/server/uplink.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.c b/src/server/globals.c index 
98e0ddb..9914f89 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -138,6 +138,10 @@ void globals_loadConfig() } if ( _bgrWindowSize < 1 ) { _bgrWindowSize = 1; + } else if ( _bgrWindowSize > UPLINK_MAX_QUEUE - 10 ) { + _bgrWindowSize = UPLINK_MAX_QUEUE - 10; + logadd( LOG_MINOR, "Limiting bgrWindowSize to %d, because of UPLINK_MAX_QUEUE", + _bgrWindowSize ); } // Dump config as interpreted char buffer[2000]; diff --git a/src/server/uplink.c b/src/server/uplink.c index 9bf48d3..af854d6 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -786,6 +786,8 @@ static bool sendReplicationRequest(dnbd3_uplink_t *uplink) const int lastBlockIndex = mapBytes - 1; for ( int bc = 0; bc < numNewRequests; ++bc ) { int endByte; + if ( UPLINK_MAX_QUEUE - uplink->queueLen < 10 ) + break; // Don't overload queue if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks endByte = uplink->nextReplicationIndex + mapBytes; } else { // Hashblock based: Only look for match in current hash block -- cgit v1.2.3-55-g7522 From ba617b55eb606ab487f154b124750e121518d5e5 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 19 Mar 2020 11:26:12 +0100 Subject: [SERVER] Add name param to threadpool_run --- src/server/altservers.c | 2 +- src/server/image.c | 2 ++ src/server/server.c | 6 +++--- src/server/threadpool.c | 8 +++++++- src/server/threadpool.h | 3 ++- src/server/uplink.c | 2 +- 6 files changed, 16 insertions(+), 7 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/altservers.c b/src/server/altservers.c index 1ba75f4..5076a05 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -172,7 +172,7 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink) if ( uplink->rttTestResult != RTT_INPROGRESS ) { dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref ); if ( current == uplink ) { - threadpool_run( &altservers_runCheck, uplink ); + threadpool_run( &altservers_runCheck, uplink, "UPLINK" ); } else 
if ( current != NULL ) { ref_put( &current->reference ); } diff --git a/src/server/image.c b/src/server/image.c index 81ec479..0ec1d58 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -1817,6 +1817,7 @@ static void* closeUnusedFds(void* nix UNUSED) timing_gets( &deadline, -UNUSED_FD_TIMEOUT ); int fds[FDCOUNT]; int fdindex = 0; + setThreadName( "unused-fd-close" ); mutex_lock( &imageListLock ); for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; @@ -1857,6 +1858,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) static ticks nextSave; declare_now; bool full = timing_reached( &nextSave, &now ); + setThreadName( "cache-mapper" ); mutex_lock( &imageListLock ); for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; diff --git a/src/server/server.c b/src/server/server.c index 71a49b9..fa7bcda 100644 --- a/src/server/server.c +++ b/src/server/server.c @@ -404,7 +404,7 @@ int main(int argc, char *argv[]) if ( sigReload ) { sigReload = false; logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" ); - threadpool_run( &server_asyncImageListLoad, NULL ); + threadpool_run( &server_asyncImageListLoad, NULL, "IMAGE_RELOAD" ); } if ( sigLogCycle ) { sigLogCycle = false; @@ -431,7 +431,7 @@ int main(int argc, char *argv[]) continue; } - if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) { + if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client, "CLIENT" ) ) { logadd( LOG_ERROR, "Could not start thread for new connection."
); free( dnbd3_client ); continue; @@ -574,7 +574,7 @@ static int handlePendingJobs(void) jobHead = *temp; // Make it list head *temp = NULL; // Split off part before that while ( todo != NULL ) { - threadpool_run( todo->startRoutine, todo->arg ); + threadpool_run( todo->startRoutine, todo->arg, "TIMER_TASK" ); old = todo; todo = todo->next; if ( old->intervalSecs == 0 ) { diff --git a/src/server/threadpool.c b/src/server/threadpool.c index 96162a6..63ae19f 100644 --- a/src/server/threadpool.c +++ b/src/server/threadpool.c @@ -8,6 +8,7 @@ typedef struct _entry_t { dnbd3_signal_t* signal; void *(*startRoutine)(void *); void * arg; + const char *name; } entry_t; static void *threadpool_worker(void *entryPtr); @@ -56,7 +57,7 @@ void threadpool_waitEmpty() } while ( activeThreads != 0 ); } -bool threadpool_run(void *(*startRoutine)(void *), void *arg) +bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name) { if ( unlikely( _shutdown ) ) { logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" ); @@ -97,6 +98,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg) } entry->startRoutine = startRoutine; entry->arg = arg; + entry->name = name; atomic_thread_fence( memory_order_release ); signal_call( entry->signal ); return true; @@ -126,6 +128,9 @@ keep_going:; logadd( LOG_ERROR, "Worker woke up but has no work to do!" ); exit( 1 ); } + if ( entry->name != NULL ) { + setThreadName( entry->name ); + } #endif // Start assigned work (*entry->startRoutine)( entry->arg ); @@ -146,6 +151,7 @@ keep_going:; // Reaching here means pool is full; just let the thread exit break; } + setThreadName( "[dead]" ); signal_close( entry->signal ); free( entry ); activeThreads--; diff --git a/src/server/threadpool.h b/src/server/threadpool.h index ee0b3aa..d8a526e 100644 --- a/src/server/threadpool.h +++ b/src/server/threadpool.h @@ -26,9 +26,10 @@ void threadpool_waitEmpty(); * Run a thread using the thread pool. 
* @param startRoutine function to run in new thread * @param arg argument to pass to thead + * @param name STRING CONSTANT (literal) for debugging purposes * @return true if thread was started */ -bool threadpool_run(void *(*startRoutine)(void *), void *arg); +bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name); #endif diff --git a/src/server/uplink.c b/src/server/uplink.c index af854d6..a7f140f 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -435,7 +435,7 @@ success_ref: job->length = len; job->uplink = uplink; ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it - threadpool_run( &prefetchForClient, (void*)job ); + threadpool_run( &prefetchForClient, (void*)job, "PREFETCH" ); } } if ( getUplink ) { -- cgit v1.2.3-55-g7522 From a9f5b836d9fddb3e1851c5b0a77c566b0f267ead Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 20 Mar 2020 12:08:10 +0100 Subject: [SERVER] Fix warnings, add assertions --- src/server/globals.h | 2 +- src/server/image.c | 7 +++++-- src/server/uplink.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'src/server/uplink.c') diff --git a/src/server/globals.h b/src/server/globals.h index 5cee92a..08ec303 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -103,7 +103,7 @@ struct _dnbd3_uplink atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue - uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) + int idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queue_entry_t *queue; atomic_uint_fast32_t queueId; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; diff --git a/src/server/image.c b/src/server/image.c index 0ec1d58..ef40325 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -440,6 +440,7 @@ dnbd3_image_t* image_lock(dnbd3_image_t *image) mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { + assert( _images[i]->id == image->id ); image->users++; mutex_unlock( &imageListLock ); return image; @@ -470,6 +471,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image) // responsible for freeing it for (int i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { // Found, do nothing + assert( _images[i]->id == image->id ); mutex_unlock( &imageListLock ); return NULL; } @@ -509,6 +511,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image) mutex_lock( &imageListLock ); for ( int i = _num_images - 1; i >= 0; --i ) { if ( _images[i] == image ) { + assert( _images[i]->id == image->id ); _images[i] = NULL; mustFree = ( image->users == 0 ); } @@ -1088,7 +1091,7 @@ bool image_create(char *image, int revision, uint64_t size) logadd( LOG_ERROR, "revision id invalid: %d", revision ); return false; } - char path[PATHLEN], cache[PATHLEN]; + char path[PATHLEN], cache[PATHLEN+4]; char *lastSlash = strrchr( image, '/' ); if ( lastSlash == NULL ) { snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); @@ -1099,7 +1102,7 @@ bool image_create(char *image, int revision, uint64_t size) *lastSlash = '/'; snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); } - snprintf( cache, PATHLEN, "%s.map", path ); + snprintf( cache, PATHLEN+4, "%s.map", path ); 
size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const int mapsize = IMGSIZE_TO_MAPBYTES(size); // Write files diff --git a/src/server/uplink.c b/src/server/uplink.c index a7f140f..f5ac6ac 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -403,8 +403,9 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han mutex_unlock( &uplink->sendMutex ); logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, - req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); + const bool ret = dnbd3_get_block( uplink->current.fd, req.start, + (uint32_t)( req.end - req.start ), req.handle, + COND_HOPCOUNT( uplink->current.version, hops ) ); if ( unlikely( !ret ) ) { markRequestUnsent( uplink, req.handle ); uplink->image->problem.uplink = true; @@ -426,7 +427,8 @@ success_ref: if ( client != NULL ) { // Was from client -- potential prefetch // Same size as this request, but consider end of image... 
- uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); + uint32_t len = (uint32_t)MIN( uplink->image->virtualFilesize - req.end, + req.end - req.start ); // Also don't prefetch if we cross a hash block border and BGR mode == hashblock if ( len > 0 && ( _backgroundReplication != BGR_HASHBLOCK || req.start % HASH_BLOCK_SIZE == (req.end-1) % HASH_BLOCK_SIZE ) ) { @@ -592,7 +594,8 @@ static void* uplink_mainloop(void *data) } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); - if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { + if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL + || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { lastKeepalive = now; uplink->idleTime += timepassed; // Keep-alive @@ -714,8 +717,8 @@ static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly) dnbd3_request_t *hdr = &reqs[count++]; hdr->magic = dnbd3_packet_magic; hdr->cmd = CMD_GET_BLOCK; - hdr->size = it->to - it->from; - hdr->offset_small = it->from; + hdr->size = (uint32_t)( it->to - it->from ); + hdr->offset = it->from; // Offset first, then hops! 
(union) hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount ); hdr->handle = it->handle; fixup_request( *hdr ); -- cgit v1.2.3-55-g7522 From c61f65ebd977d0fa4f1f486458655242f3aeb3e5 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Wed, 10 Jun 2020 16:01:36 +0200 Subject: [SERVER] Fix list walk when removing client from uplink --- src/server/uplink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/server/uplink.c') diff --git a/src/server/uplink.c b/src/server/uplink.c index f5ac6ac..bf6f32e 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -235,12 +235,14 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) return; mutex_lock( &uplink->queueLock ); for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { - for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) { + for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) { if ( (**cit).client == client ) { --client->relayedCount; dnbd3_queue_client_t *entry = *cit; *cit = (**cit).next; free( entry ); + } else { + cit = &(**cit).next; } } } -- cgit v1.2.3-55-g7522