From 50d5e7351d5339c4da700a8c2b20e7a5c1992ce8 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 17 Oct 2017 14:12:24 +0200 Subject: [SERVER] Try to connect to different server if proxy cycle is detected --- src/server/altservers.c | 24 +++++++++++++++++------- src/server/globals.h | 1 + src/server/uplink.c | 17 +++++++++++++++-- 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'src/server') diff --git a/src/server/altservers.c b/src/server/altservers.c index 1e4af7e..04d93e5 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -362,7 +362,7 @@ static void *altservers_main(void *data UNUSED) if ( _shutdown ) goto cleanup; if ( ret == SIGNAL_ERROR ) { if ( errno == EAGAIN || errno == EINTR ) continue; - logadd( LOG_WARNING, "Error on signal_clear on alservers_main! Things will break!" ); + logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno ); usleep( 100000 ); } // Work your way through the queue @@ -408,8 +408,8 @@ static void *altservers_main(void *data UNUSED) int bestSock = -1; int bestIndex = -1; int bestProtocolVersion = -1; - unsigned int bestRtt = 0xfffffff; - unsigned int currentRtt = 0xfffffff; + unsigned long bestRtt = RTT_UNREACHABLE; + unsigned long currentRtt = RTT_UNREACHABLE; for (itAlt = 0; itAlt < numAlts; ++itAlt) { usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...) // Connect @@ -461,8 +461,13 @@ static void *altservers_main(void *data UNUSED) clock_gettime( CLOCK_MONOTONIC, &end ); // Measurement done - everything fine so far const unsigned int rtt = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000; // µs - const unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); - if ( uplink->fd != -1 && isSameAddressPort( &servers[itAlt], &uplink->currentServer ) ) { + unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt ); + spin_lock( &uplink->rttLock ); + const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer ); + // If a cycle was detected, or we lost connection to the current (last) server, penaltize it + if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 100000; + spin_unlock( &uplink->rttLock ); + if ( uplink->fd != -1 && isCurrent ) { // Was measuring current server currentRtt = avg; close( sock ); @@ -486,7 +491,6 @@ static void *altservers_main(void *data UNUSED) server_image_not_available: ; close( sock ); } - image_release( image ); // Done testing all servers. See if we should switch if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) { // yep @@ -498,7 +502,7 @@ static void *altservers_main(void *data UNUSED) uplink->rttTestResult = RTT_DOCHANGE; spin_unlock( &uplink->rttLock ); signal_call( uplink->signal ); - } else if (bestSock == -1) { + } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) { // No server was reachable spin_lock( &uplink->rttLock ); uplink->rttTestResult = RTT_NOT_REACHABLE; @@ -508,8 +512,14 @@ static void *altservers_main(void *data UNUSED) if ( bestSock != -1 ) close( bestSock ); spin_lock( &uplink->rttLock ); uplink->rttTestResult = RTT_DONTCHANGE; + uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away spin_unlock( &uplink->rttLock ); + if ( !image->working ) { + image->working = true; + logadd( LOG_DEBUG1, "No better alt server found, enabling '%s:%d' again.", image->name, (int)image->rid ); + } } + image_release( image ); // end of loop over all pending uplinks spin_lock( &pendingLockWrite ); pending[itLink] = NULL; diff --git a/src/server/globals.h b/src/server/globals.h index b1740f4..41d52df 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -60,6 +60,7 @@ struct _dnbd3_connection uint32_t recvBufferLen; // Len of ^^ volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop() bool replicatedLastBlock; // bool telling if the last block has been replicated yet + bool cycleDetected; // connection cycle between proxies detected for current remote server int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at uint64_t replicationHandle; // Handle of pending replication request uint64_t bytesReceived; // Number of bytes received by the connection. diff --git a/src/server/uplink.c b/src/server/uplink.c index 65cefb5..9fb5c13 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -68,6 +68,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version link->signal = NULL; link->replicationHandle = 0; spin_lock( &link->rttLock ); + link->cycleDetected = false; if ( sock >= 0 ) { link->betterFd = sock; link->betterServer = *host; @@ -166,6 +167,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) { spin_unlock( &client->image->lock ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); + spin_lock( &uplink->rttLock ); + uplink->cycleDetected = true; + spin_unlock( &uplink->rttLock ); + signal_call( uplink->signal ); return false; } @@ -199,6 +204,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin if ( requestLoop ) { spin_unlock( &uplink->queueLock ); logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); + spin_lock( &uplink->rttLock ); + uplink->cycleDetected = true; + spin_unlock( &uplink->rttLock ); + signal_call( uplink->signal ); return false; } if ( freeSlot == -1 ) { @@ -291,6 +300,7 @@ static void* uplink_mainloop(void *data) link->betterFd = -1; link->currentServer = link->betterServer; link->version = link->betterVersion; + link->cycleDetected = false; spin_unlock( &link->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); @@ -371,7 +381,7 @@ static void* uplink_mainloop(void *data) if ( now + SERVER_RTT_DELAY_FAILED < nextAltCheck ) { // This probably means the system time was changed - handle this case properly by capping the timeout nextAltCheck = now + SERVER_RTT_DELAY_FAILED / 2; - } else if ( now >= nextAltCheck || link->fd == -1 ) { + } else if ( now >= nextAltCheck || link->fd == -1 || link->cycleDetected ) { // It seems it's time for a check if ( image_isComplete( link->image ) ) { // Quit work if image is complete @@ -395,15 +405,18 @@ static void* uplink_mainloop(void *data) #ifdef _DEBUG if ( link->fd != -1 && !link->shutdown ) { bool resend = false; - const time_t deadline = now - 8; + const time_t deadline = now - 10; spin_lock( &link->queueLock ); for (i = 0; i < link->queueLen; ++i) { if ( link->queue[i].status != ULR_FREE && link->queue[i].entered < deadline ) { snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name, link->queue[i].from, link->queue[i].to, link->queue[i].status ); + link->queue[i].entered = now; +#ifdef _DEBUG_RESEND_STARVING link->queue[i].status = ULR_NEW; resend = true; +#endif spin_unlock( &link->queueLock ); logadd( LOG_WARNING, "%s", buffer ); spin_lock( &link->queueLock ); -- cgit v1.2.3-55-g7522