summaryrefslogtreecommitdiffstats
path: root/src/server
diff options
context:
space:
mode:
authorSimon Rettberg2017-10-17 14:12:24 +0200
committerSimon Rettberg2017-10-17 14:12:24 +0200
commit50d5e7351d5339c4da700a8c2b20e7a5c1992ce8 (patch)
tree0b06ed2d3df3d6af5a65bc12579f7bb7b36d1f3c /src/server
parent[*] Support hop-counting in request header, protocol version 3 (diff)
downloaddnbd3-50d5e7351d5339c4da700a8c2b20e7a5c1992ce8.tar.gz
dnbd3-50d5e7351d5339c4da700a8c2b20e7a5c1992ce8.tar.xz
dnbd3-50d5e7351d5339c4da700a8c2b20e7a5c1992ce8.zip
[SERVER] Try to connect to different server if proxy cycle is detected
Diffstat (limited to 'src/server')
-rw-r--r--src/server/altservers.c24
-rw-r--r--src/server/globals.h1
-rw-r--r--src/server/uplink.c17
3 files changed, 33 insertions, 9 deletions
diff --git a/src/server/altservers.c b/src/server/altservers.c
index 1e4af7e..04d93e5 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -362,7 +362,7 @@ static void *altservers_main(void *data UNUSED)
if ( _shutdown ) goto cleanup;
if ( ret == SIGNAL_ERROR ) {
if ( errno == EAGAIN || errno == EINTR ) continue;
- logadd( LOG_WARNING, "Error on signal_clear on alservers_main! Things will break!" );
+ logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
usleep( 100000 );
}
// Work your way through the queue
@@ -408,8 +408,8 @@ static void *altservers_main(void *data UNUSED)
int bestSock = -1;
int bestIndex = -1;
int bestProtocolVersion = -1;
- unsigned int bestRtt = 0xfffffff;
- unsigned int currentRtt = 0xfffffff;
+ unsigned long bestRtt = RTT_UNREACHABLE;
+ unsigned long currentRtt = RTT_UNREACHABLE;
for (itAlt = 0; itAlt < numAlts; ++itAlt) {
usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
// Connect
@@ -461,8 +461,13 @@ static void *altservers_main(void *data UNUSED)
clock_gettime( CLOCK_MONOTONIC, &end );
// Measurement done - everything fine so far
const unsigned int rtt = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000; // µs
- const unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
- if ( uplink->fd != -1 && isSameAddressPort( &servers[itAlt], &uplink->currentServer ) ) {
+ unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
+ spin_lock( &uplink->rttLock );
+ const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
+ // If a cycle was detected, or we lost connection to the current (last) server, penaltize it
+ if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 100000;
+ spin_unlock( &uplink->rttLock );
+ if ( uplink->fd != -1 && isCurrent ) {
// Was measuring current server
currentRtt = avg;
close( sock );
@@ -486,7 +491,6 @@ static void *altservers_main(void *data UNUSED)
server_image_not_available: ;
close( sock );
}
- image_release( image );
// Done testing all servers. See if we should switch
if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
// yep
@@ -498,7 +502,7 @@ static void *altservers_main(void *data UNUSED)
uplink->rttTestResult = RTT_DOCHANGE;
spin_unlock( &uplink->rttLock );
signal_call( uplink->signal );
- } else if (bestSock == -1) {
+ } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
// No server was reachable
spin_lock( &uplink->rttLock );
uplink->rttTestResult = RTT_NOT_REACHABLE;
@@ -508,8 +512,14 @@ static void *altservers_main(void *data UNUSED)
if ( bestSock != -1 ) close( bestSock );
spin_lock( &uplink->rttLock );
uplink->rttTestResult = RTT_DONTCHANGE;
+ uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
spin_unlock( &uplink->rttLock );
+ if ( !image->working ) {
+ image->working = true;
+ logadd( LOG_DEBUG1, "No better alt server found, enabling '%s:%d' again.", image->name, (int)image->rid );
+ }
}
+ image_release( image );
// end of loop over all pending uplinks
spin_lock( &pendingLockWrite );
pending[itLink] = NULL;
diff --git a/src/server/globals.h b/src/server/globals.h
index b1740f4..41d52df 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -60,6 +60,7 @@ struct _dnbd3_connection
uint32_t recvBufferLen; // Len of ^^
volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
bool replicatedLastBlock; // bool telling if the last block has been replicated yet
+ bool cycleDetected; // connection cycle between proxies detected for current remote server
int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
uint64_t replicationHandle; // Handle of pending replication request
uint64_t bytesReceived; // Number of bytes received by the connection.
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 65cefb5..9fb5c13 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -68,6 +68,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
link->signal = NULL;
link->replicationHandle = 0;
spin_lock( &link->rttLock );
+ link->cycleDetected = false;
if ( sock >= 0 ) {
link->betterFd = sock;
link->betterServer = *host;
@@ -166,6 +167,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) {
spin_unlock( &client->image->lock );
logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
+ spin_lock( &uplink->rttLock );
+ uplink->cycleDetected = true;
+ spin_unlock( &uplink->rttLock );
+ signal_call( uplink->signal );
return false;
}
@@ -199,6 +204,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
if ( requestLoop ) {
spin_unlock( &uplink->queueLock );
logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
+ spin_lock( &uplink->rttLock );
+ uplink->cycleDetected = true;
+ spin_unlock( &uplink->rttLock );
+ signal_call( uplink->signal );
return false;
}
if ( freeSlot == -1 ) {
@@ -291,6 +300,7 @@ static void* uplink_mainloop(void *data)
link->betterFd = -1;
link->currentServer = link->betterServer;
link->version = link->betterVersion;
+ link->cycleDetected = false;
spin_unlock( &link->rttLock );
discoverFailCount = 0;
if ( fd != -1 ) close( fd );
@@ -371,7 +381,7 @@ static void* uplink_mainloop(void *data)
if ( now + SERVER_RTT_DELAY_FAILED < nextAltCheck ) {
// This probably means the system time was changed - handle this case properly by capping the timeout
nextAltCheck = now + SERVER_RTT_DELAY_FAILED / 2;
- } else if ( now >= nextAltCheck || link->fd == -1 ) {
+ } else if ( now >= nextAltCheck || link->fd == -1 || link->cycleDetected ) {
// It seems it's time for a check
if ( image_isComplete( link->image ) ) {
// Quit work if image is complete
@@ -395,15 +405,18 @@ static void* uplink_mainloop(void *data)
#ifdef _DEBUG
if ( link->fd != -1 && !link->shutdown ) {
bool resend = false;
- const time_t deadline = now - 8;
+ const time_t deadline = now - 10;
spin_lock( &link->queueLock );
for (i = 0; i < link->queueLen; ++i) {
if ( link->queue[i].status != ULR_FREE && link->queue[i].entered < deadline ) {
snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name,
link->queue[i].from, link->queue[i].to, link->queue[i].status );
+ link->queue[i].entered = now;
+#ifdef _DEBUG_RESEND_STARVING
link->queue[i].status = ULR_NEW;
resend = true;
+#endif
spin_unlock( &link->queueLock );
logadd( LOG_WARNING, "%s", buffer );
spin_lock( &link->queueLock );