From a0c5086ff2dc9e2b238d9800e1c97236ed545098 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 16 Sep 2013 19:08:49 +0200
Subject: [SERVER] More debugging, more safety checks, disable alt servers that fail too often

---
 src/server/altservers.c | 24 ++++++++++++++++
 src/server/altservers.h |  2 ++
 src/server/globals.h    |  3 +-
 src/server/image.c      | 53 ++++++++++++++++++++++++++++++++++-
 src/server/image.h      |  4 +++
 src/server/server.c     | 73 +++++++++++++++++++++++++++++--------------------
 src/server/uplink.c     | 18 ++++++++----
 7 files changed, 140 insertions(+), 37 deletions(-)

(limited to 'src/server')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index b85b6a9..5e4cebd 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -212,6 +212,7 @@ int altservers_get(dnbd3_host_t *output, int size)
 {
 	if ( size <= 0 ) return 0;
 	int count = 0, i;
+	const time_t now = time( NULL );
 	spin_lock( &_alts_lock );
 	// Flip first server in list with a random one every time this is called
 	if ( _num_alts > 1 ) {
@@ -224,6 +225,8 @@ int altservers_get(dnbd3_host_t *output, int size)
 	}
 	for (i = 0; i < _num_alts; ++i) {
 		if ( _alt_servers[i].host.type == 0 ) continue;
+		if ( _alt_servers[i].numFails > SERVER_MAX_UPLINK_FAILS && now - _alt_servers[i].lastFail > SERVER_BAD_UPLINK_IGNORE ) continue;
+		_alt_servers[i].numFails = 0;
 		output[count++] = _alt_servers[i].host;
 		if ( count >= size ) break;
 	}
@@ -278,6 +281,27 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2)
 	return retval;
 }
 
+/**
+ * Called if an uplink server failed during normal uplink operation. This unit keeps
+ * track of how often servers fail, and considers them disabled for some time if they
+ * fail too many times.
+ */
+void altservers_serverFailed(const dnbd3_host_t * const host)
+{
+	int i;
+	const time_t now = time( NULL );
+	spin_lock( &_alts_lock );
+	for (i = 0; i < _num_alts; ++i) {
+		if ( !is_same_server( host, &_alt_servers[i].host ) ) continue;
+		if ( now - _alt_servers[i].lastFail > SERVER_RTT_DELAY_INIT ) {
+			_alt_servers[i].numFails++;
+			_alt_servers[i].lastFail = now;
+		}
+		break;
+	}
+	spin_unlock( &_alts_lock );
+}
+
 static void *altservers_main(void *data)
 {
 	const int MAXEVENTS = 3;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index e826946..e07afce 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -17,4 +17,6 @@ int altservers_get(dnbd3_host_t *output, int size);
 
 int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
 
+void altservers_serverFailed(const dnbd3_host_t * const host);
+
 #endif /* UPLINK_CONNECTOR_H_ */
diff --git a/src/server/globals.h b/src/server/globals.h
index f53cb11..425db3f 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -70,11 +70,12 @@ typedef struct
 typedef struct
 {
 	char comment[COMMENT_LENGTH];
-	time_t lastReached;
 	dnbd3_host_t host;
 	int rtt[SERVER_RTT_PROBES];
 	int rttIndex;
 	int isPrivate;
+	time_t lastFail;
+	int numFails;
 } dnbd3_alt_server_t;
 
 typedef struct
diff --git a/src/server/image.c b/src/server/image.c
index b47be48..13e2e5b 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -142,7 +142,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 
 /**
  * Mark image as complete by freeing the cache_map and deleting the map file on disk
- * DOES NOT LOCK ON THE IMAGE
+ * DOES NOT LOCK ON THE IMAGE, DO SO BEFORE CALLING
  */
 void image_markComplete(dnbd3_image_t *image)
 {
@@ -235,6 +235,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision)
 		}
 	}
 
+	// Not found
 	if ( candidate == NULL ) {
 		spin_unlock( &_images_lock );
 		return NULL ;
@@ -254,6 +255,11 @@ dnbd3_image_t* image_get(char *name, uint16_t revision)
 	} else if ( !candidate->working && candidate->cache_map != NULL && candidate->uplink == NULL && file_isWritable( candidate->path ) ) {
 		// Not working and has file + cache-map, try to init uplink (uplink_init will check if proxy mode is enabled)
 		uplink_init( candidate, -1, NULL );
+	} else if ( candidate->working && candidate->uplink != NULL && candidate->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+		// Too many pending uplink requests. We take that as a hint that the uplink is clogged or no working uplink server
+		// exists, so "working" is changed to FALSE for now. Should a new uplink server be found the uplink thread will
+		// set this back to TRUE some time.
+		candidate->working = FALSE;
 	}
 	return candidate; // Success :-)
 }
@@ -1021,6 +1027,51 @@ int image_generateCrcFile(char *image)
 	return TRUE;
 }
 
+void image_printAll()
+{
+	int i, percent, pending;
+	char buffer[100] = { 0 };
+	spin_lock( &_images_lock );
+	for (i = 0; i < _num_images; ++i) {
+		if ( _images[i] == NULL ) continue;
+		spin_lock( &_images[i]->lock );
+		printf( "Image: %s\n", _images[i]->lower_name );
+		percent = image_getCompletenessEstimate( _images[i] );
+		printf( " Complete: %d%%\n", percent );
+		if ( _images[i]->uplink != NULL ) {
+			host_to_string( &_images[i]->uplink->currentServer, buffer, sizeof(buffer) );
+			pending = _images[i]->uplink->queueLen;
+			printf( " Uplink: %s -- %d pending requests\n", buffer, pending );
+		}
+		printf( " Users: %d\n", _images[i]->users );
+		spin_unlock( &_images[i]->lock );
+	}
+	spin_unlock( &_images_lock );
+}
+
+/**
+ * Get completeness of an image in percent. Only estimated, not exact.
+ * Returns: 0-100
+ * DOES NOT LOCK, so make sure to do so before calling
+ */
+int image_getCompletenessEstimate(const dnbd3_image_t * const image)
+{
+	assert( image != NULL );
+	if ( image->cache_map == NULL ) return image->working ? 100 : 0;
+	size_t i; // same type as len, avoids a signed/unsigned comparison
+	uint64_t percent = 0; // sums up to 100 per map byte; an int would overflow for very large maps
+	const size_t len = IMGSIZE_TO_MAPBYTES(image->filesize);
+	if ( len == 0 ) return 0;
+	for (i = 0; i < len; ++i) {
+		if ( image->cache_map[i] == 0xff ) {
+			percent += 100;
+		} else if ( image->cache_map[i] > 0 ) {
+			percent += 50;
+		}
+	}
+	return (int)( percent / len ); // average is always within 0-100
+}
+
 /**
  * Check the CRC-32 of the given blocks. The array blocks is of variable length.
  * !! pass -1 as the last block so the function knows when to stop !!
diff --git a/src/server/image.h b/src/server/image.h
index 9ba4436..5d6f027 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -36,6 +36,10 @@ int image_create(char *image, int revision, uint64_t size);
 
 int image_generateCrcFile(char *image);
 
+void image_printAll();
+
+int image_getCompletenessEstimate(const dnbd3_image_t * const image);
+
 // one byte in the map covers 8 4kib blocks, so 32kib per byte
 // "+ (1 << 15) - 1" is required to account for the last bit of
 // the image that is smaller than 32kib
diff --git a/src/server/server.c b/src/server/server.c
index 57e9b94..4dadd3c 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -44,6 +44,7 @@
 #include "memlog.h"
 #include "globals.h"
 #include "integrity.h"
+#include "helper.h"
 
 #define MAX_SERVER_SOCKETS 50 // Assume there will be no more than 50 sockets the server will listen on
 static int sockets[MAX_SERVER_SOCKETS], socket_count = 0;
@@ -61,10 +62,8 @@ pthread_spinlock_t _clients_lock;
 static time_t _startupTime = 0;
 
 static int dnbd3_add_client(dnbd3_client_t *client);
-static void dnbd3_load_config();
-static void dnbd3_handle_sigpipe(int signum);
-static void dnbd3_handle_sigterm(int signum);
-static void dnbd3_handle_sigusr1(int signum);
+static void dnbd3_handle_signal(int signum);
+static void dnbd3_printClients();
 
 /**
  * Print help text for usage instructions
@@ -287,14 +286,12 @@ int main(int argc, char *argv[])
 	debug_locks_start_watchdog();
 #endif
 
-	// load config file
-	dnbd3_load_config();
-
 	// setup signal handler
-	signal( SIGPIPE, dnbd3_handle_sigpipe );
-	signal( SIGTERM, dnbd3_handle_sigterm );
-	signal( SIGINT, dnbd3_handle_sigterm );
-	signal( SIGUSR1, dnbd3_handle_sigusr1 );
+	signal( SIGPIPE, dnbd3_handle_signal );
+	signal( SIGTERM, dnbd3_handle_signal );
+	signal( SIGINT, dnbd3_handle_signal );
+	signal( SIGUSR1, dnbd3_handle_signal );
+	signal( SIGUSR2, dnbd3_handle_signal );
 
 	printf( "Loading images....\n" );
 	// Load all images in base path
@@ -305,6 +302,9 @@ int main(int argc, char *argv[])
 
 	_startupTime = time( NULL );
 
+	// Give other threads some time to start up before accepting connections
+	sleep( 2 );
+
 	// setup network
 	sockets[socket_count] = sock_listen_any( PF_INET, PORT );
 	if ( sockets[socket_count] != -1 ) ++socket_count;
@@ -467,29 +467,44 @@ static int dnbd3_add_client(dnbd3_client_t *client)
 	return TRUE;
 }
 
-static void dnbd3_load_config()
-{
-	// Load configuration
-}
-
-static void dnbd3_handle_sigpipe(int signum)
+static void dnbd3_handle_signal(int signum)
 {
-	memlogf( "INFO: SIGPIPE received (%s)", strsignal( signum ) );
-}
-
-static void dnbd3_handle_sigterm(int signum)
-{
-	memlogf( "INFO: SIGTERM or SIGINT received (%s)", strsignal( signum ) );
-	dnbd3_cleanup();
+	if ( signum == SIGPIPE ) {
+		memlogf( "INFO: SIGPIPE received (%s)", strsignal( signum ) );
+	} else if ( signum == SIGINT || signum == SIGTERM ) {
+		memlogf( "INFO: SIGTERM or SIGINT received (%s)", strsignal( signum ) );
+		dnbd3_cleanup();
+	} else if ( signum == SIGUSR1 ) {
+		memlogf( "INFO: SIGUSR1 (%s) received, re-scanning image directory", strsignal( signum ) );
+		image_loadAll( NULL );
+	} else if ( signum == SIGUSR2 ) { // must test SIGUSR2 here; a second SIGUSR1 test would make this branch unreachable
+		printf( "[DEBUG] SIGUSR2 (%s) received, stats incoming\n", strsignal( signum ) );
+		printf( " ** Images **\n" );
+		image_printAll();
+		printf( " ** Clients **\n" );
+		dnbd3_printClients();
+	} else {
+		printf( "SIGNAL: %d (%s)\n", signum, strsignal( signum ) );
+	}
 }
 
-void dnbd3_handle_sigusr1(int signum)
+int dnbd3_serverUptime()
 {
-	memlogf( "INFO: SIGUSR1 (%s) received, re-scanning image directory", strsignal( signum ) );
-	image_loadAll( NULL );
+	return (int)(time( NULL ) - _startupTime);
 }
 
-int dnbd3_serverUptime()
+static void dnbd3_printClients()
 {
-	return (int)(time( NULL ) - _startupTime);
+	int i;
+	char buffer[100];
+	spin_lock( &_clients_lock );
+	for (i = 0; i < _num_clients; ++i) {
+		if ( _clients[i] == NULL ) continue;
+		spin_lock( &_clients[i]->lock );
+		host_to_string( &_clients[i]->host, buffer, sizeof(buffer) );
+		printf( "Client %s\n", buffer );
+		if ( _clients[i]->image != NULL ) printf( " Image: %s\n", _clients[i]->image->lower_name );
+		spin_unlock( &_clients[i]->lock );
+	}
+	spin_unlock( &_clients_lock );
 }
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 6c50837..61b353f 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -341,6 +341,7 @@ static void* uplink_mainloop(void *data)
 			if ( link->fd != -1 ) {
 				if ( !uplink_send_keepalive( link->fd ) ) {
 					printf( "[DEBUG] Error sending keep-alive to uplink\n" );
+					altservers_serverFailed( &link->currentServer );
 					const int fd = link->fd;
 					link->fd = -1;
 					close( fd );
@@ -398,12 +399,14 @@ static void uplink_send_requests(dnbd3_connection_t *link, int newOnly)
 		request.size = link->queue[j].to - link->queue[j].from;
 		spin_unlock( &link->queueLock );
 		fixup_request( request );
-		const int ret = write( link->fd, &request, sizeof request );
+		const int ret = send( link->fd, &request, sizeof request, MSG_NOSIGNAL );
 		if ( ret != sizeof(request) ) {
 			// Non-critical - if the connection dropped or the server was changed
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
 			printf( "[DEBUG] Error sending request to uplink server!\n" );
+			altservers_serverFailed( &link->currentServer );
+			return; // queueLock was released before send(); leave without reaching post-loop code that presumably unlocks it again (TODO confirm)
 		}
 		spin_lock( &link->queueLock );
 	}
@@ -419,7 +422,7 @@ static void uplink_handle_receive(dnbd3_connection_t *link)
 	dnbd3_reply_t inReply, outReply;
 	int ret, i;
 	for (;;) {
-		ret = recv( link->fd, &inReply, sizeof inReply, MSG_DONTWAIT );
+		ret = recv( link->fd, &inReply, sizeof inReply, MSG_DONTWAIT | MSG_NOSIGNAL );
 		if ( ret < 0 ) {
 			const int err = errno;
 			if ( err == EAGAIN || err == EWOULDBLOCK || err == EINTR ) return; // OK cases
@@ -429,7 +432,7 @@ static void uplink_handle_receive(dnbd3_connection_t *link)
 			memlogf( "[INFO] Uplink: Remote host hung up (%s)", link->image->path );
 			goto error_cleanup;
 		}
-		if ( ret != sizeof inReply ) ret += recv( link->fd, &inReply + ret, sizeof(inReply) - ret, MSG_WAITALL );
+		if ( ret != sizeof inReply ) ret += recv( link->fd, (char *)&inReply + ret, sizeof(inReply) - ret, MSG_WAITALL | MSG_NOSIGNAL ); // byte offset into the header, not an offset in whole structs
 		if ( ret != sizeof inReply ) {
 			const int err = errno;
 			memlogf( "[INFO] Lost connection to uplink server for %s (header %d/%d, e=%d)", link->image->path, ret, (int)sizeof(inReply),
@@ -452,7 +455,7 @@ static void uplink_handle_receive(dnbd3_connection_t *link)
 		}
 		uint32_t done = 0;
 		while ( done < inReply.size ) {
-			ret = recv( link->fd, link->recvBuffer + done, inReply.size - done, 0 );
+			ret = recv( link->fd, link->recvBuffer + done, inReply.size - done, MSG_NOSIGNAL );
 			if ( ret <= 0 ) {
 				memlogf( "[INFO] Lost connection to uplink server of %s (payload)", link->image->path );
 				goto error_cleanup;
@@ -484,7 +487,9 @@ static void uplink_handle_receive(dnbd3_connection_t *link)
 				req->status = ULR_PROCESSING;
 			}
 		}
-		// 3) Send to interested clients
+		// 3) Send to interested clients - iterate backwards so request collaboration works, and
+		// so we can decrease queueLen on the fly while iterating. Should you ever change this to start
+		// from 0, you also need to change the "attach to existing request"-logic in uplink_request()
 		outReply.magic = dnbd3_packet_magic;
 		for (i = link->queueLen - 1; i >= 0; --i) {
 			dnbd3_queued_request_t * const req = &link->queue[i];
@@ -510,6 +515,7 @@ static void uplink_handle_receive(dnbd3_connection_t *link)
 		spin_unlock( &link->queueLock );
 	}
 	error_cleanup: ;
+	altservers_serverFailed( &link->currentServer );
 	const int fd = link->fd;
 	link->fd = -1;
 	if ( fd != -1 ) close( fd );
@@ -526,7 +532,7 @@ static int uplink_send_keepalive(const int fd)
 		request.cmd = CMD_KEEPALIVE;
 		fixup_request( request );
 	}
-	return send( fd, &request, sizeof(request), 0 ) == sizeof(request);
+	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
 }
 
 static void uplink_addCrc32(dnbd3_connection_t *uplink)
-- 
cgit v1.2.3-55-g7522