diff options
-rw-r--r-- | src/server/altservers.c | 20 | ||||
-rw-r--r-- | src/server/altservers.h | 2 | ||||
-rw-r--r-- | src/server/globals.h | 3 | ||||
-rw-r--r-- | src/server/image.c | 136 | ||||
-rw-r--r-- | src/server/image.h | 2 | ||||
-rw-r--r-- | src/server/uplink.c | 76 | ||||
-rw-r--r-- | src/serverconfig.h | 5 |
7 files changed, 168 insertions, 76 deletions
diff --git a/src/server/altservers.c b/src/server/altservers.c index a6ad235..380737c 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -274,6 +274,26 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server } /** + * Returns true if there is at least one alt-server the + * given image name would be allowed to be cloned from. + */ +bool altservers_imageHasAltServers(const char *image) +{ + bool ret = false; + mutex_lock( &altServersLock ); + for ( int i = 0; i < numAltServers; ++i ) { + if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) ) + continue; + if ( !isImageAllowed( &altServers[i], image ) ) + continue; + ret = true; + break; + } + mutex_unlock( &altServersLock ); + return ret; +} + +/** * Get <size> alt servers. If there are more alt servers than * requested, random servers will be picked. * This function is suited for finding uplink servers as diff --git a/src/server/altservers.h b/src/server/altservers.h index 8e29aaa..78f6fcc 100644 --- a/src/server/altservers.h +++ b/src/server/altservers.h @@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size); +bool altservers_imageHasAltServers(const char *image); + bool altservers_toString(int server, char *buffer, size_t len); int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2); diff --git a/src/server/globals.h b/src/server/globals.h index 5de4180..10d3ee3 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -93,6 +93,7 @@ struct _dnbd3_uplink // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. + atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; @@ -128,7 +129,6 @@ struct _dnbd3_image uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time - ticks lastWorkCheck; // last time a non-working image has been checked ticks nextCompletenessEstimate; // next time the completeness estimate should be updated uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list @@ -144,6 +144,7 @@ struct _dnbd3_image atomic_bool queue; // Too many requests waiting on uplink } problem; uint16_t rid; // revision of image + atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now) pthread_mutex_t lock; }; diff --git a/src/server/image.c b/src/server/image.c index 3583f86..5a9e15b 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -55,6 +55,8 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd); static void* closeUnusedFds(void*); +static void* saveAllCacheMaps(void*); +static bool saveCacheMap(dnbd3_image_t *image); static void allocCacheMap(dnbd3_image_t *image, bool complete); static void cmfree(ref *ref) @@ -73,6 +75,7 @@ void image_serverStartup() mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); mutex_init( &reloadLock, LOCK_RELOAD ); server_addJob( &closeUnusedFds, NULL, 10, 900 ); + server_addJob( &saveAllCacheMaps, NULL, 9, 20 ); } /** @@ -160,6 +163,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co integrity_check( image, block, false ); } } + } else if ( !set ) { + image->mapDirty = true; } ref_put( &cache->reference ); } @@ -624,6 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) // this will get called again when the uplink is done. if ( !uplink_shutdown( image ) ) return NULL; + saveCacheMap( image ); mutex_lock( &image->lock ); ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); @@ -1830,6 +1836,135 @@ static void* closeUnusedFds(void* nix UNUSED) return NULL; } +#define IMGCOUNT 5 +static void* saveAllCacheMaps(void* nix UNUSED) +{ + static ticks nextSave; + dnbd3_image_t *list[IMGCOUNT]; + int count = 0; + declare_now; + bool full = timing_reached( &nextSave, &now ); + mutex_lock( &imageListLock ); + for ( int i = 0; i < _num_images; ++i ) { + dnbd3_image_t * const image = _images[i]; + if ( image->mapDirty ) { + // Flag is set if integrity checker found a problem - save out + image->users++; + list[count++] = image; + image->mapDirty = false; + } else { + // Otherwise, consider longer timeout and byte count limits of uplink + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink != NULL ) { + assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived ); + uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; + if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES + || ( full && diff != 0 ) ) { + image->users++; + list[count++] = image; + uplink->bytesReceivedLastSave = uplink->bytesReceived; + } + ref_put( &uplink->reference ); + } + } + if ( count == IMGCOUNT ) + break; + } + mutex_unlock( &imageListLock ); + if ( full && count < IMGCOUNT ) { + // Only update nextSave once we handled all images in the list + timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY ); + } + for ( int i = 0; i < count; ++i ) { + saveCacheMap( list[i] ); + image_release( list[i] ); + } + return NULL; +} +#undef IMGCOUNT + +/** + * Saves the cache map of the given image. + * Return true on success. + * @param image the image + */ +static bool saveCacheMap(dnbd3_image_t *image) +{ + if ( !_isProxy ) + return true; // Nothing to do + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return true; // Nothing to do + // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories) + // for which we have any upstream servers configured. If there's none, don't touch + // the cache map on disk. + if ( !altservers_imageHasAltServers( image->name ) ) { + ref_put( &cache->reference ); + return true; // Nothing to do + } + + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); + char mapfile[strlen( image->path ) + 4 + 1]; + strcpy( mapfile, image->path ); + strcat( mapfile, ".map" ); + + int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); + if ( fd == -1 ) { + const int err = errno; + ref_put( &cache->reference ); + logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); + return false; + } + + // On Linux we could use readFd, but in general it's not guaranteed to work + int imgFd = open( image->path, O_WRONLY ); + if ( imgFd == -1 ) { + logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno ); + } else { + if ( fsync( imgFd ) == -1 ) { + logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno ); + dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize); + const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( old == NULL ) { + // Could not load old map. FS might be toast. + logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." ); + memset( cache->map, 0, mapSize ); + } else { + // AND the maps together to be safe + for ( int i = 0; i < mapSize; ++i ) { + cache->map[i] &= old->map[i]; + } + old->reference.free( &old->reference ); + } + } + close( imgFd ); + } + + // Write current map to file + size_t done = 0; + while ( done < size ) { + const ssize_t ret = write( fd, cache->map + done, size - done ); + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); + break; + } + if ( ret <= 0 ) { + logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); + break; + } + done += (size_t)ret; + } + ref_put( &cache->reference ); + if ( fsync( fd ) == -1 ) { + logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); + } + close( fd ); + // TODO fsync on parent directory + return true; +} + static void allocCacheMap(dnbd3_image_t *image, bool complete) { const uint8_t val = complete ? 0xff : 0; @@ -1846,4 +1981,3 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete) } mutex_unlock( &image->lock ); } - diff --git a/src/server/image.h b/src/server/image.h index 89791fc..4614c74 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -49,6 +49,8 @@ void image_closeUnusedFd(); bool image_ensureDiskSpaceLocked(uint64_t size, bool force); +bool image_saveCacheMap(dnbd3_image_t *image); + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/uplink.c b/src/server/uplink.c index 97cb2a9..e5ab9c0 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -57,7 +57,6 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink); static void uplink_addCrc32(dnbd3_uplink_t *uplink); static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); @@ -103,6 +102,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND ); uplink->image = image; uplink->bytesReceived = 0; + uplink->bytesReceivedLastSave = 0; uplink->idleTime = 0; uplink->queueLen = 0; uplink->cacheFd = -1; @@ -445,7 +445,6 @@ static void* uplink_mainloop(void *data) int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; - uint32_t unsavedSeconds = 0; ticks nextAltCheck, lastKeepalive; char buffer[200]; memset( events, 0, sizeof(events) ); @@ -561,12 +560,6 @@ static void* uplink_mainloop(void *data) if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { lastKeepalive = now; uplink->idleTime += timepassed; - unsavedSeconds += timepassed; - if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) { - // fsync/save every 4 minutes, or every 60 seconds if uplink is idle - unsavedSeconds = 0; - uplink_saveCacheMap( uplink ); - } // Keep-alive if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { // Send keep-alive if nothing is happening, and try to trigger background rep. @@ -639,9 +632,9 @@ static void* uplink_mainloop(void *data) } #endif } - cleanup: ; - uplink_saveCacheMap( uplink ); +cleanup: ; dnbd3_image_t *image = uplink->image; + image->mapDirty = true; // Force writeout of cache map mutex_lock( &image->lock ); bool exp = false; if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) { @@ -1135,69 +1128,6 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force) return uplink->cacheFd != -1; } -/** - * Saves the cache map of the given image. - * Return true on success. - * Locks on: imageListLock, image.lock - */ -static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) -{ - dnbd3_image_t *image = uplink->image; - assert( image != NULL ); - - if ( uplink->cacheFd != -1 ) { - if ( fsync( uplink->cacheFd ) == -1 ) { - // A failing fsync means we have no guarantee that any data - // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache map from the last successful fsync - // around and restoring it there isn't much we can do to recover - // a consistent state. Bail out. - logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); - logadd( LOG_ERROR, "Bailing out immediately" ); - exit( 1 ); - } - } - - dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL ) - return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - assert( image->path != NULL ); - char mapfile[strlen( image->path ) + 4 + 1]; - strcpy( mapfile, image->path ); - strcat( mapfile, ".map" ); - - int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); - if ( fd == -1 ) { - const int err = errno; - ref_put( &cache->reference ); - logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); - return false; - } - - size_t done = 0; - while ( done < size ) { - const ssize_t ret = write( fd, cache->map + done, size - done ); - if ( ret == -1 ) { - if ( errno == EINTR ) continue; - logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); - break; - } - if ( ret <= 0 ) { - logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); - break; - } - done += (size_t)ret; - } - ref_put( &cache->reference ); - if ( fsync( fd ) == -1 ) { - logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); - } - close( fd ); - return true; -} - static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink) { return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT diff --git a/src/serverconfig.h b/src/serverconfig.h index 239f0a2..5c7301d 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -17,7 +17,10 @@ #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -#define SERVER_CACHE_MAP_SAVE_INTERVAL 90 +// Wait a maximum of 5 minutes before saving cache map (if data was received at all) +#define CACHE_MAP_MAX_SAVE_DELAY 300 +// If more than 500MB have been received from uplink without saving cache map, do so +#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000) // Time in ms to wait for a read/write call to complete on an uplink connection #define SOCKET_TIMEOUT_UPLINK 5000 |