summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSimon Rettberg2020-03-04 17:49:50 +0100
committerSimon Rettberg2020-03-04 17:49:50 +0100
commit930b65f26cb39687a113641f56711a2d58f886ca (patch)
treee0a0c5e73ea9820a4d64641209bde320359c37a2
parent[SERVER] Likewise, get rid of same loops in client handler (diff)
downloaddnbd3-930b65f26cb39687a113641f56711a2d58f886ca.tar.gz
dnbd3-930b65f26cb39687a113641f56711a2d58f886ca.tar.xz
dnbd3-930b65f26cb39687a113641f56711a2d58f886ca.zip
[SERVER] Add timer task for saving cache maps
Cache maps will now be saved periodically, but only if either they have a "dirty" bit set, which happens if any bits in the map get cleared again (due to corruption), or if new data has been replicated from an uplink server. This either means at least one byte received and 5 minutes have passed, or at least 500MB have been downloaded. The timer currently runs every 20 seconds.
-rw-r--r--src/server/altservers.c20
-rw-r--r--src/server/altservers.h2
-rw-r--r--src/server/globals.h3
-rw-r--r--src/server/image.c136
-rw-r--r--src/server/image.h2
-rw-r--r--src/server/uplink.c76
-rw-r--r--src/serverconfig.h5
7 files changed, 168 insertions, 76 deletions
diff --git a/src/server/altservers.c b/src/server/altservers.c
index a6ad235..380737c 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -274,6 +274,26 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server
}
/**
+ * Returns true if there is at least one alt-server the
+ * given image name would be allowed to be cloned from.
+ */
+bool altservers_imageHasAltServers(const char *image)
+{
+ bool ret = false;
+ mutex_lock( &altServersLock );
+ for ( int i = 0; i < numAltServers; ++i ) {
+ if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) )
+ continue;
+ if ( !isImageAllowed( &altServers[i], image ) )
+ continue;
+ ret = true;
+ break;
+ }
+ mutex_unlock( &altServersLock );
+ return ret;
+}
+
+/**
* Get <size> alt servers. If there are more alt servers than
* requested, random servers will be picked.
* This function is suited for finding uplink servers as
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 8e29aaa..78f6fcc 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou
int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
+bool altservers_imageHasAltServers(const char *image);
+
bool altservers_toString(int server, char *buffer, size_t len);
int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
diff --git a/src/server/globals.h b/src/server/globals.h
index 5de4180..10d3ee3 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -93,6 +93,7 @@ struct _dnbd3_uplink
// If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
uint64_t replicationHandle; // Handle of pending replication request
atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
+ atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
int queueLen; // length of queue
uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
@@ -128,7 +129,6 @@ struct _dnbd3_image
uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
uint64_t realFilesize; // actual file size on disk
ticks atime; // last access time
- ticks lastWorkCheck; // last time a non-working image has been checked
ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
uint32_t masterCrc32; // CRC-32 of the crc-32 list
@@ -144,6 +144,7 @@ struct _dnbd3_image
atomic_bool queue; // Too many requests waiting on uplink
} problem;
uint16_t rid; // revision of image
+ atomic_bool mapDirty; // Cache map has been modified outside uplink (only integrity checker for now)
pthread_mutex_t lock;
};
diff --git a/src/server/image.c b/src/server/image.c
index 3583f86..5a9e15b 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -55,6 +55,8 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const
static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
static void* closeUnusedFds(void*);
+static void* saveAllCacheMaps(void*);
+static bool saveCacheMap(dnbd3_image_t *image);
static void allocCacheMap(dnbd3_image_t *image, bool complete);
static void cmfree(ref *ref)
@@ -73,6 +75,7 @@ void image_serverStartup()
mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
mutex_init( &reloadLock, LOCK_RELOAD );
server_addJob( &closeUnusedFds, NULL, 10, 900 );
+ server_addJob( &saveAllCacheMaps, NULL, 9, 20 );
}
/**
@@ -160,6 +163,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
integrity_check( image, block, false );
}
}
+ } else if ( !set ) {
+ image->mapDirty = true;
}
ref_put( &cache->reference );
}
@@ -624,6 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
// this will get called again when the uplink is done.
if ( !uplink_shutdown( image ) )
return NULL;
+ saveCacheMap( image );
mutex_lock( &image->lock );
ref_setref( &image->ref_cacheMap, NULL );
free( image->crc32 );
@@ -1830,6 +1836,135 @@ static void* closeUnusedFds(void* nix UNUSED)
return NULL;
}
+#define IMGCOUNT 5
+static void* saveAllCacheMaps(void* nix UNUSED)
+{
+ static ticks nextSave;
+ dnbd3_image_t *list[IMGCOUNT];
+ int count = 0;
+ declare_now;
+ bool full = timing_reached( &nextSave, &now );
+ mutex_lock( &imageListLock );
+ for ( int i = 0; i < _num_images; ++i ) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image->mapDirty ) {
+ // Flag is set if integrity checker found a problem - save out
+ image->users++;
+ list[count++] = image;
+ image->mapDirty = false;
+ } else {
+ // Otherwise, consider longer timeout and byte count limits of uplink
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink != NULL ) {
+ assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+ uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+ if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES
+ || ( full && diff != 0 ) ) {
+ image->users++;
+ list[count++] = image;
+ uplink->bytesReceivedLastSave = uplink->bytesReceived;
+ }
+ ref_put( &uplink->reference );
+ }
+ }
+ if ( count == IMGCOUNT )
+ break;
+ }
+ mutex_unlock( &imageListLock );
+ if ( full && count < IMGCOUNT ) {
+ // Only update nextSave once we handled all images in the list
+ timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
+ }
+ for ( int i = 0; i < count; ++i ) {
+ saveCacheMap( list[i] );
+ image_release( list[i] );
+ }
+ return NULL;
+}
+#undef IMGCOUNT
+
+/**
+ * Saves the cache map of the given image.
+ * Return true on success.
+ * @param image the image
+ */
+static bool saveCacheMap(dnbd3_image_t *image)
+{
+ if ( !_isProxy )
+ return true; // Nothing to do
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL )
+ return true; // Nothing to do
+ // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+ // for which we have any upstream servers configured. If there's none, don't touch
+ // the cache map on disk.
+ if ( !altservers_imageHasAltServers( image->name ) ) {
+ ref_put( &cache->reference );
+ return true; // Nothing to do
+ }
+
+ logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
+ const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+ char mapfile[strlen( image->path ) + 4 + 1];
+ strcpy( mapfile, image->path );
+ strcat( mapfile, ".map" );
+
+ int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
+ if ( fd == -1 ) {
+ const int err = errno;
+ ref_put( &cache->reference );
+ logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+ return false;
+ }
+
+ // On Linux we could use readFd, but in general it's not guaranteed to work
+ int imgFd = open( image->path, O_WRONLY );
+ if ( imgFd == -1 ) {
+ logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno );
+ } else {
+ if ( fsync( imgFd ) == -1 ) {
+ logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno );
+ dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize);
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( old == NULL ) {
+ // Could not load old map. FS might be toast.
+ logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." );
+ memset( cache->map, 0, mapSize );
+ } else {
+ // AND the maps together to be safe
+ for ( int i = 0; i < mapSize; ++i ) {
+ cache->map[i] &= old->map[i];
+ }
+ old->reference.free( &old->reference );
+ }
+ }
+ close( imgFd );
+ }
+
+ // Write current map to file
+ size_t done = 0;
+ while ( done < size ) {
+ const ssize_t ret = write( fd, cache->map + done, size - done );
+ if ( ret == -1 ) {
+ if ( errno == EINTR ) continue;
+ logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+ break;
+ }
+ if ( ret <= 0 ) {
+ logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+ break;
+ }
+ done += (size_t)ret;
+ }
+ ref_put( &cache->reference );
+ if ( fsync( fd ) == -1 ) {
+ logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+ }
+ close( fd );
+ // TODO fsync on parent directory
+ return true;
+}
+
static void allocCacheMap(dnbd3_image_t *image, bool complete)
{
const uint8_t val = complete ? 0xff : 0;
@@ -1846,4 +1981,3 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
}
mutex_unlock( &image->lock );
}
-
diff --git a/src/server/image.h b/src/server/image.h
index 89791fc..4614c74 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -49,6 +49,8 @@ void image_closeUnusedFd();
bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
+bool image_saveCacheMap(dnbd3_image_t *image);
+
// one byte in the map covers 8 4kib blocks, so 32kib per byte
// "+ (1 << 15) - 1" is required to account for the last bit of
// the image that is smaller than 32kib
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 97cb2a9..e5ab9c0 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -57,7 +57,6 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink);
static void uplink_addCrc32(dnbd3_uplink_t *uplink);
static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
@@ -103,6 +102,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
uplink->image = image;
uplink->bytesReceived = 0;
+ uplink->bytesReceivedLastSave = 0;
uplink->idleTime = 0;
uplink->queueLen = 0;
uplink->cacheFd = -1;
@@ -445,7 +445,6 @@ static void* uplink_mainloop(void *data)
int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
int rttTestResult;
uint32_t discoverFailCount = 0;
- uint32_t unsavedSeconds = 0;
ticks nextAltCheck, lastKeepalive;
char buffer[200];
memset( events, 0, sizeof(events) );
@@ -561,12 +560,6 @@ static void* uplink_mainloop(void *data)
if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
lastKeepalive = now;
uplink->idleTime += timepassed;
- unsavedSeconds += timepassed;
- if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) {
- // fsync/save every 4 minutes, or every 60 seconds if uplink is idle
- unsavedSeconds = 0;
- uplink_saveCacheMap( uplink );
- }
// Keep-alive
if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
// Send keep-alive if nothing is happening, and try to trigger background rep.
@@ -639,9 +632,9 @@ static void* uplink_mainloop(void *data)
}
#endif
}
- cleanup: ;
- uplink_saveCacheMap( uplink );
+cleanup: ;
dnbd3_image_t *image = uplink->image;
+ image->mapDirty = true; // Force writeout of cache map
mutex_lock( &image->lock );
bool exp = false;
if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
@@ -1135,69 +1128,6 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
return uplink->cacheFd != -1;
}
-/**
- * Saves the cache map of the given image.
- * Return true on success.
- * Locks on: imageListLock, image.lock
- */
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
-{
- dnbd3_image_t *image = uplink->image;
- assert( image != NULL );
-
- if ( uplink->cacheFd != -1 ) {
- if ( fsync( uplink->cacheFd ) == -1 ) {
- // A failing fsync means we have no guarantee that any data
- // since the last fsync (or open if none) has been saved. Apart
- // from keeping the cache map from the last successful fsync
- // around and restoring it there isn't much we can do to recover
- // a consistent state. Bail out.
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
- logadd( LOG_ERROR, "Bailing out immediately" );
- exit( 1 );
- }
- }
-
- dnbd3_cache_map_t *cache = ref_get_cachemap( image );
- if ( cache == NULL )
- return true;
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
- const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
- assert( image->path != NULL );
- char mapfile[strlen( image->path ) + 4 + 1];
- strcpy( mapfile, image->path );
- strcat( mapfile, ".map" );
-
- int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
- if ( fd == -1 ) {
- const int err = errno;
- ref_put( &cache->reference );
- logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
- return false;
- }
-
- size_t done = 0;
- while ( done < size ) {
- const ssize_t ret = write( fd, cache->map + done, size - done );
- if ( ret == -1 ) {
- if ( errno == EINTR ) continue;
- logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
- break;
- }
- if ( ret <= 0 ) {
- logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
- break;
- }
- done += (size_t)ret;
- }
- ref_put( &cache->reference );
- if ( fsync( fd ) == -1 ) {
- logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
- }
- close( fd );
- return true;
-}
-
static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
{
return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 239f0a2..5c7301d 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -17,7 +17,10 @@
#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients
#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
-#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
+// Wait a maximum of 5 minutes before saving cache map (if data was received at all)
+#define CACHE_MAP_MAX_SAVE_DELAY 300
+// If more than 500MB have been received from uplink without saving cache map, do so
+#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000)
// Time in ms to wait for a read/write call to complete on an uplink connection
#define SOCKET_TIMEOUT_UPLINK 5000