From 69f5bf408b9587a6e2008fba2224c2d506f1a895 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Tue, 27 Aug 2019 16:13:07 +0200 Subject: [SERVER] Use reference counting for uplink First step towards less locking for proxy mode --- src/server/reference.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/server/reference.h (limited to 'src/server/reference.h') diff --git a/src/server/reference.h b/src/server/reference.h new file mode 100644 index 0000000..0bc081a --- /dev/null +++ b/src/server/reference.h @@ -0,0 +1,54 @@ +#ifndef _REFERENCE_H_ +#define _REFERENCE_H_ + +#include "reftypes.h" +#include +#include + +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - (char *)&(((type *)NULL)->member))) + +void ref_init( ref *reference, void ( *freefun )( ref * ), long count ); + +void ref_setref( weakref *weakref, ref *ref ); + +_Noreturn void _ref_error( const char *message ); + +static inline ref *ref_get( weakref *weakref ) +{ + char *old_weakref = (char *)*weakref; + do { + if ( old_weakref == NULL ) + return NULL; + if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) { + old_weakref = (char *)*weakref; + continue; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) ); + struct _ref_ *ref = aligned_ref( old_weakref )->ref; + if ( unlikely( ++ref->count == -1 ) ) { + _ref_error( "Reference counter overflow. Aborting.\n" ); + } + char *cur_weakref = ( char * )*weakref; + do { + if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) { + ref->count--; + break; + } + } while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) ); + return ref; +} + +static inline void ref_put( ref *ref ) +{ + if ( --ref->count == 0 ) { + ref->free( ref ); + } +} + +#define ref_get_uplink(wr) ({ \ + ref* ref = ref_get( wr ); \ + ref == NULL ? 
NULL : container_of(ref, dnbd3_uplink_t, reference); \ }) #endif -- cgit v1.2.3-55-g7522 From 291eba00d392e17925576ead20b781d774e68134 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 29 Aug 2019 14:48:58 +0200 Subject: [SERVER] reference: Fix error msg usage --- src/server/reference.c | 2 +- src/server/reference.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/server/reference.h') diff --git a/src/server/reference.c b/src/server/reference.c index 468e00b..64109ca 100644 --- a/src/server/reference.c +++ b/src/server/reference.c @@ -13,7 +13,7 @@ void ref_init( ref *reference, void ( *freefun )( ref * ), long count ) _Noreturn void _ref_error( const char *message ) { - fprintf( stderr, "Reference counter overflow\n" ); + fprintf( stderr, "%s\n", message ); abort(); } diff --git a/src/server/reference.h b/src/server/reference.h index 0bc081a..8883eb1 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -27,7 +27,7 @@ static inline ref *ref_get( weakref *weakref ) } while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) ); struct _ref_ *ref = aligned_ref( old_weakref )->ref; if ( unlikely( ++ref->count == -1 ) ) { - _ref_error( "Reference counter overflow. Aborting.\n" ); + _ref_error( "Reference counter overflow. Aborting." ); } char *cur_weakref = ( char * )*weakref; do { -- cgit v1.2.3-55-g7522 From 88695877f085af475a6ca8a01c2fbb08eb5b15da Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Thu, 29 Aug 2019 14:49:18 +0200 Subject: [SERVER] Use weakref for cache maps Gets rid of a bunch of locking, especially the hot path in net.c where clients are requesting data. Many clients using the same incomplete image previously created a bottleneck here.
--- src/server/globals.h | 10 ++- src/server/image.c | 208 +++++++++++++++++++++++++++++++------------------ src/server/image.h | 2 +- src/server/integrity.c | 10 ++- src/server/net.c | 81 +++++++++---------- src/server/reference.h | 5 ++ src/server/uplink.c | 64 +++++++-------- 7 files changed, 220 insertions(+), 160 deletions(-) (limited to 'src/server/reference.h') diff --git a/src/server/globals.h b/src/server/globals.h index f940666..221af78 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -99,6 +99,12 @@ typedef struct int permissions; } dnbd3_access_rule_t; +typedef struct +{ + ref reference; + atomic_uint_least8_t map[]; +} dnbd3_cache_map_t; + /** * Image struct. An image path could be something like * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1 @@ -110,7 +116,7 @@ struct _dnbd3_image char *path; // absolute path of the image char *name; // public name of the image (usually relative path minus revision ID) weakref uplinkref; // pointer to a server connection - uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete + weakref ref_cacheMap; // cache map telling which parts are locally cached, NULL if complete uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k) uint64_t realFilesize; // actual file size on disk ticks atime; // last access time @@ -119,7 +125,7 @@ struct _dnbd3_image uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image uint32_t masterCrc32; // CRC-32 of the crc-32 list int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al) - int completenessEstimate; // Completeness estimate in percent + atomic_int completenessEstimate; // Completeness estimate in percent atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock. int id; // Unique ID of this image. 
Only unique in the context of this running instance of DNBD3-Server atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected diff --git a/src/server/image.c b/src/server/image.c index 4eab1d2..1972f48 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -51,10 +51,18 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); static bool image_ensureDiskSpace(uint64_t size, bool force); -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map); static void* closeUnusedFds(void*); +static void allocCacheMap(dnbd3_image_t *image, bool complete); + +static void cmfree(ref *ref) +{ + dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference); + logadd( LOG_DEBUG2, "Freeing a cache map" ); + free( cache ); +} // ########################################## @@ -70,7 +78,6 @@ void image_serverStartup() /** * Update cache-map of given image for the given byte range * start (inclusive) - end (exclusive) - * Locks on: images[].lock */ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) { @@ -91,33 +98,55 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co if ( start >= end ) return; bool setNewBlocks = false; - uint64_t pos = start; - mutex_lock( 
&image->lock ); - if ( image->cache_map == NULL ) { + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { // Image seems already complete if ( set ) { // This makes no sense - mutex_unlock( &image->lock ); - logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path ); return; } // Recreate a cache map, set it to all 1 initially as we assume the image was complete - const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - image->cache_map = malloc( byteSize ); - memset( image->cache_map, 0xff, byteSize ); - } - while ( pos < end ) { - const size_t map_y = (int)( pos >> 15 ); - const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 - const int bit_mask = 1 << map_x; - if ( set ) { - if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; - image->cache_map[map_y] |= (uint8_t)bit_mask; - } else { - image->cache_map[map_y] &= (uint8_t)~bit_mask; + allocCacheMap( image, true ); + cache = ref_get_cachemap( image ); + if ( cache == NULL ) { + logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! 
%s", image->path ); + return; } - pos += DNBD3_BLOCK_SIZE; } + // Set/unset + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + // First byte + uint8_t fb = 0, lb = 0; + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + fb |= bit_mask; + } + // Last byte + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + lb |= bit_mask; + } + if ( set ) { + uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed ); + uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed ); + setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] ); + } else { + atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed ); + atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed ); + } + const uint8_t nval = set ? 
0xff : 0; + // Everything in between + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) { + setNewBlocks = true; + } + } + atomic_thread_fence( memory_order_release ); if ( setNewBlocks && image->crc32 != NULL ) { // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks // for checking, even though this might lead to checking some hash block again, if it was @@ -125,19 +154,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co // First set start and end to borders of hash blocks start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); - pos = start; - while ( pos < end ) { - if ( image->cache_map == NULL ) break; + for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) { const int block = (int)( pos / HASH_BLOCK_SIZE ); - if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { - mutex_unlock( &image->lock ); + if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) { integrity_check( image, block ); - mutex_lock( &image->lock ); } - pos += HASH_BLOCK_SIZE; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); } /** @@ -149,20 +173,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co bool image_isComplete(dnbd3_image_t *image) { assert( image != NULL ); - mutex_lock( &image->lock ); if ( image->virtualFilesize == 0 ) { - mutex_unlock( &image->lock ); return false; } - if ( image->cache_map == NULL ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { return true; } bool complete = true; int j; const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); for (j = 0; j < map_len_bytes - 1; ++j) { - if ( image->cache_map[j] != 0xFF ) { + if ( cache->map[j] != 
0xFF ) { complete = false; break; } @@ -177,18 +199,27 @@ bool image_isComplete(dnbd3_image_t *image) for (j = 0; j < blocks_in_last_byte; ++j) last_byte |= (uint8_t)(1 << j); } - complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte); } - if ( !complete ) { - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); + if ( !complete ) return false; + mutex_lock( &image->lock ); + // Lock and make sure current cache map is still the one we saw complete + dnbd3_cache_map_t *current = ref_get_cachemap( image ); + if ( current == cache ) { + // Set cache map NULL as it's complete + ref_setref( &image->ref_cacheMap, NULL ); + } + if ( current != NULL ) { + ref_put( ¤t->reference ); } - char mapfile[PATHLEN] = ""; - free( image->cache_map ); - image->cache_map = NULL; - snprintf( mapfile, PATHLEN, "%s.map", image->path ); mutex_unlock( &image->lock ); - unlink( mapfile ); + if ( current == cache ) { // Successfully set cache map to NULL above + char mapfile[PATHLEN] = ""; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + unlink( mapfile ); + } return true; } @@ -350,19 +381,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->rid = candidate->rid; img->users = 1; img->working = false; + img->ref_cacheMap = NULL; mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); img->crc32 = malloc( mb ); memcpy( img->crc32, candidate->crc32, mb ); } - mutex_lock( &candidate->lock ); - if ( candidate->cache_map != NULL ) { - const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); - img->cache_map = malloc( mb ); - memcpy( img->cache_map, candidate->cache_map, mb ); + dnbd3_cache_map_t *cache = ref_get_cachemap( candidate ); + if ( cache != NULL ) { + ref_setref( &img->ref_cacheMap, &cache->reference ); + ref_put( 
&cache->reference ); } - mutex_unlock( &candidate->lock ); if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; @@ -377,7 +407,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) } // Check if image is incomplete, handle - if ( candidate->cache_map != NULL ) { + if ( candidate->ref_cacheMap != NULL ) { uplink_init( candidate, -1, NULL, -1 ); } @@ -585,11 +615,10 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) if ( !uplink_shutdown( image ) ) return NULL; mutex_lock( &image->lock ); - free( image->cache_map ); + ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); free( image->path ); free( image->name ); - image->cache_map = NULL; image->crc32 = NULL; image->path = NULL; image->name = NULL; @@ -600,7 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image) return NULL ; } -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) { if ( cacheMap == NULL ) return true; const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; @@ -707,7 +736,7 @@ static bool image_load(char *base, char *path, int withUplink) { int revision = -1; struct stat st; - uint8_t *cache_map = NULL; + dnbd3_cache_map_t *cache = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; int fdImage = -1; @@ -790,7 +819,7 @@ static bool image_load(char *base, char *path, int withUplink) } // 1. 
Allocate memory for the cache map if the image is incomplete - cache_map = image_loadCacheMap( path, virtualFilesize ); + cache = image_loadCacheMap( path, virtualFilesize ); // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) @@ -802,7 +831,7 @@ static bool image_load(char *base, char *path, int withUplink) // Check CRC32 if ( crc32list != NULL ) { - if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { + if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) { logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path ); doFullCheck = true; } @@ -826,7 +855,7 @@ static bool image_load(char *base, char *path, int withUplink) crc32list = NULL; function_return = true; goto load_error; // Keep existing - } else if ( existing->cache_map != NULL && cache_map == NULL ) { + } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); function_return = true; @@ -846,7 +875,8 @@ static bool image_load(char *base, char *path, int withUplink) dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); image->path = strdup( path ); image->name = strdup( imgName ); - image->cache_map = cache_map; + image->ref_cacheMap = NULL; + ref_setref( &image->ref_cacheMap, &cache->reference ); image->crc32 = crc32list; image->masterCrc32 = masterCrc; image->uplinkref = NULL; @@ -855,7 +885,7 @@ static bool image_load(char *base, char *path, int withUplink) image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = (image->cache_map == NULL ); + image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; mutex_init( &image->lock, LOCK_IMAGE ); @@ -870,16 +900,16 @@ static bool 
image_load(char *base, char *path, int withUplink) timing_gets( &image->atime, offset ); // Prevent freeing in cleanup - cache_map = NULL; + cache = NULL; crc32list = NULL; // Get rid of cache map if image is complete - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image_isComplete( image ); } // Image is definitely incomplete, initialize uplink worker - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image->working = false; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); @@ -910,21 +940,22 @@ static bool image_load(char *base, char *path, int withUplink) load_error: ; if ( existing != NULL ) existing = image_release( existing ); if ( crc32list != NULL ) free( crc32list ); - if ( cache_map != NULL ) free( cache_map ); + if ( cache != NULL ) free( cache ); if ( fdImage != -1 ) close( fdImage ); return function_return; } -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) { - uint8_t *retval = NULL; + dnbd3_cache_map_t *retval = NULL; char mapFile[strlen( imagePath ) + 10 + 1]; sprintf( mapFile, "%s.map", imagePath ); int fdMap = open( mapFile, O_RDONLY ); - if ( fdMap >= 0 ) { + if ( fdMap != -1 ) { const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); - retval = calloc( 1, map_size ); - const ssize_t rd = read( fdMap, retval, map_size ); + retval = calloc( 1, sizeof(*retval) + map_size ); + ref_init( &retval->reference, cmfree, 0 ); + const ssize_t rd = read( fdMap, retval->map, map_size ); if ( map_size != rd ) { logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); // Could not read complete map, that means the rest of the image file will be considered incomplete @@ -985,7 +1016,7 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static bool 
image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map) { // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. This is very sloppy and is merely supposed to detect @@ -1529,30 +1560,37 @@ json_t* image_getListAsJson() /** * Get completeness of an image in percent. Only estimated, not exact. * Returns: 0-100 - * DOES NOT LOCK, so make sure to do so before calling */ int image_getCompletenessEstimate(dnbd3_image_t * const image) { assert( image != NULL ); - if ( image->cache_map == NULL ) return image->working ? 100 : 0; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return image->working ? 100 : 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( unlikely( len == 0 ) ) { + ref_put( &cache->reference ); + return 0; + } declare_now; if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { // Since this operation is relatively expensive, we cache the result for a while + ref_put( &cache->reference ); return image->completenessEstimate; } int i; int percent = 0; - const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - if ( len == 0 ) return 0; for ( i = 0; i < len; ++i ) { - if ( image->cache_map[i] == 0xff ) { + const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + if ( v == 0xff ) { percent += 100; - } else if ( image->cache_map[i] != 0 ) { + } else if ( v != 0 ) { percent += 50; } } + ref_put( &cache->reference ); image->completenessEstimate = percent / len; - timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 ); return image->completenessEstimate; } @@ -1744,3 +1782,21 @@ static 
void* closeUnusedFds(void* nix UNUSED) } return NULL; } + +static void allocCacheMap(dnbd3_image_t *image, bool complete) +{ + const uint8_t val = complete ? 0xff : 0; + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize ); + ref_init( &cache->reference, cmfree, 0 ); + memset( cache->map, val, byteSize ); + mutex_lock( &image->lock ); + if ( image->ref_cacheMap != NULL ) { + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + free( cache ); + } else { + ref_setref( &image->ref_cacheMap, &cache->reference ); + } + mutex_unlock( &image->lock ); +} + diff --git a/src/server/image.h b/src/server/image.h index 4668eff..cd87f03 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -9,7 +9,7 @@ void image_serverStartup(); bool image_isComplete(dnbd3_image_t *image); -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set); diff --git a/src/server/integrity.c b/src/server/integrity.c index 1fcb558..a9fbae6 100644 --- a/src/server/integrity.c +++ b/src/server/integrity.c @@ -181,10 +181,12 @@ static void* integrity_main(void * data UNUSED) const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); bool complete = true; if ( qCount == CHECK_ALL ) { - // When checking full image, skip incomplete blocks, otherwise assume block is complete - mutex_lock( &image->lock ); - complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize ); - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache != NULL ) { + // When checking full image, skip incomplete blocks, 
otherwise assume block is complete + complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize ); + ref_put( &cache->reference ); + } } #if defined(linux) || defined(__linux) while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) diff --git a/src/server/net.c b/src/server/net.c index 9c855e4..12bcdad 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -246,7 +246,7 @@ void* net_handleNewConnection(void *clientPtr) // We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does... // Reject, as this would basically force this proxy to do BGR too. image = image_get( image_name, rid, true ); - if ( image != NULL && image->cache_map != NULL ) { + if ( image != NULL && image->ref_cacheMap != NULL ) { // Only exception is if the image is complete locally image = image_release( image ); } @@ -268,7 +268,7 @@ void* net_handleNewConnection(void *clientPtr) } else { // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable bOk = true; - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { bOk = ( rand() % 4 ) == 1; @@ -338,57 +338,52 @@ void* net_handleNewConnection(void *clientPtr) break; } - if ( request.size != 0 && image->cache_map != NULL ) { + dnbd3_cache_map_t *cache; + if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) { // This is a proxyed image, check if we need to relay the request... 
start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); bool isCached = true; - mutex_lock( &image->lock ); - // Check again as we only aquired the lock just now - if ( image->cache_map != NULL ) { - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - uint64_t pos; - // Middle - quick checking - if ( isCached ) { - pos = firstByteInMap + 1; - while ( pos < lastByteInMap ) { - if ( image->cache_map[pos] != 0xff ) { - isCached = false; - break; - } - ++pos; + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + uint8_t b; + atomic_thread_fence( memory_order_acquire ); + // Middle - quick checking + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + isCached = false; + break; } } - // First byte - if ( isCached ) { - pos = start; - do { - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; - } while ( firstByteInMap == (pos >> 15) && pos < end ); + } + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; + } } - // Last byte - only check if request spans multiple bytes in cache map - if ( isCached && firstByteInMap != lastByteInMap ) { - pos = lastByteInMap << 15; - while ( pos < end ) { - assert( lastByteInMap == (pos >> 15) ); - const int map_x = (pos >> 12) & 7; // mod 8 - const uint8_t 
bit_mask = (uint8_t)( 1 << map_x ); - if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) { - isCached = false; - break; - } - pos += DNBD3_BLOCK_SIZE; + } + // Last byte - only check if request spans multiple bytes in cache map + if ( isCached && firstByteInMap != lastByteInMap ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + assert( lastByteInMap == (pos >> 15) ); + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (b & bit_mask) == 0 ) { + isCached = false; + break; } } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( !isCached ) { if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", diff --git a/src/server/reference.h b/src/server/reference.h index 8883eb1..2a80955 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -51,4 +51,9 @@ static inline void ref_put( ref *ref ) ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \ }) +#define ref_get_cachemap(image) ({ \ + ref* ref = ref_get( &(image)->ref_cacheMap ); \ + ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \ +}) + #endif diff --git a/src/server/uplink.c b/src/server/uplink.c index d77be9c..0a6bd11 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -91,7 +91,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version ref_put( &uplink->reference ); return true; // There's already an uplink, so should we consider this success or failure? 
} - if ( image->cache_map == NULL ) { + if ( image->ref_cacheMap == NULL ) { logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); goto failure; } @@ -170,7 +170,7 @@ bool uplink_shutdown(dnbd3_image_t *image) mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); mutex_unlock( &image->lock ); - return exp; + return retval; } /** @@ -214,7 +214,7 @@ static void uplink_free(ref *ref) dnbd3_image_t *image = image_lock( uplink->image ); if ( image != NULL ) { // != NULL means image is still in list... - if ( !_shutdown && image->cache_map != NULL ) { + if ( !_shutdown && image->ref_cacheMap != NULL ) { // Ingegrity checker must have found something in the meantime uplink_init( image, -1, NULL, 0 ); } @@ -707,13 +707,14 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) if ( uplink == NULL || uplink->current.fd == -1 ) return; if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return; + return; // Already a replication request on the wire, or no more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; - mutex_lock( &image->lock ); - if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { - // No cache map (=image complete), or replication pending, or not enough users, do nothing - mutex_unlock( &image->lock ); + if ( image->users < _bgrMinClients ) return; // Not enough active users + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL || image->users < _bgrMinClients ) { + // No cache map (=image complete) + ref_put( &cache->reference ); return; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); @@ -727,16 +728,18 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) endByte = 
mapBytes; } } + atomic_thread_fence( memory_order_acquire ); int replicationIndex = -1; for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { // Found incomplete one replicationIndex = i; break; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { // Nothing left in current block, find next one replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); @@ -768,23 +771,24 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) } /** - * find next index into cache_map that corresponds to the beginning + * find next index into cache map that corresponds to the beginning * of a hash block which is neither completely empty nor completely * replicated yet. Returns -1 if no match. 
*/ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex) { int retval = -1; - mutex_lock( &uplink->image->lock ); - const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); - const uint8_t *cache_map = uplink->image->cache_map; - if ( cache_map != NULL ) { - int j; + dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image ); + if ( cache != NULL ) { + const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize ); const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); + atomic_thread_fence( memory_order_acquire ); + int j; for (j = 0; j < mapBytes; ++j) { const int i = ( start + j ) % mapBytes; - const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); - const bool isEmpty = cache_map[i] == 0; + const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock ); + const bool isEmpty = b == 0; if ( !isEmpty && !isFull ) { // Neither full nor empty, replicate if ( retval == -1 ) { @@ -811,7 +815,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int retval = -1; } } - mutex_unlock( &uplink->image->lock ); + ref_put( &cache->reference ); return retval; } @@ -1107,7 +1111,7 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) if ( fsync( uplink->cacheFd ) == -1 ) { // A failing fsync means we have no guarantee that any data // since the last fsync (or open if none) has been saved. Apart - // from keeping the cache_map from the last successful fsync + // from keeping the cache map from the last successful fsync // around and restoring it there isn't much we can do to recover // a consistent state. Bail out. 
logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); @@ -1116,21 +1120,13 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } } - if ( image->cache_map == NULL ) return true; - logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); - mutex_lock( &image->lock ); - // Lock and get a copy of the cache map, as it could be freed by another thread that is just about to - // figure out that this image's cache copy is complete - if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) return true; - } + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); - uint8_t *map = malloc( size ); - memcpy( map, image->cache_map, size ); // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image, // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O - mutex_unlock( &image->lock ); assert( image->path != NULL ); char mapfile[strlen( image->path ) + 4 + 1]; strcpy( mapfile, image->path ); @@ -1139,14 +1135,14 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); if ( fd == -1 ) { const int err = errno; - free( map ); + ref_put( &cache->reference ); logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); return false; } size_t done = 0; while ( done < size ) { - const ssize_t ret = write( fd, map, size - done ); + const ssize_t ret = write( fd, cache->map + done, size - done ); if ( ret == -1 ) { if ( errno == EINTR ) continue; logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); @@ -1158,11 +1154,11 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink) } done += 
(size_t)ret; } + ref_put( &cache->reference ); if ( fsync( fd ) == -1 ) { logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); } close( fd ); - free( map ); return true; } -- cgit v1.2.3-55-g7522 From 0edf0a0888b1e40769e19eee031c2cefdcf37d26 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Mon, 2 Sep 2019 13:26:47 +0200 Subject: [SERVER] Fix compiler warnings --- src/server/altservers.c | 4 ++-- src/server/reference.h | 4 ++-- src/shared/protocol.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'src/server/reference.h') diff --git a/src/server/altservers.c b/src/server/altservers.c index ff3c95b..9e30cd0 100644 --- a/src/server/altservers.c +++ b/src/server/altservers.c @@ -140,7 +140,7 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output if ( altServers[i].host.type == 0 || altServers[i].isPrivate ) continue; // Slot is empty or uplink is for replication only if ( host->type == altServers[i].host.type ) { - scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host ); + scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) ); } else { scores[i] = 1; // Wrong address family } @@ -400,7 +400,7 @@ const dnbd3_host_t* altservers_indexToHost(int server) static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink) { const int ALTS = 4; - int ret, itAlt, numAlts, current; + int itAlt, numAlts, current; bool panic; int servers[ALTS + 1]; struct timespec start, end; diff --git a/src/server/reference.h b/src/server/reference.h index 2a80955..4eda546 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -46,12 +46,12 @@ static inline void ref_put( ref *ref ) } } -#define ref_get_uplink(wr) ({ \ +#define ref_get_uplink(wr) __extension__({ \ ref* ref = ref_get( wr ); \ ref == NULL ? 
NULL : container_of(ref, dnbd3_uplink_t, reference); \ }) -#define ref_get_cachemap(image) ({ \ +#define ref_get_cachemap(image) __extension__({ \ ref* ref = ref_get( &(image)->ref_cacheMap ); \ ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \ }) diff --git a/src/shared/protocol.h b/src/shared/protocol.h index 92dbe11..2b21c21 100644 --- a/src/shared/protocol.h +++ b/src/shared/protocol.h @@ -20,7 +20,7 @@ #define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 ) // 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var -#define SI_SERVER_FLAGS ( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) +#define SI_SERVER_FLAGS ( (uint8_t)( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) ) #define REPLY_OK (0) #define REPLY_ERRNO (-1) -- cgit v1.2.3-55-g7522 From 290d3478f245bb7d2112bb781286a9fbae42b983 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 13 Mar 2020 16:03:29 +0100 Subject: [SERVER] Rewrite uplink queue handling - Now uses linked lists instead of huge array - Does prefetch data on client requests - Can have multiple replication requests in-flight --- src/server/globals.c | 6 + src/server/globals.h | 35 ++- src/server/image.c | 3 +- src/server/image.h | 44 +++ src/server/net.c | 44 +-- src/server/reference.h | 5 + src/server/uplink.c | 771 +++++++++++++++++++++++++++---------------------- src/server/uplink.h | 2 +- src/serverconfig.h | 3 +- 9 files changed, 518 insertions(+), 395 deletions(-) (limited to 'src/server/reference.h') diff --git a/src/server/globals.c b/src/server/globals.c index ac079b1..98e0ddb 100644 --- a/src/server/globals.c +++ b/src/server/globals.c @@ -19,6 +19,7 @@ atomic_int _clientPenalty = 0; atomic_bool _isProxy = false; atomic_int _backgroundReplication = BGR_FULL; atomic_int _bgrMinClients = 0; +atomic_int _bgrWindowSize = 1; atomic_bool 
_lookupMissingForProxy = true; atomic_bool _sparseFiles = false; atomic_bool _ignoreAllocErrors = false; @@ -74,6 +75,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key SAVE_TO_VAR_BOOL( dnbd3, isProxy ); SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly ); SAVE_TO_VAR_INT( dnbd3, bgrMinClients ); + SAVE_TO_VAR_INT( dnbd3, bgrWindowSize ); SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy ); SAVE_TO_VAR_BOOL( dnbd3, sparseFiles ); SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors ); @@ -134,6 +136,9 @@ void globals_loadConfig() logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" ); _sparseFiles = false; } + if ( _bgrWindowSize < 1 ) { + _bgrWindowSize = 1; + } // Dump config as interpreted char buffer[2000]; globals_dumpConfig( buffer, sizeof(buffer) ); @@ -325,6 +330,7 @@ size_t globals_dumpConfig(char *buffer, size_t size) PBOOL(backgroundReplication); } PINT(bgrMinClients); + PINT(bgrWindowSize); PBOOL(lookupMissingForProxy); PBOOL(sparseFiles); PBOOL(ignoreAllocErrors); diff --git a/src/server/globals.h b/src/server/globals.h index 1bb6857..5cee92a 100644 --- a/src/server/globals.h +++ b/src/server/globals.h @@ -18,18 +18,27 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t; typedef struct _dnbd3_image dnbd3_image_t; typedef struct _dnbd3_client dnbd3_client_t; -typedef struct +typedef struct _dnbd3_queue_client { - uint64_t handle; // Client defined handle to pass back in reply - uint64_t from; // First byte offset of requested block (ie. 4096) - uint64_t to; // Last byte + 1 of requested block (ie. 
8192, if request len is 4096, resulting in bytes 4096-8191) + struct _dnbd3_queue_client *next; + uint64_t handle; // Handle used by client + uint64_t from, to; // Client range dnbd3_client_t * client; // Client to send reply to - int status; // status of this entry: ULR_* +} dnbd3_queue_client_t; + +typedef struct _dnbd3_queue_entry +{ + struct _dnbd3_queue_entry *next; + uint64_t handle; // Our handle for this entry + uint64_t from; // First byte offset of requested block (ie. 4096) + uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191) + dnbd3_queue_client_t *clients; #ifdef _DEBUG - ticks entered; // When this request entered the queue (for debugging) + ticks entered; // When this request entered the queue (for debugging) #endif - uint8_t hopCount; // How many hops this request has already taken across proxies -} dnbd3_queued_request_t; + uint8_t hopCount; // How many hops this request has already taken across proxies + bool sent; // Already sent to uplink? +} dnbd3_queue_entry_t; typedef struct _ns { @@ -91,12 +100,12 @@ struct _dnbd3_uplink bool cycleDetected; // connection cycle between proxies detected for current remote server int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block" - uint64_t replicationHandle; // Handle of pending replication request atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup. 
atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map int queueLen; // length of queue uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives) - dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE]; + dnbd3_queue_entry_t *queue; + atomic_uint_fast32_t queueId; dnbd3_alt_local_t altData[SERVER_MAX_ALTS]; }; @@ -156,6 +165,7 @@ struct _dnbd3_client atomic_uint_fast64_t bytesSent; // Byte counter for this client. dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake int sock; + _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server bool isServer; // true if a server in proxy mode, false if real client dnbd3_host_t host; char hostName[HOSTNAMELEN]; // inet_ntop version of host @@ -242,6 +252,11 @@ extern atomic_int _backgroundReplication; */ extern atomic_int _bgrMinClients; +/** + * How many in-flight replication requests we should target (per uplink) + */ +extern atomic_int _bgrWindowSize; + /** * (In proxy mode): If connecting client is a proxy, and the requested image * is not known locally, should we ask our known alt servers for it? diff --git a/src/server/image.c b/src/server/image.c index 86b6374..81ec479 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -340,7 +340,6 @@ dnbd3_image_t* image_byId(int imgId) dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen) { int i; - const char *removingText = _removeMissingImages ? ", removing from list" : ""; dnbd3_image_t *candidate = NULL; // Simple sanity check const size_t slen = strlen( name ); @@ -1895,7 +1894,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED) // We're not replicating this image, if there's a cache map, reload // it periodically, since we might read from a shared storage that // another server instance is writing to. 
- if ( full || !cache->unchanged && !image->problem.read ) { + if ( full || ( !cache->unchanged && !image->problem.read ) ) { logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) ); dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize); if ( onDisk == NULL ) { diff --git a/src/server/image.h b/src/server/image.h index 4614c74..b23711b 100644 --- a/src/server/image.h +++ b/src/server/image.h @@ -51,6 +51,50 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force); bool image_saveCacheMap(dnbd3_image_t *image); +/** + * Check if given range is cached. Be careful when using this function because: + * 1) you need to hold a reference to the cache map + * 2) start and end are assumed to be 4k aligned + * 3) start and end are not checked to be in bounds (we don't know the image in this context) + */ +static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end) +{ + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); + const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); + uint64_t pos; + uint8_t b; + bool isCached; + if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler + b = cache->map[firstByteInMap]; + isCached = ( b & ( fb & lb ) ) == ( fb & lb ); + } else { + isCached = true; + atomic_thread_fence( memory_order_acquire ); + // First byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); + isCached = ( ( b & fb ) == fb ); + } + // Last byte + if ( isCached ) { + b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); + isCached = ( ( b & lb ) == lb ); + } + // Middle, must be all bits set (0xff) + if ( isCached ) { + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { + 
isCached = false; + break; + } + } + } + } + return isCached; +} + // one byte in the map covers 8 4kib blocks, so 32kib per byte // "+ (1 << 15) - 1" is required to account for the last bit of // the image that is smaller than 32kib diff --git a/src/server/net.c b/src/server/net.c index 954cb8a..9ba9dbc 100644 --- a/src/server/net.c +++ b/src/server/net.c @@ -197,6 +197,7 @@ void* net_handleNewConnection(void *clientPtr) client->hostName[HOSTNAMELEN-1] = '\0'; mutex_unlock( &client->lock ); client->bytesSent = 0; + client->relayedCount = 0; if ( !addToList( client ) ) { freeClientStruct( client ); @@ -344,41 +345,18 @@ void* net_handleNewConnection(void *clientPtr) // This is a proxyed image, check if we need to relay the request... const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint64_t firstByteInMap = start >> 15; - const uint64_t lastByteInMap = (end - 1) >> 15; - const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); - const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); - uint64_t pos; - uint8_t b; - bool isCached; - if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler - b = cache->map[firstByteInMap]; - isCached = ( b & ( fb & lb ) ) == ( fb & lb ); - } else { - isCached = true; - atomic_thread_fence( memory_order_acquire ); - // First byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed ); - isCached = ( ( b & fb ) == fb ); - } - // Last byte - if ( isCached ) { - b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed ); - isCached = ( ( b & lb ) == lb ); - } - // Middle, must be all bits set (0xff) - if ( isCached ) { - for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { - if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) { - isCached = false; - break; - } + if ( 
!image_isRangeCachedUnsafe( cache, start, end ) ) { + if ( unlikely( client->relayedCount > 250 ) ) { + logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" ); + for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount > 250 ) { + logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" ); + goto exit_client_cleanup; } } - } - if ( !isCached ) { - if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { + if ( !uplink_request( NULL, client, request.handle, offset, request.size, request.hops ) ) { logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d", client->hostName, image->name, image->rid ); goto exit_client_cleanup; diff --git a/src/server/reference.h b/src/server/reference.h index 4eda546..75a681f 100644 --- a/src/server/reference.h +++ b/src/server/reference.h @@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref ) return ref; } +static inline void ref_inc( ref *ref ) +{ + ++ref->count; +} + static inline void ref_put( ref *ref ) { if ( --ref->count == 0 ) { diff --git a/src/server/uplink.c b/src/server/uplink.c index 7c7cd1c..188bf06 100644 --- a/src/server/uplink.c +++ b/src/server/uplink.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "threadpool.h" #include "reference.h" #include @@ -21,30 +22,6 @@ #define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE ) #define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) ) -#define REP_NONE ( (uint64_t)0xffffffffffffffff ) - -// Status of request in queue - -// Slot is free, can be used. -// Must only be set in uplink_handle_receive() or uplink_remove_client() -#define ULR_FREE 0 -// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse. 
-// Must only be set in uplink_request() -#define ULR_NEW 1 -// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse. -// Must only be set in uplink_mainloop() or uplink_request() -#define ULR_PENDING 2 -// Slot is being processed, do not consider for hop on. -// Must only be set in uplink_handle_receive() -#define ULR_PROCESSING 3 - -static const char *const NAMES_ULR[4] = { - [ULR_FREE] = "ULR_FREE", - [ULR_NEW] = "ULR_NEW", - [ULR_PENDING] = "ULR_PENDING", - [ULR_PROCESSING] = "ULR_PROCESSING", -}; - static atomic_uint_fast64_t totalBytesReceived = 0; static void cancelAllRequests(dnbd3_uplink_t *uplink); @@ -59,6 +36,15 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink); static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force); static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink); static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew); +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink); +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle); +static void *prefetchForClient(void *data); + +typedef struct { + dnbd3_uplink_t *uplink; + uint64_t start; + uint32_t length; +} prefetch_request_t; // ############ uplink connection handling @@ -106,6 +92,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version uplink->bytesReceived = 0; uplink->bytesReceivedLastSave = 0; uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90; + uplink->queue = NULL; uplink->queueLen = 0; uplink->cacheFd = -1; uplink->signal = signal_new(); @@ -113,7 +100,6 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." 
); goto failure; } - uplink->replicationHandle = REP_NONE; mutex_lock( &uplink->rttLock ); mutex_lock( &uplink->sendMutex ); uplink->current.fd = -1; @@ -175,9 +161,9 @@ bool uplink_shutdown(dnbd3_image_t *image) } cancelAllRequests( uplink ); ref_setref( &image->uplinkref, NULL ); - ref_put( &uplink->reference ); mutex_unlock( &uplink->queueLock ); bool retval = ( exp && image->users == 0 ); + ref_put( &uplink->reference ); mutex_unlock( &image->lock ); return retval; } @@ -188,12 +174,21 @@ bool uplink_shutdown(dnbd3_image_t *image) */ static void cancelAllRequests(dnbd3_uplink_t *uplink) { - for ( int i = 0; i < uplink->queueLen; ++i ) { - if ( uplink->queue[i].status != ULR_FREE ) { - net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle ); - uplink->queue[i].status = ULR_FREE; + dnbd3_queue_entry_t *it = uplink->queue; + while ( it != NULL ) { + dnbd3_queue_client_t *cit = it->clients; + while ( cit != NULL ) { + net_sendReply( cit->client, CMD_ERROR, cit->handle ); + cit->client->relayedCount--; + dnbd3_queue_client_t *next = cit->next; + free( cit ); + cit = next; } + dnbd3_queue_entry_t *next = it->next; + free( it ); + it = next; } + uplink->queue = NULL; uplink->queueLen = 0; uplink->image->problem.queue = false; } @@ -234,39 +229,54 @@ static void uplink_free(ref *ref) */ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client) { + if ( client->relayedCount == 0 ) + return; mutex_lock( &uplink->queueLock ); - for (int i = uplink->queueLen - 1; i >= 0; --i) { - if ( uplink->queue[i].client == client ) { - // Make sure client doesn't get destroyed while we're sending it data - mutex_lock( &client->sendMutex ); - mutex_unlock( &client->sendMutex ); - uplink->queue[i].client = NULL; - uplink->queue[i].status = ULR_FREE; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) { + if ( (**cit).client == client ) { + 
--client->relayedCount; + dnbd3_queue_client_t *entry = *cit; + *cit = (**cit).next; + free( entry ); + } } - if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--; } mutex_unlock( &uplink->queueLock ); + if ( unlikely( client->relayedCount != 0 ) ) { + logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount ); + int i; + for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) { + usleep( 10000 ); + } + if ( client->relayedCount != 0 ) { + logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount ); + } + } } /** - * Request a chunk of data through an uplink server - * Locks on: image.lock, uplink.queueLock + * Request a chunk of data through an uplink server. Either uplink or client has to be non-NULL. + * If client is NULL, this is assumed to be a background replication request. + * Locks on: uplink.queueLock, uplink.sendMutex */ -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops) { - if ( client == NULL || client->image == NULL ) - return false; + bool getUplink = ( uplink == NULL ); + assert( client != NULL || uplink != NULL ); if ( length > (uint32_t)_maxPayload ) { logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length ); return false; } - dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref ); - if ( unlikely( uplink == NULL ) ) { - uplink_init( client->image, -1, NULL, -1 ); + if ( getUplink ) { uplink = ref_get_uplink( &client->image->uplinkref ); - if ( uplink == NULL ) { - logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); - return false; + if ( unlikely( uplink == NULL ) ) { + uplink_init( client->image, -1, NULL, -1 ); + uplink = ref_get_uplink( &client->image->uplinkref ); 
+ if ( uplink == NULL ) { + logadd( LOG_DEBUG1, "Uplink request for image with no uplink" ); + return false; + } } } if ( uplink->shutdown ) { @@ -275,163 +285,179 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin } // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain // This might be a false positive if there are multiple instances running on the same host (IP) - if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { + if ( client != NULL && hops != 0 + && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) { uplink->cycleDetected = true; signal_call( uplink->signal ); logadd( LOG_WARNING, "Proxy cycle detected (same host)." ); goto fail_ref; } - int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise - int existingType = -1; // ULR_* type of existing request - int i; - int freeSlot = -1; - int firstUsedSlot = -1; - bool requestLoop = false; - const uint64_t end = start + length; - - mutex_lock( &uplink->queueLock ); - if ( uplink->shutdown ) { // Check again after locking to prevent lost requests - goto fail_lock; - } - for (i = 0; i < uplink->queueLen; ++i) { - // find free slot to place this request into - if ( uplink->queue[i].status == ULR_FREE ) { - if ( freeSlot == -1 || existingType != ULR_PROCESSING ) { - freeSlot = i; - } - continue; - } - if ( firstUsedSlot == -1 ) { - firstUsedSlot = i; - } - // find existing request to attach to - if ( uplink->queue[i].from > start || uplink->queue[i].to < end ) - continue; // Range not suitable - // Detect potential proxy cycle. 
New request hopcount is greater, range is same, old request has already been sent -> suspicious - if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) { - requestLoop = true; - break; - } - if ( foundExisting == -1 || existingType == ULR_PROCESSING ) { - foundExisting = i; - existingType = uplink->queue[i].status; - } - } - if ( unlikely( requestLoop ) ) { - uplink->cycleDetected = true; - signal_call( uplink->signal ); - logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops ); - goto fail_lock; - } - if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) { - freeSlot = -1; // Not attaching to existing request, make it use a higher slot - } - if ( freeSlot == -1 ) { - if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) { - logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." ); + struct { + uint64_t handle, start, end; + } req; + do { + const uint64_t end = start + length; + dnbd3_queue_entry_t *request = NULL, *last = NULL; + bool isNew; + mutex_lock( &uplink->queueLock ); + if ( uplink->shutdown ) { // Check again after locking to prevent lost requests goto fail_lock; } - freeSlot = uplink->queueLen++; - if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) { - uplink->image->problem.queue = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->from <= start && it->to >= end ) { + // Matching range, attach + request = it; + break; + } + if ( it->next == NULL ) { + // Not matching, last in list, remember + last = it; + break; + } } - } - // Do not send request to uplink server if we have a matching pending request AND the request either has the - // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. 
Otherwise - // explicitly send this request to the uplink server. The second condition mentioned here is to prevent - // a race condition where the reply for the outstanding request already arrived and the uplink thread - // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might - // already have passed the index of the free slot we determined, but not reached the existing request we just found above. - if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) { - foundExisting = -1; // -1 means "send request" - } -#ifdef _DEBUG - if ( foundExisting != -1 ) { - logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot ); - logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n" - "New %" PRIu64 "-%" PRIu64 " (%p)\n", - uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client, - start, end, (void*)client ); - } -#endif - // Fill structure - uplink->queue[freeSlot].from = start; - uplink->queue[freeSlot].to = end; - uplink->queue[freeSlot].handle = handle; - uplink->queue[freeSlot].client = client; - //int old = uplink->queue[freeSlot].status; - uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW : - ( existingType == ULR_NEW ? ULR_PENDING : existingType ) ); - uplink->queue[freeSlot].hopCount = hops; + dnbd3_queue_client_t **c; + if ( request == NULL ) { + // No existing request to attach to + if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) { + logadd( LOG_WARNING, "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." 
); + goto fail_lock; + } + uplink->queueLen++; + if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + uplink->image->problem.queue = true; + } + request = malloc( sizeof(*request) ); + if ( last == NULL ) { + uplink->queue = request; + } else { + last->next = request; + } + request->next = NULL; + request->handle = ++uplink->queueId; + request->from = start & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + request->to = (end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); #ifdef _DEBUG - timing_get( &uplink->queue[freeSlot].entered ); - //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end ); + timing_get( &request->entered ); #endif - mutex_unlock( &uplink->queueLock ); + request->hopCount = hops; + request->sent = true; // Optimistic; would be set to false on failure + if ( client == NULL ) { + // BGR + request->clients = NULL; + } else { + c = &request->clients; + } + isNew = true; + } else if ( client == NULL ) { + // Replication request that maches existing request. Do nothing + isNew = false; + } else { + // Existing request. 
Check if potential cycle + if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) { + logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) ); + goto fail_lock; + } + // Count number if clients, get tail of list + int count = 0; + c = &request->clients; + while ( *c != NULL ) { + c = &(**c).next; + if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) { + logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count ); + goto fail_lock; + } + } + isNew = false; + } + req.handle = request->handle; + req.start = request->from; + req.end = request->to; + if ( client != NULL ) { + *c = malloc( sizeof( *request->clients ) ); + (**c).next = NULL; + (**c).handle = handle; + (**c).from = start; + (**c).to = end; + (**c).client = client; + client->relayedCount++; + } + mutex_unlock( &uplink->queueLock ); - if ( foundExisting != -1 ) { - ref_put( &uplink->reference ); - return true; // Attached to pending request, do nothing - } + if ( !isNew ) { + goto success_ref; // Attached to pending request, do nothing + } + } while (0); - // See if we can fire away the request - if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) { - logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" ); + // Fire away the request + mutex_lock( &uplink->sendMutex ); + if ( unlikely( uplink->current.fd == -1 ) ) { + uplink->image->problem.uplink = true; + markRequestUnsent( uplink, req.handle ); + mutex_unlock( &uplink->sendMutex ); + logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); } else { - if ( unlikely( uplink->current.fd == -1 ) ) { + if ( hops < 200 ) ++hops; + const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start, + req.handle, COND_HOPCOUNT( uplink->current.version, hops ) ); + if ( unlikely( !ret ) ) { + markRequestUnsent( uplink, req.handle ); uplink->image->problem.uplink = true; mutex_unlock( &uplink->sendMutex ); - 
logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" ); + logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle ); } else { - const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - if ( hops < 200 ) ++hops; - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( unlikely( !ret ) ) { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" ); - } else { - // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again - int state; - mutex_unlock( &uplink->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) { - state = uplink->queue[freeSlot].status; - if ( uplink->queue[freeSlot].status == ULR_NEW ) { - uplink->queue[freeSlot].status = ULR_PENDING; - } - } else { - state = -1; - } - mutex_unlock( &uplink->queueLock ); - if ( state == -1 ) { - logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. 
*shrug*" ); - } else if ( state == ULR_NEW ) { - //logadd( LOG_DEBUG2, "Direct uplink request" ); - } else { - logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] ); - } - ref_put( &uplink->reference ); - return true; - } - // Fall through to waking up sender thread + // OK + mutex_unlock( &uplink->sendMutex ); + goto success_ref; } + // Fall through to waking up sender thread } if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) { logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno ); } - ref_put( &uplink->reference ); + +success_ref: + if ( client != NULL ) { + // Was from client -- potential prefetch + uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start ); + if ( len > 0 ) { + prefetch_request_t *job = malloc( sizeof( *job ) ); + job->start = req.end; + job->length = len; + job->uplink = uplink; + ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it + threadpool_run( &prefetchForClient, (void*)job ); + } + } + if ( getUplink ) { + ref_put( &uplink->reference ); + } return true; fail_lock: mutex_unlock( &uplink->queueLock ); fail_ref: - ref_put( &uplink->reference ); + if ( getUplink ) { + ref_put( &uplink->reference ); + } return false; } +static void *prefetchForClient(void *data) +{ + prefetch_request_t *job = (prefetch_request_t*)data; + dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image ); + if ( cache != NULL ) { + if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) { + uplink_request( job->uplink, NULL, ++job->uplink->queueId, job->start, job->length, 0 ); + } + ref_put( &cache->reference ); + } + ref_put( &job->uplink->reference ); + free( job ); + return NULL; +} + /** * Uplink thread. 
* Locks are irrelevant as this is never called from another function @@ -443,7 +469,7 @@ static void* uplink_mainloop(void *data) #define EV_COUNT (2) struct pollfd events[EV_COUNT]; dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data; - int numSocks, i, waitTime; + int numSocks, waitTime; int altCheckInterval = SERVER_RTT_INTERVAL_INIT; int rttTestResult; uint32_t discoverFailCount = 0; @@ -478,7 +504,7 @@ static void* uplink_mainloop(void *data) declare_now; waitTime = (int)timing_diffMs( &now, &nextAltCheck ); if ( waitTime < 100 ) waitTime = 100; - if ( waitTime > 10000 ) waitTime = 10000; + else if ( waitTime > 10000 ) waitTime = 10000; } events[EV_SOCKET].fd = uplink->current.fd; numSocks = poll( events, EV_COUNT, waitTime ); @@ -505,7 +531,6 @@ static void* uplink_mainloop(void *data) mutex_unlock( &uplink->rttLock ); discoverFailCount = 0; if ( fd != -1 ) close( fd ); - uplink->replicationHandle = REP_NONE; uplink->image->problem.uplink = false; uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received buffer[0] = '@'; @@ -559,11 +584,11 @@ static void* uplink_mainloop(void *data) } declare_now; uint32_t timepassed = timing_diff( &lastKeepalive, &now ); - if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) { + if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) { lastKeepalive = now; uplink->idleTime += timepassed; // Keep-alive - if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) { + if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) { // Send keep-alive if nothing is happening, and try to trigger background rep. 
if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) { uplink_connectionFailed( uplink, true ); @@ -612,19 +637,16 @@ static void* uplink_mainloop(void *data) ticks deadline; timing_set( &deadline, &now, -10 ); mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) { - snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n" - "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name, - uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status ); - uplink->queue[i].entered = now; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( timing_reached( &it->entered, &deadline ) ) { + logadd( LOG_WARNING, "Starving request detected:" + " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d", + it->from, it->to, (int)it->sent, PIMG(uplink->image) ); + it->entered = now; #ifdef _DEBUG_RESEND_STARVING - uplink->queue[i].status = ULR_NEW; + it->sent = false; resend = true; #endif - mutex_unlock( &uplink->queueLock ); - logadd( LOG_WARNING, "%s", buffer ); - mutex_lock( &uplink->queueLock ); } } mutex_unlock( &uplink->queueLock ); @@ -667,37 +689,54 @@ cleanup: ; */ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly) { - // Scan for new requests - int j; + // Scan for new requests, or optionally, (re)send all + // Build a buffer, so if there aren't too many requests, we can send them after + // unlocking the queue again. Otherwise we need flushes during iteration, which + // is no ideal, but in that case the uplink is probably overwhelmed anyways. + // Try 125 as that's exactly 300bytes, usually 2*MTU. 
+#define MAX_RESEND_BATCH 125 + dnbd3_request_t reqs[MAX_RESEND_BATCH]; + int count = 0; mutex_lock( &uplink->queueLock ); - for (j = 0; j < uplink->queueLen; ++j) { - if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue; - uplink->queue[j].status = ULR_PENDING; - uint8_t hops = uplink->queue[j].hopCount; - const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); - const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart); - /* - logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")", - (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize ); - */ - mutex_unlock( &uplink->queueLock ); - if ( hops < 200 ) ++hops; - mutex_lock( &uplink->sendMutex ); - const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) ); - if ( likely( ret ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - // Non-critical - if the connection dropped or the server was changed - // the thread will re-send this request as soon as the connection - // is reestablished. 
- uplink->image->problem.uplink = true; + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( newOnly && it->sent ) + continue; + it->sent = true; + dnbd3_request_t *hdr = &reqs[count++]; + hdr->magic = dnbd3_packet_magic; + hdr->cmd = CMD_GET_BLOCK; + hdr->size = it->to - it->from; + hdr->offset_small = it->from; + hdr->hops = it->hopCount + 1; + hdr->handle = it->handle; + fixup_request( *hdr ); + if ( count == MAX_RESEND_BATCH ) { + bool ok = false; + logadd( LOG_DEBUG2, "BLOCKING resend of %d", count ); + count = 0; + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 ) + == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH ); + } mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); - return; + if ( !ok ) { + uplink->image->problem.uplink = true; + break; + } } - mutex_lock( &uplink->queueLock ); } mutex_unlock( &uplink->queueLock ); + if ( count != 0 ) { + mutex_lock( &uplink->sendMutex ); + if ( uplink->current.fd != -1 ) { + uplink->image->problem.uplink = + ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 ) + != DNBD3_REQUEST_SIZE * count ); + } + mutex_unlock( &uplink->sendMutex ); + } +#undef MAX_RESEND_BATCH } /** @@ -720,71 +759,73 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink) return false; // Should never be called in this state, consider send error if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return true; // Don't do background replication - if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE ) - return true; // Already a replication request on the wire, or no more blocks to replicate + if ( uplink->nextReplicationIndex == -1 ) + return true; // No more blocks to replicate dnbd3_image_t * const image = uplink->image; if ( image->users < _bgrMinClients ) return true; // 
Not enough active users + const int numNewRequests = numWantedReplicationRequests( uplink ); + if ( numNewRequests <= 0 ) + return true; // Already sufficient amount of requests on the wire dnbd3_cache_map_t *cache = ref_get_cachemap( image ); - if ( cache == NULL || image->users ) { + if ( cache == NULL ) { // No cache map (=image complete) - ref_put( &cache->reference ); return true; } const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); const int lastBlockIndex = mapBytes - 1; - int endByte; - if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks - endByte = uplink->nextReplicationIndex + mapBytes; - } else { // Hashblock based: Only look for match in current hash block - endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; - if ( endByte > mapBytes ) { - endByte = mapBytes; + for ( int bc = 0; bc < numNewRequests; ++bc ) { + int endByte; + if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks + endByte = uplink->nextReplicationIndex + mapBytes; + } else { // Hashblock based: Only look for match in current hash block + endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK; + if ( endByte > mapBytes ) { + endByte = mapBytes; + } } - } - atomic_thread_fence( memory_order_acquire ); - int replicationIndex = -1; - for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { - const int i = j % ( mapBytes ); // Wrap around for BGR_FULL - if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff - && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) { - // Found incomplete one - replicationIndex = i; + atomic_thread_fence( memory_order_acquire ); + int replicationIndex = -1; + for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) { + const int i = j % ( mapBytes ); // Wrap around for BGR_FULL + if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff + && ( i != 
lastBlockIndex || !uplink->replicatedLastBlock ) ) { + // Found incomplete one + replicationIndex = i; + break; + } + } + if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { + // Nothing left in current block, find next one + replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); + } + if ( replicationIndex == -1 ) { + // Replication might be complete, uplink_mainloop should take care.... + uplink->nextReplicationIndex = -1; break; } + const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; + const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); + const uint64_t handle = ++uplink->queueId; + if ( !uplink_request( uplink, NULL, handle, offset, size, 0 ) ) { + logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)", + PIMG(uplink->image) ); + ref_put( &cache->reference ); + return false; + } + if ( replicationIndex == lastBlockIndex ) { + uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + } + uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + if ( _backgroundReplication == BGR_HASHBLOCK + && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + // Just crossed a hash block boundary, look for new candidate starting at this very index + uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); + if ( uplink->nextReplicationIndex == -1 ) + break; + } } ref_put( &cache->reference ); - if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { - // Nothing left in current block, find next one - replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte ); - } - if ( replicationIndex == -1 ) { - // Replication might be complete, uplink_mainloop should take care.... 
- uplink->nextReplicationIndex = -1; - return true; - } - const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; - uplink->replicationHandle = offset; - const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); - mutex_lock( &uplink->sendMutex ); - bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) ); - if ( likely( sendOk ) ) { - mutex_unlock( &uplink->sendMutex ); - } else { - uplink->image->problem.uplink = true; - mutex_unlock( &uplink->sendMutex ); - logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); - return false; - } - if ( replicationIndex == lastBlockIndex ) { - uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks - } - uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter - if ( _backgroundReplication == BGR_HASHBLOCK - && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { - // Just crossed a hash block boundary, look for new candidate starting at this very index - uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex ); - } return true; } @@ -845,7 +886,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int static void uplink_handleReceive(dnbd3_uplink_t *uplink) { dnbd3_reply_t inReply, outReply; - int ret, i; + int ret; for (;;) { ret = dnbd3_read_reply( uplink->current.fd, &inReply, false ); if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue; @@ -881,13 +922,34 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) } // Payload read completely // Bail out if we're not interested - if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue; + if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) + 
continue; // Is a legit block reply - struct iovec iov[2]; - const uint64_t start = inReply.handle; - const uint64_t end = inReply.handle + inReply.size; totalBytesReceived += inReply.size; uplink->bytesReceived += inReply.size; + // Get entry from queue + dnbd3_queue_entry_t *entry; + mutex_lock( &uplink->queueLock ); + for ( entry = uplink->queue; entry != NULL; entry = entry->next ) { + if ( entry->handle == inReply.handle ) + break; + } + if ( entry == NULL ) { + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)", + inReply.handle, PIMG(uplink->image) ); + continue; + } + const uint64_t start = entry->from; + const uint64_t end = entry->to; + mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock! + // We don't remove the entry from the list here yet, to slightly increase the chance of other + // clients attaching to this request while we write the data to disk + if ( end - start != inReply.size ) { + logadd( LOG_WARNING, "Received payload length does not match! 
(is: %"PRIu32", expect: %u, %s:%d)", + inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) ); + } + struct iovec iov[2]; // 1) Write to cache file if ( unlikely( uplink->cacheFd == -1 ) ) { uplink_reopenCacheFd( uplink, false ); @@ -934,98 +996,76 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink) PIMG(uplink->image), err ); } } - // 2) Figure out which clients are interested in it - // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop - // below; this prevents uplink_request() from attaching to this request - // by populating a slot with index greater than the highest matching - // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW - // where it's fine if the index is greater) + bool found = false; + dnbd3_queue_entry_t **it; mutex_lock( &uplink->queueLock ); - for (i = 0; i < uplink->queueLen; ++i) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - assert( req->status != ULR_PROCESSING ); - if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue; - assert( req->client != NULL ); - if ( req->from >= start && req->to <= end ) { // Match :-) - req->status = ULR_PROCESSING; - } - } - // 3) Send to interested clients - iterate backwards so request collaboration works, and - // so we can decrease queueLen on the fly while iterating. 
Should you ever change this to start - // from 0, you also need to change the "attach to existing request"-logic in uplink_request() - outReply.magic = dnbd3_packet_magic; - bool served = false; - for ( i = uplink->queueLen - 1; i >= 0; --i ) { - dnbd3_queued_request_t * const req = &uplink->queue[i]; - if ( req->status == ULR_PROCESSING ) { - size_t bytesSent = 0; - assert( req->from >= start && req->to <= end ); - dnbd3_client_t * const client = req->client; - outReply.cmd = CMD_GET_BLOCK; - outReply.handle = req->handle; - outReply.size = (uint32_t)( req->to - req->from ); - iov[0].iov_base = &outReply; - iov[0].iov_len = sizeof outReply; - iov[1].iov_base = uplink->recvBuffer + (req->from - start); - iov[1].iov_len = outReply.size; - fixup_reply( outReply ); - req->status = ULR_FREE; - req->client = NULL; - served = true; - mutex_lock( &client->sendMutex ); - mutex_unlock( &uplink->queueLock ); - if ( client->sock != -1 ) { - ssize_t sent = writev( client->sock, iov, 2 ); - if ( sent > (ssize_t)sizeof outReply ) { - bytesSent = (size_t)sent - sizeof outReply; - } - } - if ( bytesSent != 0 ) { - client->bytesSent += bytesSent; - } - mutex_unlock( &client->sendMutex ); - mutex_lock( &uplink->queueLock ); - if ( i > uplink->queueLen ) { - i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests - } + for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) { + if ( *it == entry && entry->handle == inReply.handle ) { // ABA check + assert( found == false ); + *it = (**it).next; + found = true; + uplink->queueLen--; + break; } - if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--; } if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) { uplink->image->problem.queue = false; } mutex_unlock( &uplink->queueLock ); -#ifdef _DEBUG - if ( !served && start != uplink->replicationHandle ) { - logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end ); + if ( !found ) 
{ + logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)", + PIMG(uplink->image) ); + continue; } -#endif - if ( start == uplink->replicationHandle ) { - // Was our background replication - uplink->replicationHandle = REP_NONE; - // Try to remove from fs cache if no client was interested in this data - if ( !served && uplink->cacheFd != -1 ) { - posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + outReply.magic = dnbd3_packet_magic; + dnbd3_queue_client_t *next; + for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) { + size_t bytesSent = 0; + assert( c->from >= start && c->to <= end ); + dnbd3_client_t * const client = c->client; + outReply.cmd = CMD_GET_BLOCK; + outReply.handle = c->handle; + outReply.size = (uint32_t)( c->to - c->from ); + iov[0].iov_base = &outReply; + iov[0].iov_len = sizeof outReply; + iov[1].iov_base = uplink->recvBuffer + (c->from - start); + iov[1].iov_len = outReply.size; + fixup_reply( outReply ); + mutex_lock( &client->sendMutex ); + if ( client->sock != -1 ) { + ssize_t sent = writev( client->sock, iov, 2 ); + if ( sent > (ssize_t)sizeof outReply ) { + bytesSent = (size_t)sent - sizeof outReply; + } + if ( bytesSent != 0 ) { + client->bytesSent += bytesSent; + } } + mutex_unlock( &client->sendMutex ); + client->relayedCount--; + next = c->next; + free( c ); } - if ( served ) { + if ( entry->clients != NULL ) { // Was some client -- reset idle counter uplink->idleTime = 0; // Re-enable replication if disabled if ( uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; } + } else { + if ( uplink->cacheFd != -1 ) { + // Try to remove from fs cache if no client was interested in this data + posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED ); + } } + free( entry ); + } // main receive loop + // Trigger background replication if applicable + if ( 
!uplink_sendReplicationRequest( uplink ) ) { + goto error_cleanup; } - if ( uplink->replicationHandle == REP_NONE ) { - mutex_lock( &uplink->queueLock ); - const bool rep = ( uplink->queueLen == 0 ); - mutex_unlock( &uplink->queueLock ); - if ( rep ) { - if ( !uplink_sendReplicationRequest( uplink ) ) - goto error_cleanup; - } - } + // Normal end return; // Error handling from failed receive or message parsing error_cleanup: ; @@ -1046,7 +1086,6 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew) close( uplink->current.fd ); uplink->current.fd = -1; mutex_unlock( &uplink->sendMutex ); - uplink->replicationHandle = REP_NONE; if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) { uplink->nextReplicationIndex = 0; } @@ -1156,3 +1195,39 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len) return false; return altservers_toString( current, buffer, len ); } + +/** + * Get number of replication requests that should be sent right now to + * meet the configured bgrWindowSize. 
Returns 0 if any client requests + * are pending + */ +static int numWantedReplicationRequests(dnbd3_uplink_t *uplink) +{ + int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 ); + if ( uplink->queueLen == 0 ) + return ret; + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->clients == NULL ) { + ret--; + } else { + ret = 0; // Do not allow BGR if client requests are being handled + break; + } + } + mutex_unlock( &uplink->queueLock ); + return ret; +} + +static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle) +{ + mutex_lock( &uplink->queueLock ); + for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) { + if ( it->handle == handle ) { + it->sent = false; + break; + } + } + mutex_unlock( &uplink->queueLock ); +} + diff --git a/src/server/uplink.h b/src/server/uplink.h index 49ff0b4..8f69b05 100644 --- a/src/server/uplink.h +++ b/src/server/uplink.h @@ -12,7 +12,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client); -bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount); +bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops); bool uplink_shutdown(dnbd3_image_t *image); diff --git a/src/serverconfig.h b/src/serverconfig.h index 5c7301d..31708de 100644 --- a/src/serverconfig.h +++ b/src/serverconfig.h @@ -13,7 +13,8 @@ #define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times #define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored -#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink +#define UPLINK_MAX_QUEUE 500 // 
Maximum number of queued requests per uplink +#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request #define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks -- cgit v1.2.3-55-g7522