diff options
Diffstat (limited to 'src/server/image.c')
-rw-r--r-- | src/server/image.c | 519 |
1 files changed, 274 insertions, 245 deletions
diff --git a/src/server/image.c b/src/server/image.c index bfba6cb..6259e38 100644 --- a/src/server/image.c +++ b/src/server/image.c @@ -8,6 +8,7 @@ #include "../shared/protocol.h" #include "../shared/timing.h" #include "../shared/crc32.h" +#include "reference.h" #include <assert.h> #include <fcntl.h> @@ -50,24 +51,33 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); static bool image_ensureDiskSpace(uint64_t size, bool force); -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); -static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); +static void image_checkRandomBlocks(dnbd3_image_t *image, const int count); +static void* closeUnusedFds(void*); +static void allocCacheMap(dnbd3_image_t *image, bool complete); + +static void cmfree(ref *ref) +{ + dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference); + logadd( LOG_DEBUG2, "Freeing a cache map" ); + free( cache ); +} // ########################################## void image_serverStartup() { srand( (unsigned int)time( NULL ) ); - mutex_init( &imageListLock ); - mutex_init( &remoteCloneLock ); - mutex_init( &reloadLock ); + mutex_init( &imageListLock, LOCK_IMAGE_LIST ); + mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE ); + mutex_init( &reloadLock, LOCK_RELOAD ); + server_addJob( &closeUnusedFds, NULL, 10, 900 ); } /** * Update cache-map of given image for the given byte range * start (inclusive) - end (exclusive) - * Locks on: images[].lock */ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) { @@ -88,33 +98,59 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co if ( start >= end ) return; bool setNewBlocks = false; - uint64_t pos = start; - mutex_lock( &image->lock ); - if ( image->cache_map == NULL ) { + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { // Image seems already complete if ( set ) { // This makes no sense - mutex_unlock( &image->lock ); - logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path ); return; } // Recreate a cache map, set it to all 1 initially as we assume the image was complete - const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - image->cache_map = malloc( byteSize ); - memset( image->cache_map, 0xff, byteSize ); - } - while ( pos < end ) { - const size_t map_y = (int)( pos >> 15 ); - const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 - const int bit_mask = 1 << map_x; - if ( set ) { - if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; - image->cache_map[map_y] |= (uint8_t)bit_mask; - } else { - image->cache_map[map_y] &= (uint8_t)~bit_mask; + allocCacheMap( image, true ); + cache = ref_get_cachemap( image ); + if ( cache == NULL ) { + logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! %s", image->path ); + return; } - pos += DNBD3_BLOCK_SIZE; } + // Set/unset + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + // First byte + uint8_t fb = 0, lb = 0; + for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + fb |= bit_mask; + } + // Last byte + if ( lastByteInMap != firstByteInMap ) { + for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) { + assert( lastByteInMap == (pos >> 15) ); + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + lb |= bit_mask; + } + } + atomic_thread_fence( memory_order_acquire ); + if ( set ) { + uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed ); + uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed ); + setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] ); + } else { + atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed ); + atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed ); + } + const uint8_t nval = set ? 0xff : 0; + // Everything in between + for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) { + if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) { + setNewBlocks = true; + } + } + atomic_thread_fence( memory_order_release ); if ( setNewBlocks && image->crc32 != NULL ) { // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks // for checking, even though this might lead to checking some hash block again, if it was @@ -122,19 +158,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co // First set start and end to borders of hash blocks start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); - pos = start; - while ( pos < end ) { - if ( image->cache_map == NULL ) break; + for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) { const int block = (int)( pos / HASH_BLOCK_SIZE ); - if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { - mutex_unlock( &image->lock ); - integrity_check( image, block ); - mutex_lock( &image->lock ); + if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) { + integrity_check( image, block, false ); } - pos += HASH_BLOCK_SIZE; } } - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); } /** @@ -146,20 +177,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co bool image_isComplete(dnbd3_image_t *image) { assert( image != NULL ); - mutex_lock( &image->lock ); if ( image->virtualFilesize == 0 ) { - mutex_unlock( &image->lock ); return false; } - if ( image->cache_map == NULL ) { - mutex_unlock( &image->lock ); + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) { return true; } bool complete = true; int j; const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); for (j = 0; j < map_len_bytes - 1; ++j) { - if ( image->cache_map[j] != 0xFF ) { + if ( cache->map[j] != 0xFF ) { complete = false; break; } @@ -174,18 +203,27 @@ bool image_isComplete(dnbd3_image_t *image) for (j = 0; j < blocks_in_last_byte; ++j) last_byte |= (uint8_t)(1 << j); } - complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte); } - if ( !complete ) { - mutex_unlock( &image->lock ); + ref_put( &cache->reference ); + if ( !complete ) return false; + mutex_lock( &image->lock ); + // Lock and make sure current cache map is still the one we saw complete + dnbd3_cache_map_t *current = ref_get_cachemap( image ); + if ( current == cache ) { + // Set cache map NULL as it's complete + ref_setref( &image->ref_cacheMap, NULL ); + } + if ( current != NULL ) { + ref_put( ¤t->reference ); } - char mapfile[PATHLEN] = ""; - free( image->cache_map ); - image->cache_map = NULL; - snprintf( mapfile, PATHLEN, "%s.map", image->path ); mutex_unlock( &image->lock ); - unlink( mapfile ); + if ( current == cache ) { // Successfully set cache map to NULL above + char mapfile[PATHLEN] = ""; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + unlink( mapfile ); + } return true; } @@ -203,7 +241,9 @@ bool image_ensureOpen(dnbd3_image_t *image) { if ( image->readFd != -1 ) return image; int newFd = open( image->path, O_RDONLY ); - if ( newFd != -1 ) { + if ( newFd == -1 ) { + logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + } else { // Check size const off_t flen = lseek( newFd, 0, SEEK_END ); if ( flen == -1 ) { @@ -234,6 +274,22 @@ bool image_ensureOpen(dnbd3_image_t *image) return image->readFd != -1; } +dnbd3_image_t* image_byId(int imgId) +{ + int i; + mutex_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = _images[i]; + if ( image != NULL && image->id == imgId ) { + image->users++; + mutex_unlock( &imageListLock ); + return image; + } + } + mutex_unlock( &imageListLock ); + return NULL; +} + /** * Get an image by name+rid. This function increases a reference counter, * so you HAVE TO CALL image_release for every image_get() call at some @@ -267,14 +323,12 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) return NULL ; } - mutex_lock( &candidate->lock ); - mutex_unlock( &imageListLock ); candidate->users++; - mutex_unlock( &candidate->lock ); + mutex_unlock( &imageListLock ); // Found, see if it works -// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list -// TODO: But remember size-changed images forever + // TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list + // TODO: But remember size-changed images forever if ( candidate->working || checkIfWorking ) { // Is marked working, but might not have an fd open if ( !image_ensureOpen( candidate ) ) { @@ -317,14 +371,14 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText ); reload = true; } else if ( (uint64_t)len != candidate->realFilesize ) { - logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 + logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 ". Try sending SIGHUP to server if you know what you're doing.", candidate->path, candidate->realFilesize, (uint64_t)len ); } else { // Seek worked, file size is same, now see if we can read from file char buffer[100]; if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { - logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.", + logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.", (int)sizeof(buffer), candidate->path, errno, removingText ); reload = true; } else if ( !candidate->working ) { @@ -338,6 +392,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) // Could not access the image with exising fd - mark for reload which will re-open the file. // make a copy of the image struct but keep the old one around. If/When it's not being used // anymore, it will be freed automatically. + logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path ); dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); img->path = strdup( candidate->path ); img->name = strdup( candidate->name ); @@ -349,19 +404,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->rid = candidate->rid; img->users = 1; img->working = false; - mutex_init( &img->lock ); + img->ref_cacheMap = NULL; + mutex_init( &img->lock, LOCK_IMAGE ); if ( candidate->crc32 != NULL ) { const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); img->crc32 = malloc( mb ); memcpy( img->crc32, candidate->crc32, mb ); } - mutex_lock( &candidate->lock ); - if ( candidate->cache_map != NULL ) { - const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); - img->cache_map = malloc( mb ); - memcpy( img->cache_map, candidate->cache_map, mb ); + dnbd3_cache_map_t *cache = ref_get_cachemap( candidate ); + if ( cache != NULL ) { + ref_setref( &img->ref_cacheMap, &cache->reference ); + ref_put( &cache->reference ); } - mutex_unlock( &candidate->lock ); if ( image_addToList( img ) ) { image_release( candidate ); candidate = img; @@ -369,19 +423,16 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) img->users = 0; image_free( img ); } + // Check if image is incomplete, initialize uplink + if ( candidate->ref_cacheMap != NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } // readFd == -1 and working == FALSE at this point, // this function needs some splitting up for handling as we need to run most // of the above code again. for now we know that the next call for this // name:rid will get ne newly inserted "img" and try to re-open the file. } - // Check if image is incomplete, handle - if ( candidate->cache_map != NULL ) { - if ( candidate->uplink == NULL ) { - uplink_init( candidate, -1, NULL, -1 ); - } - } - return candidate; // We did all we can, hopefully it's working } @@ -391,17 +442,15 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) * Every call to image_lock() needs to be followed by a call to image_release() at some point. * Locks on: imageListLock, _images[].lock */ -dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users-- +dnbd3_image_t* image_lock(dnbd3_image_t *image) { if ( image == NULL ) return NULL ; int i; mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { - mutex_lock( &image->lock ); - mutex_unlock( &imageListLock ); image->users++; - mutex_unlock( &image->lock ); + mutex_unlock( &imageListLock ); return image; } } @@ -419,12 +468,9 @@ dnbd3_image_t* image_release(dnbd3_image_t *image) { if ( image == NULL ) return NULL; mutex_lock( &imageListLock ); - mutex_lock( &image->lock ); assert( image->users > 0 ); - image->users--; - bool inUse = image->users != 0; - mutex_unlock( &image->lock ); - if ( inUse ) { // Still in use, do nothing + // Decrement and check for 0 + if ( --image->users != 0 ) { // Still in use, do nothing mutex_unlock( &imageListLock ); return NULL; } @@ -439,7 +485,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image) } mutex_unlock( &imageListLock ); // So it wasn't in the images list anymore either, get rid of it - if ( !inUse ) image = image_free( image ); + image = image_free( image ); return NULL; } @@ -470,7 +516,6 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image) { bool mustFree = false; mutex_lock( &imageListLock ); - mutex_lock( &image->lock ); for ( int i = _num_images - 1; i >= 0; --i ) { if ( _images[i] == image ) { _images[i] = NULL; @@ -478,7 +523,6 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image) } if ( _images[i] == NULL && i + 1 == _num_images ) _num_images--; } - mutex_unlock( &image->lock ); mutex_unlock( &imageListLock ); if ( mustFree ) image = image_free( image ); return image; @@ -493,17 +537,7 @@ void image_killUplinks() mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == NULL ) continue; - mutex_lock( &_images[i]->lock ); - if ( _images[i]->uplink != NULL ) { - mutex_lock( &_images[i]->uplink->queueLock ); - if ( !_images[i]->uplink->shutdown ) { - thread_detach( _images[i]->uplink->thread ); - _images[i]->uplink->shutdown = true; - } - mutex_unlock( &_images[i]->uplink->queueLock ); - signal_call( _images[i]->uplink->signal ); - } - mutex_unlock( &_images[i]->lock ); + uplink_shutdown( _images[i] ); } mutex_unlock( &imageListLock ); } @@ -542,18 +576,14 @@ bool image_loadAll(char *path) // Lock again, see if image is still there, free if required mutex_lock( &imageListLock ); if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue; - // Image needs to be removed + // File not readable but still in list -- needs to be removed imgHandle = _images[i]; _images[i] = NULL; if ( i + 1 == _num_images ) _num_images--; - mutex_lock( &imgHandle->lock ); - const bool freeImg = ( imgHandle->users == 0 ); - mutex_unlock( &imgHandle->lock ); - // We unlocked, but the image has been removed from the list already, so - // there's no way the users-counter can increase at this point. - if ( freeImg ) { + if ( imgHandle->users == 0 ) { // Image is not in use anymore, free the dangling entry immediately - mutex_unlock( &imageListLock ); // image_free might do several fs operations; unlock + mutex_unlock( &imageListLock ); // image_free locks on this, and + // might do several fs operations; unlock image_free( imgHandle ); mutex_lock( &imageListLock ); } @@ -581,12 +611,10 @@ bool image_tryFreeAll() { mutex_lock( &imageListLock ); for (int i = _num_images - 1; i >= 0; --i) { - if ( _images[i] != NULL && _images[i]->users == 0 ) { // XXX Data race... + if ( _images[i] != NULL && _images[i]->users == 0 ) { dnbd3_image_t *image = _images[i]; _images[i] = NULL; - mutex_unlock( &imageListLock ); image = image_free( image ); - mutex_lock( &imageListLock ); } if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--; } @@ -596,35 +624,34 @@ bool image_tryFreeAll() /** * Free image. DOES NOT check if it's in use. - * Indirectly locks on imageListLock, image.lock, uplink.queueLock + * (Indirectly) locks on image.lock, uplink.queueLock */ static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); - if ( !_shutdown ) { - logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); - } - // - uplink_shutdown( image ); + assert( image->users == 0 ); + logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid ); + // uplink_shutdown might return false to tell us + // that the shutdown is in progress. Bail out since + // this will get called again when the uplink is done. + if ( !uplink_shutdown( image ) ) + return NULL; mutex_lock( &image->lock ); - free( image->cache_map ); + ref_setref( &image->ref_cacheMap, NULL ); free( image->crc32 ); free( image->path ); free( image->name ); - image->cache_map = NULL; image->crc32 = NULL; image->path = NULL; image->name = NULL; mutex_unlock( &image->lock ); if ( image->readFd != -1 ) close( image->readFd ); mutex_destroy( &image->lock ); - // - memset( image, 0, sizeof(*image) ); free( image ); return NULL ; } -bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) { if ( cacheMap == NULL ) return true; const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; @@ -731,7 +758,7 @@ static bool image_load(char *base, char *path, int withUplink) { int revision = -1; struct stat st; - uint8_t *cache_map = NULL; + dnbd3_cache_map_t *cache = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; int fdImage = -1; @@ -814,24 +841,15 @@ static bool image_load(char *base, char *path, int withUplink) } // 1. Allocate memory for the cache map if the image is incomplete - cache_map = image_loadCacheMap( path, virtualFilesize ); + cache = image_loadCacheMap( path, virtualFilesize ); // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) // 2. Load CRC-32 list of image - bool doFullCheck = false; uint32_t masterCrc = 0; const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize ); crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc ); - // Check CRC32 - if ( crc32list != NULL ) { - if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { - logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path ); - doFullCheck = true; - } - } - // Compare data just loaded to identical image we apparently already loaded if ( existing != NULL ) { if ( existing->realFilesize != realFilesize ) { @@ -850,7 +868,7 @@ static bool image_load(char *base, char *path, int withUplink) crc32list = NULL; function_return = true; goto load_error; // Keep existing - } else if ( existing->cache_map != NULL && cache_map == NULL ) { + } else if ( existing->ref_cacheMap != NULL && cache == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); function_return = true; @@ -870,19 +888,20 @@ static bool image_load(char *base, char *path, int withUplink) dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); image->path = strdup( path ); image->name = strdup( imgName ); - image->cache_map = cache_map; + image->ref_cacheMap = NULL; + ref_setref( &image->ref_cacheMap, &cache->reference ); image->crc32 = crc32list; image->masterCrc32 = masterCrc; - image->uplink = NULL; + image->uplinkref = NULL; image->realFilesize = realFilesize; image->virtualFilesize = virtualFilesize; image->rid = (uint16_t)revision; image->users = 0; image->readFd = -1; - image->working = (image->cache_map == NULL ); + image->working = ( cache == NULL ); timing_get( &image->nextCompletenessEstimate ); image->completenessEstimate = -1; - mutex_init( &image->lock ); + mutex_init( &image->lock, LOCK_IMAGE ); int32_t offset; if ( stat( path, &st ) == 0 ) { // Negatively offset atime by file modification time @@ -894,16 +913,16 @@ static bool image_load(char *base, char *path, int withUplink) timing_gets( &image->atime, offset ); // Prevent freeing in cleanup - cache_map = NULL; + cache = NULL; crc32list = NULL; // Get rid of cache map if image is complete - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image_isComplete( image ); } // Image is definitely incomplete, initialize uplink worker - if ( image->cache_map != NULL ) { + if ( image->ref_cacheMap != NULL ) { image->working = false; if ( withUplink ) { uplink_init( image, -1, NULL, -1 ); @@ -915,6 +934,8 @@ static bool image_load(char *base, char *path, int withUplink) if ( image_addToList( image ) ) { // Keep fd for reading fdImage = -1; + // Check CRC32 + image_checkRandomBlocks( image, 4 ); } else { logadd( LOG_ERROR, "Image list full: Could not add image %s", path ); image->readFd = -1; // Keep fdImage instead, will be closed below @@ -922,33 +943,28 @@ static bool image_load(char *base, char *path, int withUplink) goto load_error; } logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid ); - // CRC errors found... - if ( doFullCheck ) { - logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid ); - integrity_check( image, -1 ); - } - function_return = true; // Clean exit: load_error: ; if ( existing != NULL ) existing = image_release( existing ); if ( crc32list != NULL ) free( crc32list ); - if ( cache_map != NULL ) free( cache_map ); + if ( cache != NULL ) free( cache ); if ( fdImage != -1 ) close( fdImage ); return function_return; } -static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) { - uint8_t *retval = NULL; + dnbd3_cache_map_t *retval = NULL; char mapFile[strlen( imagePath ) + 10 + 1]; sprintf( mapFile, "%s.map", imagePath ); int fdMap = open( mapFile, O_RDONLY ); - if ( fdMap >= 0 ) { + if ( fdMap != -1 ) { const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); - retval = calloc( 1, map_size ); - const ssize_t rd = read( fdMap, retval, map_size ); + retval = calloc( 1, sizeof(*retval) + map_size ); + ref_init( &retval->reference, cmfree, 0 ); + const ssize_t rd = read( fdMap, retval->map, map_size ); if ( map_size != rd ) { logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); // Could not read complete map, that means the rest of the image file will be considered incomplete @@ -1009,18 +1025,26 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f return retval; } -static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +static void image_checkRandomBlocks(dnbd3_image_t *image, const int count) { + if ( image->crc32 == NULL ) + return; // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. This is very sloppy and is merely supposed to detect // accidental corruption due to broken dnbd3-proxy functionality or file system - // corruption. + // corruption, or people replacing/updating images which is a very stupid thing. assert( count > 0 ); - const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize ); - int blocks[count + 1]; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize ); + int blocks[count]; int index = 0, j; int block; - if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) blocks[index++] = 0; + if ( image_isHashBlockComplete( cache->map, 0, image->virtualFilesize ) ) { + blocks[index++] = 0; + } + if ( hashBlocks > 1 && image_isHashBlockComplete( cache->map, hashBlocks - 1, image->virtualFilesize ) ) { + blocks[index++] = hashBlocks - 1; + } int tries = count * 5; // Try only so many times to find a non-duplicate complete block while ( index + 1 < count && --tries > 0 ) { block = rand() % hashBlocks; // Random block @@ -1028,11 +1052,15 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t if ( blocks[j] == block ) goto while_end; } // Block complete? If yes, add to list - if ( image_isHashBlockComplete( cache_map, block, realFilesize ) ) blocks[index++] = block; + if ( image_isHashBlockComplete( cache->map, block, image->virtualFilesize ) ) { + blocks[index++] = block; + } while_end: ; } - blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1 - return image_checkBlocksCrc32( fdImage, crc32list, blocks, realFilesize ); // Return result of check + ref_put( &cache->reference ); + for ( int i = 0; i < index; ++i ) { + integrity_check( image, blocks[i], true ); + } } /** @@ -1191,7 +1219,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, dnbd3_host_t servers[REP_NUM_SRV]; int uplinkSock = -1; dnbd3_host_t uplinkServer; - const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false ); + const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV ); uint16_t remoteProtocolVersion; uint16_t remoteRid = revision; uint64_t remoteImageSize; @@ -1504,9 +1532,9 @@ json_t* image_getListAsJson() json_t *imagesJson = json_array(); json_t *jsonImage; int i; - char uplinkName[100] = { 0 }; + char uplinkName[100]; uint64_t bytesReceived; - int users, completeness, idleTime; + int completeness, idleTime; declare_now; mutex_lock( &imageListLock ); @@ -1514,27 +1542,26 @@ json_t* image_getListAsJson() if ( _images[i] == NULL ) continue; dnbd3_image_t *image = _images[i]; mutex_lock( &image->lock ); - mutex_unlock( &imageListLock ); - users = image->users; idleTime = (int)timing_diff( &image->atime, &now ); completeness = image_getCompletenessEstimate( image ); - if ( image->uplink == NULL ) { + mutex_unlock( &image->lock ); + dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); + if ( uplink == NULL ) { bytesReceived = 0; uplinkName[0] = '\0'; } else { - bytesReceived = image->uplink->bytesReceived; - if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) { + bytesReceived = uplink->bytesReceived; + if ( !uplink_getHostString( uplink, uplinkName, sizeof(uplinkName) ) ) { uplinkName[0] = '\0'; } + ref_put( &uplink->reference ); } - image->users++; // Prevent freeing after we unlock - mutex_unlock( &image->lock ); jsonImage = json_pack( "{sisssisisisisI}", "id", image->id, // id, name, rid never change, so access them without locking "name", image->name, "rid", (int) image->rid, - "users", users, + "users", image->users, "complete", completeness, "idle", idleTime, "size", (json_int_t)image->virtualFilesize ); @@ -1546,8 +1573,6 @@ json_t* image_getListAsJson() } json_array_append_new( imagesJson, jsonImage ); - image = image_release( image ); // Since we did image->users++; - mutex_lock( &imageListLock ); } mutex_unlock( &imageListLock ); return imagesJson; @@ -1556,30 +1581,37 @@ json_t* image_getListAsJson() /** * Get completeness of an image in percent. Only estimated, not exact. * Returns: 0-100 - * DOES NOT LOCK, so make sure to do so before calling */ int image_getCompletenessEstimate(dnbd3_image_t * const image) { assert( image != NULL ); - if ( image->cache_map == NULL ) return image->working ? 100 : 0; + dnbd3_cache_map_t *cache = ref_get_cachemap( image ); + if ( cache == NULL ) + return image->working ? 100 : 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( unlikely( len == 0 ) ) { + ref_put( &cache->reference ); + return 0; + } declare_now; if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { // Since this operation is relatively expensive, we cache the result for a while + ref_put( &cache->reference ); return image->completenessEstimate; } int i; int percent = 0; - const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); - if ( len == 0 ) return 0; for ( i = 0; i < len; ++i ) { - if ( image->cache_map[i] == 0xff ) { + const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed ); + if ( v == 0xff ) { percent += 100; - } else if ( image->cache_map[i] != 0 ) { + } else if ( v != 0 ) { percent += 50; } } + ref_put( &cache->reference ); image->completenessEstimate = percent / len; - timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 ); return image->completenessEstimate; } @@ -1611,7 +1643,7 @@ bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc) { // Make buffer 4k aligned in case fd has O_DIRECT set -#define BSIZE 262144 +#define BSIZE (512*1024) char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE]; char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ); // How many bytes to read from the input file @@ -1669,7 +1701,7 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force) * TODO: Store last access time of images. Currently the * last access time is reset to the file modification time * on server restart. Thus it will - * currently only delete images if server uptime is > 10 hours. + * currently only delete images if server uptime is > 24 hours. * This can be overridden by setting force to true, in case * free space is desperately needed. * Return true iff enough space is available. false in random other cases @@ -1679,48 +1711,55 @@ static bool image_ensureDiskSpace(uint64_t size, bool force) for ( int maxtries = 0; maxtries < 20; ++maxtries ) { uint64_t available; if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) { - const int e = errno; - logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e ); + logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno ); return true; } - if ( available > size ) return true; - if ( !force && dnbd3_serverUptime() < 10 * 3600 ) { - logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)), - (int)(size / (1024 * 1024)) ); + if ( available > size ) + return true; // Yay + if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) + return false; // If not in proxy mode at all, or explicitly disabled, never delete anything + if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) { + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 ); return false; } logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)), (int)(size / (1024 * 1024)) ); // Find least recently used image dnbd3_image_t *oldest = NULL; - int i; // XXX improve locking + int i; + mutex_lock( &imageListLock ); for (i = 0; i < _num_images; ++i) { - if ( _images[i] == NULL ) continue; - dnbd3_image_t *current = image_lock( _images[i] ); + dnbd3_image_t *current = _images[i]; if ( current == NULL ) continue; - if ( current->users == 1 ) { // Just from the lock above + if ( current->users == 0 ) { // Not in use :-) if ( oldest == NULL || timing_1le2( ¤t->atime, &oldest->atime ) ) { // Oldest access time so far oldest = current; } } - current = image_release( current ); + } + if ( oldest != NULL ) { + oldest->users++; + } + mutex_unlock( &imageListLock ); + if ( oldest == NULL ) { + logadd( LOG_INFO, "All images are currently in use :-(" ); + return false; } declare_now; - if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) { - if ( oldest == NULL ) { - logadd( LOG_INFO, "All images are currently in use :-(" ); - } else { - logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" ); - } + if ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) { + logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" ); + image_release( oldest ); // We did users++ above; image might have to be freed entirely return false; } - oldest = image_lock( oldest ); - if ( oldest == NULL ) continue; // Image freed in the meantime? Try again logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid ); - char *filename = strdup( oldest->path ); - oldest = image_remove( oldest ); - oldest = image_release( oldest ); + char *filename = strdup( oldest->path ); // Copy name as we remove the image first + oldest = image_remove( oldest ); // Remove from list first... + oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed + // Technically the image might have been grabbed again, but chances for + // this should be close to zero anyways since the image went unused for more than 24 hours.. + // Proper fix would be a "delete" flag in the image struct that will be checked in image_free unlink( filename ); size_t len = strlen( filename ) + 10; char buffer[len]; @@ -1735,62 +1774,52 @@ static bool image_ensureDiskSpace(uint64_t size, bool force) return false; } -void image_closeUnusedFd() +#define FDCOUNT (400) +static void* closeUnusedFds(void* nix UNUSED) { - int fd, i; + if ( !_closeUnusedFd ) + return NULL; ticks deadline; timing_gets( &deadline, -UNUSED_FD_TIMEOUT ); - char imgstr[300]; + int fds[FDCOUNT]; + int fdindex = 0; mutex_lock( &imageListLock ); - for (i = 0; i < _num_images; ++i) { + for ( int i = 0; i < _num_images; ++i ) { dnbd3_image_t * const image = _images[i]; - if ( image == NULL ) + if ( image == NULL || image->readFd == -1 ) continue; - mutex_lock( &image->lock ); - mutex_unlock( &imageListLock ); - if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) { - snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid ); - fd = image->readFd; - image->readFd = -1; - } else { - fd = -1; - } - mutex_unlock( &image->lock ); - if ( fd != -1 ) { - close( fd ); - logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr ); + // TODO: Also close for idle uplinks (uplink_connectionShouldShutdown) + // TODO: And close writeFd for idle uplinks.... + if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) { + logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid ); + fds[fdindex++] = image->readFd; + image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock + if ( fdindex == FDCOUNT ) + break; } - mutex_lock( &imageListLock ); } mutex_unlock( &imageListLock ); + // Do this after unlock since close might block + for ( int i = 0; i < fdindex; ++i ) { + close( fds[i] ); + } + return NULL; +} + +static void allocCacheMap(dnbd3_image_t *image, bool complete) +{ + const uint8_t val = complete ? 0xff : 0; + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize ); + ref_init( &cache->reference, cmfree, 0 ); + memset( cache->map, val, byteSize ); + mutex_lock( &image->lock ); + if ( image->ref_cacheMap != NULL ) { + logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid ); + free( cache ); + } else { + ref_setref( &image->ref_cacheMap, &cache->reference ); + } + mutex_unlock( &image->lock ); } -/* - void image_find_latest() - { - // Not in array or most recent rid is requested, try file system - if (revision != 0) { - // Easy case - specific RID - char - } else { - // Determine base directory where the image in question has to reside. - // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04" - // Then searchPath has to be set to "/srv/rz/ubuntu" - char searchPath[strlen(_basePath) + len + 1]; - char *lastSlash = strrchr(name, '/'); - char *baseName; // Name of the image. In the example above, it will be "default-13.04" - if ( lastSlash == NULL ) { - *searchPath = '\0'; - baseName = name; - } else { - char *from = name, *to = searchPath; - while (from < lastSlash) *to++ = *from++; - *to = '\0'; - baseName = lastSlash + 1; - } - // Now we have the search path in our real file system and the expected image name. - // The revision naming sceme is <IMAGENAME>.r<RID>, so if we're looking for revision 13, - // our example image has to be named default-13.04.r13 - } - } - */ |