diff options
Diffstat (limited to 'src/server/image.c')
-rw-r--r-- | src/server/image.c | 1794 |
1 files changed, 1794 insertions, 0 deletions
diff --git a/src/server/image.c b/src/server/image.c new file mode 100644 index 0000000..061f9a3 --- /dev/null +++ b/src/server/image.c @@ -0,0 +1,1794 @@ +#include "image.h" +#include "helper.h" +#include "fileutil.h" +#include "uplink.h" +#include "locks.h" +#include "integrity.h" +#include "altservers.h" +#include "../shared/protocol.h" +#include "../shared/timing.h" +#include "../shared/crc32.h" + +#include <assert.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <dirent.h> +#include <inttypes.h> +#include <glob.h> +#include <jansson.h> + +#define PATHLEN (2000) +#define NONWORKING_RECHECK_INTERVAL_SECONDS (60) + +// ########################################## + +static dnbd3_image_t *_images[SERVER_MAX_IMAGES]; +static int _num_images = 0; + +static pthread_spinlock_t imageListLock; +static pthread_mutex_t remoteCloneLock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t reloadLock = PTHREAD_MUTEX_INITIALIZER; +#define NAMELEN 500 +#define CACHELEN 20 +typedef struct +{ + char name[NAMELEN]; + uint16_t rid; + ticks deadline; +} imagecache; +static imagecache remoteCloneCache[CACHELEN]; + +// ########################################## + +static bool isForbiddenExtension(const char* name); +static dnbd3_image_t* image_remove(dnbd3_image_t *image); +static dnbd3_image_t* image_free(dnbd3_image_t *image); +static bool image_load_all_internal(char *base, char *path); +static bool image_addToList(dnbd3_image_t *image); +static bool image_load(char *base, char *path, int withUplink); +static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize); +static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); +static bool image_ensureDiskSpace(uint64_t size, bool force); + +static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); +static bool 
image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); + +// ########################################## + +void image_serverStartup() +{ + srand( (unsigned int)time( NULL ) ); + spin_init( &imageListLock, PTHREAD_PROCESS_PRIVATE ); +} + +/** + * Update cache-map of given image for the given byte range + * start (inclusive) - end (exclusive) + * Locks on: images[].lock + */ +void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) +{ + assert( image != NULL ); + // This should always be block borders due to how the protocol works, but better be safe + // than accidentally mark blocks as cached when they really aren't entirely cached. + assert( end <= image->virtualFilesize ); + assert( start <= end ); + if ( set ) { + // If we set as cached, move "inwards" in case we're not at 4k border + end &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + start = (uint64_t)(start + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + } else { + // If marking as NOT cached, move "outwards" in case we're not at 4k border + start &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + end = (uint64_t)(end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + } + if ( start >= end ) + return; + bool setNewBlocks = false; + uint64_t pos = start; + spin_lock( &image->lock ); + if ( image->cache_map == NULL ) { + // Image seems already complete + if ( set ) { + // This makes no sense + spin_unlock( &image->lock ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + return; + } + // Recreate a cache map, set it to all 1 initially as we assume the image was complete + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + image->cache_map = malloc( byteSize ); + memset( image->cache_map, 0xff, byteSize ); + } + while ( pos < end ) { + const size_t map_y = (int)( pos >> 15 ); + const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 + 
const int bit_mask = 1 << map_x; + if ( set ) { + if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; + image->cache_map[map_y] |= (uint8_t)bit_mask; + } else { + image->cache_map[map_y] &= (uint8_t)~bit_mask; + } + pos += DNBD3_BLOCK_SIZE; + } + if ( setNewBlocks && image->crc32 != NULL ) { + // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks + // for checking, even though this might lead to checking some hash block again, if it was + // already complete and the block range spanned at least two hash blocks. + // First set start and end to borders of hash blocks + start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); + end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); + pos = start; + while ( pos < end ) { + if ( image->cache_map == NULL ) break; + const int block = (int)( pos / HASH_BLOCK_SIZE ); + if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { + spin_unlock( &image->lock ); + integrity_check( image, block ); + spin_lock( &image->lock ); + } + pos += HASH_BLOCK_SIZE; + } + } + spin_unlock( &image->lock ); +} + +/** + * Returns true if the given image is complete. 
+ * Also frees cache_map and deletes it on disk + * if it hasn't been complete before + * Locks on: image.lock + */ +bool image_isComplete(dnbd3_image_t *image) +{ + assert( image != NULL ); + spin_lock( &image->lock ); + if ( image->virtualFilesize == 0 ) { + spin_unlock( &image->lock ); + return false; + } + if ( image->cache_map == NULL ) { + spin_unlock( &image->lock ); + return true; + } + bool complete = true; + int j; + const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + for (j = 0; j < map_len_bytes - 1; ++j) { + if ( image->cache_map[j] != 0xFF ) { + complete = false; + break; + } + } + if ( complete ) { // Every block except the last one is complete + // Last one might need extra treatment if it's not a full byte + const int blocks_in_last_byte = (image->virtualFilesize >> 12) & 7; + uint8_t last_byte = 0; + if ( blocks_in_last_byte == 0 ) { + last_byte = 0xFF; + } else { + for (j = 0; j < blocks_in_last_byte; ++j) + last_byte |= (uint8_t)(1 << j); + } + complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + } + if ( !complete ) { + spin_unlock( &image->lock ); + return false; + } + char mapfile[PATHLEN] = ""; + free( image->cache_map ); + image->cache_map = NULL; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + spin_unlock( &image->lock ); + unlink( mapfile ); + return true; +} + +/** + * Make sure readFd is open, useful when closeUnusedFd is active. + * This function assumes you called image_lock first, so its known + * to be active and the fd won't be closed halfway through the + * function. + * Does not update atime, so the fd might be closed again very soon. + * Since the caller should have image_lock()ed first, it could do + * a quick operation on it before calling image_release which + * guarantees that the fd will not be closed meanwhile. 
+ */ +bool image_ensureOpen(dnbd3_image_t *image) +{ + if ( image->readFd != -1 ) return image; + int newFd = open( image->path, O_RDONLY ); + if ( newFd != -1 ) { + // Check size + const off_t flen = lseek( newFd, 0, SEEK_END ); + if ( flen == -1 ) { + logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + close( newFd ); + newFd = -1; + } else if ( (uint64_t)flen != image->realFilesize ) { + logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen ); + close( newFd ); + newFd = -1; + } + } + if ( newFd == -1 ) { + spin_lock( &image->lock ); + image->working = false; + spin_unlock( &image->lock ); + return false; + } + spin_lock( &image->lock ); + if ( image->readFd == -1 ) { + image->readFd = newFd; + spin_unlock( &image->lock ); + } else { + // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed + spin_unlock( &image->lock ); + close( newFd ); + } + return image->readFd != -1; +} + +/** + * Get an image by name+rid. This function increases a reference counter, + * so you HAVE TO CALL image_release for every image_get() call at some + * point... + * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) +{ + int i; + const char *removingText = _removeMissingImages ? 
", removing from list" : ""; + dnbd3_image_t *candidate = NULL; + // Simple sanity check + const size_t slen = strlen( name ); + if ( slen == 0 || name[slen - 1] == '/' || name[0] == '/' ) return NULL ; + // Go through array + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = _images[i]; + if ( image == NULL || strcmp( image->name, name ) != 0 ) continue; + if ( revision == image->rid ) { + candidate = image; + break; + } else if ( revision == 0 && (candidate == NULL || candidate->rid < image->rid) ) { + candidate = image; + } + } + + // Not found + if ( candidate == NULL ) { + spin_unlock( &imageListLock ); + return NULL ; + } + + spin_lock( &candidate->lock ); + spin_unlock( &imageListLock ); + candidate->users++; + spin_unlock( &candidate->lock ); + + // Found, see if it works +// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list +// TODO: But remember size-changed images forever + if ( candidate->working || checkIfWorking ) { + // Is marked working, but might not have an fd open + if ( !image_ensureOpen( candidate ) ) { + spin_lock( &candidate->lock ); + timing_get( &candidate->lastWorkCheck ); + spin_unlock( &candidate->lock ); + if ( _removeMissingImages ) { + candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller + } + return candidate; + } + } + + if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state + + // ...not working... 
+ + // Don't re-check too often + spin_lock( &candidate->lock ); + bool check; + declare_now; + check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS; + if ( check ) { + candidate->lastWorkCheck = now; + } + spin_unlock( &candidate->lock ); + if ( !check ) { + return candidate; + } + + // reaching this point means: + // 1) We should check if the image is working, it might or might not be in working state right now + // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.) + // 3) We made sure not to re-check this image too often + + // Common for ro and rw images: Size check, read check + const off_t len = lseek( candidate->readFd, 0, SEEK_END ); + bool reload = false; + if ( len == -1 ) { + logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText ); + reload = true; + } else if ( (uint64_t)len != candidate->realFilesize ) { + logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 + ". Try sending SIGHUP to server if you know what you're doing.", + candidate->path, candidate->realFilesize, (uint64_t)len ); + } else { + // Seek worked, file size is same, now see if we can read from file + char buffer[100]; + if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { + logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.", + (int)sizeof(buffer), candidate->path, errno, removingText ); + reload = true; + } else if ( !candidate->working ) { + // Seems everything is fine again \o/ + candidate->working = true; + logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid ); + } + } + + if ( reload ) { + // Could not access the image with exising fd - mark for reload which will re-open the file. + // make a copy of the image struct but keep the old one around. 
If/When it's not being used + // anymore, it will be freed automatically. + dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); + img->path = strdup( candidate->path ); + img->name = strdup( candidate->name ); + img->virtualFilesize = candidate->virtualFilesize; + img->realFilesize = candidate->realFilesize; + img->atime = now; + img->masterCrc32 = candidate->masterCrc32; + img->readFd = -1; + img->rid = candidate->rid; + img->users = 1; + img->working = false; + spin_init( &img->lock, PTHREAD_PROCESS_PRIVATE ); + if ( candidate->crc32 != NULL ) { + const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); + img->crc32 = malloc( mb ); + memcpy( img->crc32, candidate->crc32, mb ); + } + spin_lock( &candidate->lock ); + if ( candidate->cache_map != NULL ) { + const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); + img->cache_map = malloc( mb ); + memcpy( img->cache_map, candidate->cache_map, mb ); + } + spin_unlock( &candidate->lock ); + if ( image_addToList( img ) ) { + image_release( candidate ); + candidate = img; + } else { + img->users = 0; + image_free( img ); + } + // readFd == -1 and working == FALSE at this point, + // this function needs some splitting up for handling as we need to run most + // of the above code again. for now we know that the next call for this + // name:rid will get ne newly inserted "img" and try to re-open the file. + } + + // Check if image is incomplete, handle + if ( candidate->cache_map != NULL ) { + if ( candidate->uplink == NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } + } + + return candidate; // We did all we can, hopefully it's working +} + +/** + * Lock the image by increasing its users count + * Returns the image on success, NULL if it is not found in the image list + * Every call to image_lock() needs to be followed by a call to image_release() at some point. 
+ * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users-- +{ + if ( image == NULL ) return NULL ; + int i; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == image ) { + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + image->users++; + spin_unlock( &image->lock ); + return image; + } + } + spin_unlock( &imageListLock ); + return NULL ; +} + +/** + * Release given image. This will decrease the reference counter of the image. + * If the usage counter reaches 0 and the image is not in the images array + * anymore, the image will be freed + * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_release(dnbd3_image_t *image) +{ + if ( image == NULL ) return NULL; + spin_lock( &imageListLock ); + spin_lock( &image->lock ); + assert( image->users > 0 ); + image->users--; + bool inUse = image->users != 0; + spin_unlock( &image->lock ); + if ( inUse ) { // Still in use, do nothing + spin_unlock( &imageListLock ); + return NULL; + } + // Getting here means we decreased the usage counter to zero + // If the image is not in the images list anymore, we're + // responsible for freeing it + for (int i = 0; i < _num_images; ++i) { + if ( _images[i] == image ) { // Found, do nothing + spin_unlock( &imageListLock ); + return NULL; + } + } + spin_unlock( &imageListLock ); + // So it wasn't in the images list anymore either, get rid of it + if ( !inUse ) image = image_free( image ); + return NULL; +} + +/** + * Returns true if the given file name ends in one of our meta data + * file extensions. Used to prevent loading them as images. 
+ */ +static bool isForbiddenExtension(const char* name) +{ + const size_t len = strlen( name ); + if ( len < 4 ) return false; + const char *ptr = name + len - 4; + if ( strcmp( ptr, ".crc" ) == 0 ) return true; // CRC list + if ( strcmp( ptr, ".map" ) == 0 ) return true; // cache map for incomplete images + if ( len < 5 ) return false; + --ptr; + if ( strcmp( ptr, ".meta" ) == 0 ) return true; // Meta data (currently not in use) + return false; +} + +/** + * Remove image from images array. Only free it if it has + * no active users and was actually in the list. + * Locks on: imageListLock, image[].lock + * @return NULL if image was also freed, image otherwise + */ +static dnbd3_image_t* image_remove(dnbd3_image_t *image) +{ + bool mustFree = false; + spin_lock( &imageListLock ); + spin_lock( &image->lock ); + for ( int i = _num_images - 1; i >= 0; --i ) { + if ( _images[i] == image ) { + _images[i] = NULL; + mustFree = ( image->users == 0 ); + } + if ( _images[i] == NULL && i + 1 == _num_images ) _num_images--; + } + spin_unlock( &image->lock ); + spin_unlock( &imageListLock ); + if ( mustFree ) image = image_free( image ); + return image; +} + +/** + * Kill all uplinks + */ +void image_killUplinks() +{ + int i; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == NULL ) continue; + spin_lock( &_images[i]->lock ); + if ( _images[i]->uplink != NULL ) { + spin_lock( &_images[i]->uplink->queueLock ); + if ( !_images[i]->uplink->shutdown ) { + thread_detach( _images[i]->uplink->thread ); + _images[i]->uplink->shutdown = true; + } + spin_unlock( &_images[i]->uplink->queueLock ); + signal_call( _images[i]->uplink->signal ); + } + spin_unlock( &_images[i]->lock ); + } + spin_unlock( &imageListLock ); +} + +/** + * Load all images in given path recursively. + * Pass NULL to use path from config. 
+ */ +bool image_loadAll(char *path) +{ + bool ret; + char imgPath[PATHLEN]; + int imgId; + dnbd3_image_t *imgHandle; + + if ( path == NULL ) path = _basePath; + if ( pthread_mutex_trylock( &reloadLock ) != 0 ) { + logadd( LOG_MINOR, "Could not (re)load image list, already in progress." ); + return false; + } + if ( _removeMissingImages ) { + // Check if all loaded images still exist on disk + logadd( LOG_INFO, "Checking for vanished images" ); + spin_lock( &imageListLock ); + for ( int i = _num_images - 1; i >= 0; --i ) { + if ( _shutdown ) break; + if ( _images[i] == NULL ) { + if ( i + 1 == _num_images ) _num_images--; + continue; + } + imgId = _images[i]->id; + snprintf( imgPath, PATHLEN, "%s", _images[i]->path ); + spin_unlock( &imageListLock ); // isReadable hits the fs; unlock + // Check if fill can still be opened for reading + ret = file_isReadable( imgPath ); + // Lock again, see if image is still there, free if required + spin_lock( &imageListLock ); + if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue; + // Image needs to be removed + imgHandle = _images[i]; + _images[i] = NULL; + if ( i + 1 == _num_images ) _num_images--; + spin_lock( &imgHandle->lock ); + const bool freeImg = ( imgHandle->users == 0 ); + spin_unlock( &imgHandle->lock ); + // We unlocked, but the image has been removed from the list already, so + // there's no way the users-counter can increase at this point. 
+ if ( freeImg ) { + // Image is not in use anymore, free the dangling entry immediately + spin_unlock( &imageListLock ); // image_free might do several fs operations; unlock + image_free( imgHandle ); + spin_lock( &imageListLock ); + } + } + spin_unlock( &imageListLock ); + if ( _shutdown ) { + pthread_mutex_unlock( &reloadLock ); + return true; + } + } + // Now scan for new images + logadd( LOG_INFO, "Scanning for new or modified images" ); + ret = image_load_all_internal( path, path ); + pthread_mutex_unlock( &reloadLock ); + logadd( LOG_INFO, "Finished scanning %s", path ); + return ret; +} + +/** + * Free all images we have, but only if they're not in use anymore. + * Locks on imageListLock + * @return true if all images have been freed + */ +bool image_tryFreeAll() +{ + spin_lock( &imageListLock ); + for (int i = _num_images - 1; i >= 0; --i) { + if ( _images[i] != NULL && _images[i]->users == 0 ) { // XXX Data race... + dnbd3_image_t *image = _images[i]; + _images[i] = NULL; + spin_unlock( &imageListLock ); + image = image_free( image ); + spin_lock( &imageListLock ); + } + if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--; + } + spin_unlock( &imageListLock ); + return _num_images == 0; +} + +/** + * Free image. DOES NOT check if it's in use. 
+ * Indirectly locks on imageListLock, image.lock, uplink.queueLock + */ +static dnbd3_image_t* image_free(dnbd3_image_t *image) +{ + assert( image != NULL ); + if ( !_shutdown ) { + logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); + } + // + uplink_shutdown( image ); + spin_lock( &image->lock ); + free( image->cache_map ); + free( image->crc32 ); + free( image->path ); + free( image->name ); + image->cache_map = NULL; + image->crc32 = NULL; + image->path = NULL; + image->name = NULL; + spin_unlock( &image->lock ); + if ( image->readFd != -1 ) close( image->readFd ); + spin_destroy( &image->lock ); + // + memset( image, 0, sizeof(*image) ); + free( image ); + return NULL ; +} + +bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +{ + if ( cacheMap == NULL ) return true; + const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; + if ( end <= realFilesize ) { + // Trivial case: block in question is not the last block (well, or image size is multiple of HASH_BLOCK_SIZE) + const int startCacheIndex = (int)( ( block * HASH_BLOCK_SIZE ) / ( DNBD3_BLOCK_SIZE * 8 ) ); + const int endCacheIndex = startCacheIndex + (int)( HASH_BLOCK_SIZE / ( DNBD3_BLOCK_SIZE * 8 ) ); + for ( int i = startCacheIndex; i < endCacheIndex; ++i ) { + if ( cacheMap[i] != 0xff ) { + return false; + } + } + } else { + // Special case: Checking last block, which is smaller than HASH_BLOCK_SIZE + for (uint64_t mapPos = block * HASH_BLOCK_SIZE; mapPos < realFilesize; mapPos += DNBD3_BLOCK_SIZE ) { + const size_t map_y = (size_t)( mapPos >> 15 ); + const int map_x = (int)( (mapPos >> 12) & 7 ); // mod 8 + const int mask = 1 << map_x; + if ( (cacheMap[map_y] & mask) == 0 ) return false; + } + } + return true; +} + +/** + * Load all images in the given path recursively, + * consider *base the base path that is to be cut off + */ +static bool image_load_all_internal(char *base, char *path) +{ +#define SUBDIR_LEN 150 + assert( 
path != NULL ); + assert( *path == '/' ); + struct dirent entry, *entryPtr; + const size_t pathLen = strlen( path ); + char subpath[PATHLEN]; + struct stat st; + DIR * const dir = opendir( path ); + + if ( dir == NULL ) { + logadd( LOG_ERROR, "Could not opendir '%s' for loading", path ); + return false; + } + + while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) { + entry = *entryPtr; + if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue; + if ( strlen( entry.d_name ) > SUBDIR_LEN ) { + logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN ); + continue; + } + if ( entry.d_name[0] == '/' || path[pathLen - 1] == '/' ) { + snprintf( subpath, PATHLEN, "%s%s", path, entry.d_name ); + } else { + snprintf( subpath, PATHLEN, "%s/%s", path, entry.d_name ); + } + if ( stat( subpath, &st ) < 0 ) { + logadd( LOG_WARNING, "stat() for '%s' failed. Ignoring....", subpath ); + continue; + } + if ( S_ISDIR( st.st_mode ) ) { + image_load_all_internal( base, subpath ); // Recurse + } else if ( !isForbiddenExtension( subpath ) ) { + image_load( base, subpath, true ); // Load image if possible + } + } + closedir( dir ); + return true; +#undef SUBDIR_LEN +} + +/** + */ +static bool image_addToList(dnbd3_image_t *image) +{ + int i; + static int imgIdCounter = 0; // Used to assign unique numeric IDs to images + spin_lock( &imageListLock ); + // Now we're locked, assign unique ID to image (unique for this running server instance!) + image->id = ++imgIdCounter; + for ( i = 0; i < _num_images; ++i ) { + if ( _images[i] != NULL ) continue; + _images[i] = image; + break; + } + if ( i >= _num_images ) { + if ( _num_images >= _maxImages ) { + spin_unlock( &imageListLock ); + return false; + } + _images[_num_images++] = image; + } + spin_unlock( &imageListLock ); + return true; +} + +/** + * Load image from given path. This will check if the image is + * already loaded and updates its information in that case. 
+ * Note that this is NOT THREAD SAFE so make sure its always + * called on one thread only. + */ +static bool image_load(char *base, char *path, int withUplink) +{ + int revision = -1; + struct stat st; + uint8_t *cache_map = NULL; + uint32_t *crc32list = NULL; + dnbd3_image_t *existing = NULL; + int fdImage = -1; + bool function_return = false; // Return false by default + assert( base != NULL ); + assert( path != NULL ); + assert( *path == '/' ); + assert( strncmp( path, base, strlen(base)) == 0 ); + assert( base[strlen(base) - 1] != '/' ); + assert( strlen(path) > strlen(base) ); + char *lastSlash = strrchr( path, '/' ); + char *fileName = lastSlash + 1; + char imgName[strlen( path )]; + const size_t fileNameLen = strlen( fileName ); + + // Copy virtual path (relative path in "base") + char * const virtBase = path + strlen( base ) + 1; + assert( *virtBase != '/' ); + char *src = virtBase, *dst = imgName; + while ( src <= lastSlash ) { + *dst++ = *src++; + } + *dst = '\0'; + + do { + // Parse file name for revision + // Try to parse *.r<ID> syntax + size_t i; + for (i = fileNameLen - 1; i > 1; --i) { + if ( fileName[i] < '0' || fileName[i] > '9' ) break; + } + if ( i != fileNameLen - 1 && fileName[i] == 'r' && fileName[i - 1] == '.' ) { + revision = atoi( fileName + i + 1 ); + src = fileName; + while ( src < fileName + i - 1 ) { + *dst++ = *src++; + } + *dst = '\0'; + } + } while (0); + + // Legacy mode enabled and no rid extracted from filename? + if ( _vmdkLegacyMode && revision == -1 ) { + fdImage = open( path, O_RDONLY ); // Check if it exists + if ( fdImage == -1 ) goto load_error; + // Yes, simply append full file name and set rid to 1 + strcat( dst, fileName ); + revision = 1; + } + // Did we get anything? 
+ if ( revision <= 0 || revision >= 65536 ) { + logadd( LOG_WARNING, "Image '%s' has invalid revision ID %d", path, revision ); + goto load_error; + } + + // Get pointer to already existing image if possible + existing = image_get( imgName, (uint16_t)revision, true ); + + // ### Now load the actual image related data ### + if ( fdImage == -1 ) { + fdImage = open( path, O_RDONLY ); + } + if ( fdImage == -1 ) { + logadd( LOG_ERROR, "Could not open '%s' for reading...", path ); + goto load_error; + } + // Determine file size + const off_t seekret = lseek( fdImage, 0, SEEK_END ); + if ( seekret < 0 ) { + logadd( LOG_ERROR, "Could not seek to end of file '%s'", path ); + goto load_error; + } else if ( seekret == 0 ) { + logadd( LOG_WARNING, "Empty image file '%s'", path ); + goto load_error; + } + const uint64_t realFilesize = (uint64_t)seekret; + const uint64_t virtualFilesize = ( realFilesize + (DNBD3_BLOCK_SIZE - 1) ) & ~(DNBD3_BLOCK_SIZE - 1); + if ( realFilesize != virtualFilesize ) { + logadd( LOG_DEBUG1, "Image size of '%s' is %" PRIu64 ", virtual size: %" PRIu64, path, realFilesize, virtualFilesize ); + } + + // 1. Allocate memory for the cache map if the image is incomplete + cache_map = image_loadCacheMap( path, virtualFilesize ); + + // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) + + // 2. Load CRC-32 list of image + bool doFullCheck = false; + uint32_t masterCrc = 0; + const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize ); + crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc ); + + // Check CRC32 + if ( crc32list != NULL ) { + if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { + logadd( LOG_ERROR, "quick crc32 check of %s failed. 
Data corruption?", path ); + doFullCheck = true; + } + } + + // Compare data just loaded to identical image we apparently already loaded + if ( existing != NULL ) { + if ( existing->realFilesize != realFilesize ) { + logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + // Image will be replaced below + } else if ( existing->crc32 != NULL && crc32list != NULL + && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) { + logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." ); + logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" ); + // Image will be replaced below + } else if ( existing->crc32 == NULL && crc32list != NULL ) { + logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid ); + existing->crc32 = crc32list; + existing->masterCrc32 = masterCrc; + crc32list = NULL; + function_return = true; + goto load_error; // Keep existing + } else if ( existing->cache_map != NULL && cache_map == NULL ) { + // Just ignore that fact, if replication is really complete the cache map will be removed anyways + logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); + function_return = true; + goto load_error; // Keep existing + } else { + // Nothing changed about the existing image, so do nothing + logadd( LOG_DEBUG1, "Did not change" ); + function_return = true; + goto load_error; // Keep existing + } + // Remove existing image from images array, so it will be replaced by the reloaded image + existing = image_remove( existing ); + existing = image_release( existing ); + } + + // Load fresh image + dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); + image->path = 
strdup( path ); + image->name = strdup( imgName ); + image->cache_map = cache_map; + image->crc32 = crc32list; + image->masterCrc32 = masterCrc; + image->uplink = NULL; + image->realFilesize = realFilesize; + image->virtualFilesize = virtualFilesize; + image->rid = (uint16_t)revision; + image->users = 0; + image->readFd = -1; + image->working = (image->cache_map == NULL ); + timing_get( &image->nextCompletenessEstimate ); + image->completenessEstimate = -1; + spin_init( &image->lock, PTHREAD_PROCESS_PRIVATE ); + int32_t offset; + if ( stat( path, &st ) == 0 ) { + // Negatively offset atime by file modification time + offset = (int32_t)( st.st_mtime - time( NULL ) ); + if ( offset > 0 ) offset = 0; + } else { + offset = 0; + } + timing_gets( &image->atime, offset ); + + // Prevent freeing in cleanup + cache_map = NULL; + crc32list = NULL; + + // Get rid of cache map if image is complete + if ( image->cache_map != NULL ) { + image_isComplete( image ); + } + + // Image is definitely incomplete, initialize uplink worker + if ( image->cache_map != NULL ) { + image->working = false; + if ( withUplink ) { + uplink_init( image, -1, NULL, -1 ); + } + } + + // ### Reaching this point means loading succeeded + image->readFd = fdImage; + if ( image_addToList( image ) ) { + // Keep fd for reading + fdImage = -1; + } else { + logadd( LOG_ERROR, "Image list full: Could not add image %s", path ); + image->readFd = -1; // Keep fdImage instead, will be closed below + image = image_free( image ); + goto load_error; + } + logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid ); + // CRC errors found... 
+ if ( doFullCheck ) { + logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid ); + integrity_check( image, -1 ); + } + + function_return = true; + + // Clean exit: +load_error: ; + if ( existing != NULL ) existing = image_release( existing ); + if ( crc32list != NULL ) free( crc32list ); + if ( cache_map != NULL ) free( cache_map ); + if ( fdImage != -1 ) close( fdImage ); + return function_return; +} + +static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +{ + uint8_t *retval = NULL; + char mapFile[strlen( imagePath ) + 10 + 1]; + sprintf( mapFile, "%s.map", imagePath ); + int fdMap = open( mapFile, O_RDONLY ); + if ( fdMap >= 0 ) { + const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); + retval = calloc( 1, map_size ); + const ssize_t rd = read( fdMap, retval, map_size ); + if ( map_size != rd ) { + logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); + // Could not read complete map, that means the rest of the image file will be considered incomplete + } + close( fdMap ); + // Later on we check if the hash map says the image is complete + } + return retval; +} + +static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc) +{ + assert( masterCrc != NULL ); + uint32_t *retval = NULL; + const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( fileSize ); + // Currently this should only prevent accidental corruption (esp. 
regarding transparent proxy mode) + // but maybe later on you want better security + char hashFile[strlen( imagePath ) + 10 + 1]; + sprintf( hashFile, "%s.crc", imagePath ); + int fdHash = open( hashFile, O_RDONLY ); + if ( fdHash >= 0 ) { + off_t fs = lseek( fdHash, 0, SEEK_END ); + if ( fs < (hashBlocks + 1) * 4 ) { + logadd( LOG_WARNING, "Ignoring crc32 list for '%s' as it is too short", imagePath ); + } else { + if ( pread( fdHash, masterCrc, sizeof(uint32_t), 0 ) != sizeof(uint32_t) ) { + logadd( LOG_WARNING, "Error reading first crc32 of '%s'", imagePath ); + } else { + const size_t crcFileLen = hashBlocks * sizeof(uint32_t); + size_t pos = 0; + retval = calloc( hashBlocks, sizeof(uint32_t) ); + while ( pos < crcFileLen ) { + ssize_t ret = pread( fdHash, retval + pos, crcFileLen - pos, pos + sizeof(uint32_t) /* skip master-crc */ ); + if ( ret == -1 ) { + if ( errno == EINTR || errno == EAGAIN ) continue; + } + if ( ret <= 0 ) break; + pos += ret; + } + if ( pos != crcFileLen ) { + free( retval ); + retval = NULL; + logadd( LOG_WARNING, "Could not read crc32 list of '%s'", imagePath ); + } else { + uint32_t lists_crc = crc32( 0, NULL, 0 ); + lists_crc = crc32( lists_crc, (uint8_t*)retval, hashBlocks * sizeof(uint32_t) ); + lists_crc = net_order_32( lists_crc ); + if ( lists_crc != *masterCrc ) { + free( retval ); + retval = NULL; + logadd( LOG_WARNING, "CRC-32 of CRC-32 list mismatch. CRC-32 list of '%s' might be corrupted.", imagePath ); + } + } + } + } + close( fdHash ); + } + return retval; +} + +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +{ + // This checks the first block and (up to) count - 1 random blocks for corruption + // via the known crc32 list. This is very sloppy and is merely supposed to detect + // accidental corruption due to broken dnbd3-proxy functionality or file system + // corruption. 
+ assert( count > 0 ); + const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize ); + int blocks[count + 1]; + int index = 0, j; + int block; + if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) blocks[index++] = 0; + int tries = count * 5; // Try only so many times to find a non-duplicate complete block + while ( index + 1 < count && --tries > 0 ) { + block = rand() % hashBlocks; // Random block + for ( j = 0; j < index; ++j ) { // Random block already in list? + if ( blocks[j] == block ) goto while_end; + } + // Block complete? If yes, add to list + if ( image_isHashBlockComplete( cache_map, block, realFilesize ) ) blocks[index++] = block; +while_end: ; + } + blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1 + return image_checkBlocksCrc32( fdImage, crc32list, blocks, realFilesize ); // Return result of check +} + +/** + * Create a new image with the given image name and revision id in _basePath + * Returns true on success, false otherwise + */ +bool image_create(char *image, int revision, uint64_t size) +{ + assert( image != NULL ); + assert( size >= DNBD3_BLOCK_SIZE ); + if ( revision <= 0 ) { + logadd( LOG_ERROR, "revision id invalid: %d", revision ); + return false; + } + char path[PATHLEN], cache[PATHLEN]; + char *lastSlash = strrchr( image, '/' ); + if ( lastSlash == NULL ) { + snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); + } else { + *lastSlash = '\0'; + snprintf( path, PATHLEN, "%s/%s", _basePath, image ); + mkdir_p( path ); + *lastSlash = '/'; + snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); + } + snprintf( cache, PATHLEN, "%s.map", path ); + size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + const int mapsize = IMGSIZE_TO_MAPBYTES(size); + // Write files + int fdImage = -1, fdCache = -1; + fdImage = open( path, O_RDWR | O_TRUNC | O_CREAT, 0644 ); + fdCache = open( cache, O_RDWR | O_TRUNC | O_CREAT, 0644 ); + if ( fdImage < 0 ) { + logadd( 
LOG_ERROR, "Could not open %s for writing.", path ); + goto failure_cleanup; + } + if ( fdCache < 0 ) { + logadd( LOG_ERROR, "Could not open %s for writing.", cache ); + goto failure_cleanup; + } + // Try cache map first + if ( !file_alloc( fdCache, 0, mapsize ) && !file_setSize( fdCache, mapsize ) ) { + const int err = errno; + logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err ); + } + // Now write image + if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) { + logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); + logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk" + " space without actually writing all zeroes to the block device." ); + logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect" + " divine performance during replication." ); + goto failure_cleanup; + } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) { + logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); + logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." ); + goto failure_cleanup; + } + close( fdImage ); + close( fdCache ); + return true; + // +failure_cleanup: ; + if ( fdImage >= 0 ) close( fdImage ); + if ( fdCache >= 0 ) close( fdCache ); + remove( path ); + remove( cache ); + return false; +} + +static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len); +static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid); + +/** + * Does the same as image_get, but if the image is not known locally, or if + * revision 0 is requested, it will: + * a) Try to clone it from an authoritative dnbd3 server, if + * the server is running in proxy mode. 
* b) Try to load it from disk by constructing the appropriate file name, if not
 *    running in proxy mode.
 *
 * If the return value is not NULL,
 * image_release needs to be called on the image at some point.
 * Locks on: remoteCloneLock, imageListLock, _images[].lock
 */
dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
{
	// specific revision - try shortcut
	if ( revision != 0 ) {
		dnbd3_image_t *image = image_get( name, revision, true );
		if ( image != NULL ) return image;
	}
	const size_t len = strlen( name );
	// Sanity check: reject empty names, leading/trailing slashes and any
	// path component that starts with a dot
	if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
			|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
	// Call specific function depending on whether this is a proxy or not
	if ( _isProxy ) {
		return loadImageProxy( name, revision, len );
	} else {
		return loadImageServer( name, revision );
	}
}

/**
 * Called if specific rid is not loaded, or if rid is 0 (some version might be loaded locally,
 * but we should check if there's a higher rid on a remote server).
 * name/revision identify the requested image, len is strlen( name ).
 * Returns the image (reference held, release with image_release) or NULL.
 */
static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len)
{
	// Already existing locally?
	dnbd3_image_t *image = NULL;
	if ( revision == 0 ) {
		image = image_get( name, revision, true );
	}

	// Doesn't exist or is rid 0, try remote if not already tried it recently
	declare_now;
	char *cmpname = name;
	int useIndex = -1, fallbackIndex = 0;
	// Cache entries hold at most NAMELEN - 1 chars, so compare only the tail of long names
	if ( len >= NAMELEN ) cmpname += 1 + len - NAMELEN;
	pthread_mutex_lock( &remoteCloneLock );
	for (int i = 0; i < CACHELEN; ++i) {
		if ( remoteCloneCache[i].rid == revision && strcmp( cmpname, remoteCloneCache[i].name ) == 0 ) {
			useIndex = i;
			// Entry expired: leave the loop and query the remote servers again
			if ( timing_reached( &remoteCloneCache[i].deadline, &now ) ) break;
			pthread_mutex_unlock( &remoteCloneLock ); // Was recently checked...
			return image;
		}
		// Remember the slot with the earliest deadline as the one to overwrite
		if ( timing_1le2( &remoteCloneCache[i].deadline, &remoteCloneCache[fallbackIndex].deadline ) ) {
			fallbackIndex = i;
		}
	}
	// Re-check to prevent two clients at the same time triggering this,
	// but only if rid != 0, since we would just get an old rid then
	if ( revision != 0 ) {
		if ( image == NULL ) image = image_get( name, revision, true );
		if ( image != NULL ) {
			pthread_mutex_unlock( &remoteCloneLock );
			return image;
		}
	}
	// Reaching this point means we should contact an authority server
	serialized_buffer_t serialized;
	// Mark as recently checked
	if ( useIndex == -1 ) {
		useIndex = fallbackIndex;
	}
	timing_set( &remoteCloneCache[useIndex].deadline, &now, SERVER_REMOTE_IMAGE_CHECK_CACHETIME );
	snprintf( remoteCloneCache[useIndex].name, NAMELEN, "%s", cmpname );
	remoteCloneCache[useIndex].rid = revision;
	pthread_mutex_unlock( &remoteCloneLock );

	// Get some alt servers and try to get the image from there
#define REP_NUM_SRV (8)
	dnbd3_host_t servers[REP_NUM_SRV];
	int uplinkSock = -1;
	dnbd3_host_t uplinkServer;
	const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
	uint16_t remoteProtocolVersion;
	uint16_t remoteRid = revision;
	uint64_t remoteImageSize;
	struct sockaddr_storage sa;
	socklen_t salen;
	poll_list_t *cons = sock_newPollList();
	logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
	for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
		char *remoteName;
		bool ok = false;
		int sock;
		if ( i >= count ) {
			// All servers were handed to sock_multiConnect already, only wait for results
			sock = sock_multiConnect( cons, NULL, 100, 1000 );
			if ( sock == -2 ) break;
		} else {
			if ( log_hasMask( LOG_DEBUG2 ) ) {
				char host[50];
				// NOTE(review): this len shadows the function parameter of the same name
				size_t len = sock_printHost( &servers[i], host, sizeof(host) );
				host[len] = '\0';
				logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
			}
			sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
		}
		if ( sock == -1 || sock == -2 ) continue;
		salen = sizeof(sa);
		if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
			logadd( LOG_MINOR, "getpeername on successful connection failed!? (errno=%d)", errno );
			goto server_fail;
		}
		if ( !dnbd3_select_image( sock, name, revision, SI_SERVER_FLAGS ) ) goto server_fail;
		if ( !dnbd3_select_image_reply( &serialized, sock, &remoteProtocolVersion, &remoteName, &remoteRid, &remoteImageSize ) ) goto server_fail;
		if ( remoteProtocolVersion < MIN_SUPPORTED_SERVER || remoteRid == 0 ) goto server_fail;
		if ( revision != 0 && remoteRid != revision ) goto server_fail; // Want specific revision but uplink supplied different rid
		if ( revision == 0 && image != NULL && image->rid >= remoteRid ) goto server_fail; // Not actually a failure: Highest remote rid is <= highest local rid - don't clone!
		if ( remoteImageSize < DNBD3_BLOCK_SIZE || remoteName == NULL || strcmp( name, remoteName ) != 0 ) goto server_fail;
		if ( remoteImageSize > _maxReplicationSize ) {
			logadd( LOG_MINOR, "Won't proxy '%s:%d': Larger than maxReplicationSize", name, (int)revision );
			goto server_fail;
		}
		pthread_mutex_lock( &reloadLock );
		// Ensure disk space entirely if not using sparse files, otherwise just make sure we have some room at least
		if ( _sparseFiles ) {
			ok = image_ensureDiskSpace( 2ull * 1024 * 1024 * 1024, false ); // 2GiB, maybe configurable one day
		} else {
			ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
		}
		ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
		pthread_mutex_unlock( &reloadLock );
		if ( !ok ) goto server_fail;

		// Cloning worked :-)
		uplinkSock = sock;
		if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
			uplinkServer.type = 0;
		}
		break;

server_fail: ;
		close( sock );
	}
	sock_destroyPollList( cons );

	// If we still have a pointer to a local image, release the reference
	if ( image != NULL ) image_release( image );
	// If everything worked out, this call should now actually return the image
	image = image_get( name, remoteRid, false );
	if ( image != NULL && uplinkSock != -1 ) {
		// If so, init the uplink and pass it the socket
		sock_setTimeout( uplinkSock, _uplinkTimeout );
		if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
			close( uplinkSock );
		} else {
			// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
			int i = 0;
			while ( !image->working && ++i < 100 )
				usleep( 2000 );
		}
	} else if ( uplinkSock != -1 ) {
		close( uplinkSock );
	}
	return image;
}

/**
 * Called if specific rid is not loaded, or if rid is 0, in which case we check on
 * disk which revision is latest.
+ */ +static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid) +{ + char imageFile[PATHLEN] = ""; + uint16_t detectedRid = 0; + + if ( requestedRid != 0 ) { + snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid ); + detectedRid = requestedRid; + } else { + glob_t g; + snprintf( imageFile, PATHLEN, "%s/%s.r*", _basePath, name ); + const int ret = glob( imageFile, GLOB_NOSORT | GLOB_MARK, NULL, &g ); + imageFile[0] = '\0'; + if ( ret == 0 ) { + long int best = 0; + for ( size_t i = 0; i < g.gl_pathc; ++i ) { + const char * const path = g.gl_pathv[i]; + const char * rev = strrchr( path, 'r' ); + if ( rev == NULL || rev == path || *(rev - 1) != '.' ) continue; + rev++; + if ( *rev < '0' || *rev > '9' ) continue; + char *err = NULL; + long int val = strtol( rev, &err, 10 ); + if ( err == NULL || *err != '\0' ) continue; + if ( val > best ) { + best = val; + snprintf( imageFile, PATHLEN, "%s", g.gl_pathv[i] ); + } + } + if ( best > 0 && best < 65536 ) { + detectedRid = (uint16_t)best; + } + } + globfree( &g ); + } + if ( _vmdkLegacyMode && requestedRid <= 1 + && !isForbiddenExtension( name ) + && ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) { + snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name ); + detectedRid = 1; + } + logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile ); + // No file was determined, or it doesn't seem to exist/be readable + if ( detectedRid == 0 ) { + logadd( LOG_DEBUG2, "Not found, bailing out" ); + return image_get( name, requestedRid, true ); + } + if ( !_vmdkLegacyMode && requestedRid == 0 ) { + // rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0 + while ( detectedRid != 0 ) { + dnbd3_image_t *image = image_get( name, detectedRid, true ); + if ( image != NULL ) { + // globbed rid already loaded, return + return image; + } + if ( file_isReadable( imageFile ) ) { + // globbed rid 
is + break; + } + logadd( LOG_DEBUG2, "%s: rid %d globbed but not readable, trying lower rid...", name, (int)detectedRid ); + detectedRid--; + snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, requestedRid ); + } + } + + // Now lock on the loading mutex, then check again if the image exists (we're multi-threaded) + pthread_mutex_lock( &reloadLock ); + dnbd3_image_t* image = image_get( name, detectedRid, true ); + if ( image != NULL ) { + // The image magically appeared in the meantime + logadd( LOG_DEBUG2, "Magically appeared" ); + pthread_mutex_unlock( &reloadLock ); + return image; + } + // Still not loaded, let's try to do so + logadd( LOG_DEBUG2, "Calling load" ); + image_load( _basePath, imageFile, false ); + pthread_mutex_unlock( &reloadLock ); + // If loading succeeded, this will return the image + logadd( LOG_DEBUG2, "Calling get" ); + return image_get( name, requestedRid, true ); +} + +/** + * Prepare a cloned image: + * 1. Allocate empty image file and its cache map + * 2. Use passed socket to request the crc32 list and save it to disk + * 3. 
Load the image from disk + * Returns: true on success, false otherwise + */ +static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize) +{ + // Allocate disk space and create cache map + if ( !image_create( name, revision, imageSize ) ) return false; + // CRC32 + const size_t len = strlen( _basePath ) + strlen( name ) + 20; + char crcFile[len]; + snprintf( crcFile, len, "%s/%s.r%d.crc", _basePath, name, (int)revision ); + if ( !file_isReadable( crcFile ) ) { + // Get crc32list from remote server + size_t crc32len = IMGSIZE_TO_HASHBLOCKS(imageSize) * sizeof(uint32_t); + uint32_t masterCrc; + uint8_t *crc32list = malloc( crc32len ); + if ( !dnbd3_get_crc32( sock, &masterCrc, crc32list, &crc32len ) ) { + free( crc32list ); + return false; + } + if ( crc32len != 0 ) { + uint32_t lists_crc = crc32( 0, NULL, 0 ); + lists_crc = crc32( lists_crc, (uint8_t*)crc32list, crc32len ); + lists_crc = net_order_32( lists_crc ); + if ( lists_crc != masterCrc ) { + logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name ); + } else { + int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 ); + write( fd, &masterCrc, sizeof(uint32_t) ); + write( fd, crc32list, crc32len ); + close( fd ); + } + } + free( crc32list ); + } + // HACK: Chop of ".crc" to get the image file name + crcFile[strlen( crcFile ) - 4] = '\0'; + return image_load( _basePath, crcFile, false ); +} + +/** + * Generate the crc32 block list file for the given file. + * This function wants a plain file name instead of a dnbd3_image_t, + * as it can be used directly from the command line. 
+ */ +bool image_generateCrcFile(char *image) +{ + int fdCrc = -1; + uint32_t crc; + char crcFile[strlen( image ) + 4 + 1]; + int fdImage = open( image, O_RDONLY ); + + if ( fdImage == -1 ) { + logadd( LOG_ERROR, "Could not open %s.", image ); + return false; + } + + const int64_t fileLen = lseek( fdImage, 0, SEEK_END ); + if ( fileLen <= 0 ) { + logadd( LOG_ERROR, "Error seeking to end, or file is empty." ); + goto cleanup_fail; + } + + struct stat sst; + sprintf( crcFile, "%s.crc", image ); + if ( stat( crcFile, &sst ) == 0 ) { + logadd( LOG_ERROR, "CRC File for %s already exists! Delete it first if you want to regen.", image ); + goto cleanup_fail; + } + + fdCrc = open( crcFile, O_RDWR | O_CREAT, 0644 ); + if ( fdCrc == -1 ) { + logadd( LOG_ERROR, "Could not open CRC File %s for writing..", crcFile ); + goto cleanup_fail; + } + // CRC of all CRCs goes first. Don't know it yet, write 4 bytes dummy data. + if ( write( fdCrc, crcFile, sizeof(crc) ) != sizeof(crc) ) { + logadd( LOG_ERROR, "Write error" ); + goto cleanup_fail; + } + + printf( "Generating CRC32" ); + fflush( stdout ); + const int blockCount = IMGSIZE_TO_HASHBLOCKS( fileLen ); + for ( int i = 0; i < blockCount; ++i ) { + if ( !image_calcBlockCrc32( fdImage, i, fileLen, &crc ) ) { + goto cleanup_fail; + } + if ( write( fdCrc, &crc, sizeof(crc) ) != sizeof(crc) ) { + printf( "\nWrite error writing crc file: %d\n", errno ); + goto cleanup_fail; + } + putchar( '.' ); + fflush( stdout ); + } + close( fdImage ); + fdImage = -1; + printf( "done!\n" ); + + logadd( LOG_INFO, "Generating master-crc..." 
); + fflush( stdout ); + // File is written - read again to calc master crc + if ( lseek( fdCrc, 4, SEEK_SET ) != 4 ) { + logadd( LOG_ERROR, "Could not seek to beginning of crc list in file" ); + goto cleanup_fail; + } + char buffer[400]; + int blocksToGo = blockCount; + crc = crc32( 0, NULL, 0 ); + while ( blocksToGo > 0 ) { + const int numBlocks = MIN( (int)( sizeof(buffer) / sizeof(crc) ), blocksToGo ); + if ( read( fdCrc, buffer, numBlocks * sizeof(crc) ) != numBlocks * (int)sizeof(crc) ) { + logadd( LOG_ERROR, "Could not re-read from crc32 file" ); + goto cleanup_fail; + } + crc = crc32( crc, (uint8_t*)buffer, numBlocks * sizeof(crc) ); + blocksToGo -= numBlocks; + } + crc = net_order_32( crc ); + if ( pwrite( fdCrc, &crc, sizeof(crc), 0 ) != sizeof(crc) ) { + logadd( LOG_ERROR, "Could not write master crc to file" ); + goto cleanup_fail; + } + logadd( LOG_INFO, "CRC-32 file successfully generated." ); + fflush( stdout ); + return true; + +cleanup_fail:; + if ( fdImage != -1 ) close( fdImage ); + if ( fdCrc != -1 ) close( fdCrc ); + return false; +} + +json_t* image_getListAsJson() +{ + json_t *imagesJson = json_array(); + json_t *jsonImage; + int i; + char uplinkName[100] = { 0 }; + uint64_t bytesReceived; + int users, completeness, idleTime; + declare_now; + + spin_lock( &imageListLock ); + for ( i = 0; i < _num_images; ++i ) { + if ( _images[i] == NULL ) continue; + dnbd3_image_t *image = _images[i]; + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + users = image->users; + idleTime = (int)timing_diff( &image->atime, &now ); + completeness = image_getCompletenessEstimate( image ); + if ( image->uplink == NULL ) { + bytesReceived = 0; + uplinkName[0] = '\0'; + } else { + bytesReceived = image->uplink->bytesReceived; + if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) { + uplinkName[0] = '\0'; + } + } + image->users++; // Prevent freeing after we unlock + spin_unlock( &image->lock 
); + + jsonImage = json_pack( "{sisssisisisisI}", + "id", image->id, // id, name, rid never change, so access them without locking + "name", image->name, + "rid", (int) image->rid, + "users", users, + "complete", completeness, + "idle", idleTime, + "size", (json_int_t)image->virtualFilesize ); + if ( bytesReceived != 0 ) { + json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) ); + } + if ( uplinkName[0] != '\0' ) { + json_object_set_new( jsonImage, "uplinkServer", json_string( uplinkName ) ); + } + json_array_append_new( imagesJson, jsonImage ); + + image = image_release( image ); // Since we did image->users++; + spin_lock( &imageListLock ); + } + spin_unlock( &imageListLock ); + return imagesJson; +} + +/** + * Get completeness of an image in percent. Only estimated, not exact. + * Returns: 0-100 + * DOES NOT LOCK, so make sure to do so before calling + */ +int image_getCompletenessEstimate(dnbd3_image_t * const image) +{ + assert( image != NULL ); + if ( image->cache_map == NULL ) return image->working ? 100 : 0; + declare_now; + if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { + // Since this operation is relatively expensive, we cache the result for a while + return image->completenessEstimate; + } + int i; + int percent = 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( len == 0 ) return 0; + for ( i = 0; i < len; ++i ) { + if ( image->cache_map[i] == 0xff ) { + percent += 100; + } else if ( image->cache_map[i] != 0 ) { + percent += 50; + } + } + image->completenessEstimate = percent / len; + timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + return image->completenessEstimate; +} + +/** + * Check the CRC-32 of the given blocks. The array "blocks" is of variable length. + * !! pass -1 as the last block so the function knows when to stop !! + * Does NOT check whether block index is within image. 
+ * Returns true or false + */ +bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks, const uint64_t realFilesize) +{ + while ( *blocks != -1 ) { + uint32_t crc; + if ( !image_calcBlockCrc32( fd, *blocks, realFilesize, &crc ) ) { + return false; + } + if ( crc != crc32list[*blocks] ) { + logadd( LOG_WARNING, "Block %d is %x, should be %x", *blocks, crc, crc32list[*blocks] ); + return false; + } + blocks++; + } + return true; +} + +/** + * Calc CRC-32 of block. Value is returned as little endian. + */ +static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc) +{ + // Make buffer 4k aligned in case fd has O_DIRECT set +#define BSIZE 262144 + char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE]; + char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ); + // How many bytes to read from the input file + const uint64_t bytesFromFile = MIN( HASH_BLOCK_SIZE, realFilesize - ( block * HASH_BLOCK_SIZE) ); + // Determine how many bytes we had to read if the file size were a multiple of 4k + // This might be the same value if the real file's size is a multiple of 4k + const uint64_t vbs = ( ( realFilesize + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ) - ( block * HASH_BLOCK_SIZE ); + const uint64_t virtualBytesFromFile = MIN( HASH_BLOCK_SIZE, vbs ); + const off_t readPos = (int64_t)block * HASH_BLOCK_SIZE; + size_t bytes = 0; + assert( vbs >= bytesFromFile ); + *crc = crc32( 0, NULL, 0 ); + // Calculate the crc32 by reading data from the file + while ( bytes < bytesFromFile ) { + const size_t n = (size_t)MIN( BSIZE, bytesFromFile - bytes ); + const ssize_t r = pread( fd, buffer, n, readPos + bytes ); + if ( r <= 0 ) { + logadd( LOG_WARNING, "CRC: Read error (errno=%d)", errno ); + return false; + } + *crc = crc32( *crc, (uint8_t*)buffer, r ); + bytes += (size_t)r; + } + // If the virtual file size is different, keep going using nullbytes + 
if ( bytesFromFile < virtualBytesFromFile ) { + memset( buffer, 0, BSIZE ); + bytes = (size_t)( virtualBytesFromFile - bytesFromFile ); + while ( bytes != 0 ) { + const size_t len = MIN( BSIZE, bytes ); + *crc = crc32( *crc, (uint8_t*)buffer, len ); + bytes -= len; + } + } + *crc = net_order_32( *crc ); + return true; +#undef BSIZE +} + +/** + * Call image_ensureDiskSpace (below), but aquire + * reloadLock first. + */ +bool image_ensureDiskSpaceLocked(uint64_t size, bool force) +{ + bool ret; + pthread_mutex_lock( &reloadLock ); + ret = image_ensureDiskSpace( size, force ); + pthread_mutex_unlock( &reloadLock ); + return ret; +} + +/** + * Make sure at least size bytes are available in _basePath. + * Will delete old images to make room for new ones. + * TODO: Store last access time of images. Currently the + * last access time is reset to the file modification time + * on server restart. Thus it will + * currently only delete images if server uptime is > 10 hours. + * This can be overridden by setting force to true, in case + * free space is desperately needed. + * Return true iff enough space is available. false in random other cases + */ +static bool image_ensureDiskSpace(uint64_t size, bool force) +{ + for ( int maxtries = 0; maxtries < 20; ++maxtries ) { + uint64_t available; + if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) { + const int e = errno; + logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... 
;-)\n", e ); + return true; + } + if ( available > size ) return true; + if ( !force && dnbd3_serverUptime() < 10 * 3600 ) { + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)) ); + return false; + } + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)) ); + // Find least recently used image + dnbd3_image_t *oldest = NULL; + int i; // XXX improve locking + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == NULL ) continue; + dnbd3_image_t *current = image_lock( _images[i] ); + if ( current == NULL ) continue; + if ( current->users == 1 ) { // Just from the lock above + if ( oldest == NULL || timing_1le2( ¤t->atime, &oldest->atime ) ) { + // Oldest access time so far + oldest = current; + } + } + current = image_release( current ); + } + declare_now; + if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) { + if ( oldest == NULL ) { + logadd( LOG_INFO, "All images are currently in use :-(" ); + } else { + logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" ); + } + return false; + } + oldest = image_lock( oldest ); + if ( oldest == NULL ) continue; // Image freed in the meantime? 
Try again + logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid ); + char *filename = strdup( oldest->path ); + oldest = image_remove( oldest ); + oldest = image_release( oldest ); + unlink( filename ); + size_t len = strlen( filename ) + 10; + char buffer[len]; + snprintf( buffer, len, "%s.map", filename ); + unlink( buffer ); + snprintf( buffer, len, "%s.crc", filename ); + unlink( buffer ); + snprintf( buffer, len, "%s.meta", filename ); + unlink( buffer ); + free( filename ); + } + return false; +} + +void image_closeUnusedFd() +{ + int fd, i; + ticks deadline; + timing_gets( &deadline, -UNUSED_FD_TIMEOUT ); + char imgstr[300]; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = _images[i]; + if ( image == NULL ) + continue; + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) { + snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid ); + fd = image->readFd; + image->readFd = -1; + } else { + fd = -1; + } + spin_unlock( &image->lock ); + if ( fd != -1 ) { + close( fd ); + logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr ); + } + spin_lock( &imageListLock ); + } + spin_unlock( &imageListLock ); +} + +/* + void image_find_latest() + { + // Not in array or most recent rid is requested, try file system + if (revision != 0) { + // Easy case - specific RID + char + } else { + // Determine base directory where the image in question has to reside. + // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04" + // Then searchPath has to be set to "/srv/rz/ubuntu" + char searchPath[strlen(_basePath) + len + 1]; + char *lastSlash = strrchr(name, '/'); + char *baseName; // Name of the image. 
In the example above, it will be "default-13.04" + if ( lastSlash == NULL ) { + *searchPath = '\0'; + baseName = name; + } else { + char *from = name, *to = searchPath; + while (from < lastSlash) *to++ = *from++; + *to = '\0'; + baseName = lastSlash + 1; + } + // Now we have the search path in our real file system and the expected image name. + // The revision naming sceme is <IMAGENAME>.r<RID>, so if we're looking for revision 13, + // our example image has to be named default-13.04.r13 + } + } + */ |