#include "image.h"
#include "helper.h"
#include "fileutil.h"
#include "log.h"
#include "uplink.h"
#include "locks.h"
#include "integrity.h"
#include "../protocol.h"
#include "sockhelper.h"
#include "altservers.h"
#include "server.h"
#include "signal.h"

// NOTE(review): the names of the system headers were lost in this copy of the
// file (only bare "#include" directives were left). The list below is
// reconstructed from the library calls made in this file - confirm it against
// the original header list.
#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <zlib.h>

// ##########################################

dnbd3_image_t *_images[SERVER_MAX_IMAGES];
int _num_images = 0;
pthread_spinlock_t _images_lock;
static pthread_mutex_t remoteCloneLock = PTHREAD_MUTEX_INITIALIZER;
#define NAMELEN  500
#define CACHELEN 20
// One entry of the "recently tried to clone" cache used by image_getOrClone
typedef struct
{
	char name[NAMELEN];
	uint16_t rid;
	time_t deadline;
} imagecache;
static imagecache remoteCloneCache[CACHELEN];
static int remoteCloneCacheIndex = 0;

// ##########################################

static dnbd3_image_t* image_free(dnbd3_image_t *image);
static bool image_isHashBlockComplete(uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
static bool image_load_all_internal(char *base, char *path);
static bool image_load(char *base, char *path, int withUplink);
static int64_t image_pad(const char *path, const int64_t currentSize);
static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
static bool image_ensureDiskSpace(uint64_t size);

static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);

// ##########################################

/**
 * Returns true if the given image is complete.
 * An image without a cache map is reported as complete only while it is
 * flagged as working; otherwise every bit of the cache map that belongs to
 * a block of the image has to be set.
 * Does not lock the image.
 */
bool image_isComplete(dnbd3_image_t *image)
{
	assert( image != NULL );
	if ( image->cache_map == NULL ) {
		// Nothing to inspect. Previously a non-working image without a cache map
		// fell through to the loop below and dereferenced the NULL map.
		return image->working;
	}
	if ( image->filesize == 0 ) {
		return false;
	}
	const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->filesize );
	// Every byte except the last one has to be completely set
	for (int j = 0; j < map_len_bytes - 1; ++j) {
		if ( image->cache_map[j] != 0xFF ) return false;
	}
	// The last byte might need extra treatment if it does not cover 8 full blocks
	const int blocks_in_last_byte = (image->filesize >> 12) & 7;
	const uint8_t last_byte = ( blocks_in_last_byte == 0 ) ? 0xFF : (uint8_t)((1 << blocks_in_last_byte) - 1);
	return (image->cache_map[map_len_bytes - 1] & last_byte) == last_byte;
}

/**
 * Update cache-map of given image for the given byte range
 * start (inclusive) - end (exclusive)
 * If 'set' is true the blocks are marked as cached, otherwise as not cached.
 * When blocks became newly cached and the image has a crc32 list, every hash
 * block touched by the range that is now complete is handed to integrity_check.
 * Locks on: images[].lock
 */
void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set)
{
	assert( image != NULL );
	// This should always be block borders due to how the protocol works, but better be safe
	// than accidentally mark blocks as cached when they really aren't entirely cached.
	end &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
	start = (uint64_t)(start + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
	bool dirty = false;
	uint64_t pos = start;
	spin_lock( &image->lock );
	if ( image->cache_map == NULL ) {
		// Image seems already complete
		spin_unlock( &image->lock );
		logadd( LOG_DEBUG1, "image_updateCachemap with no cache_map: %s", image->path );
		return;
	}
	while ( pos < end ) {
		const int map_y = pos >> 15;
		const int map_x = (pos >> 12) & 7; // mod 8
		const uint8_t bit_mask = 1 << map_x;
		if ( set ) {
			if ( (image->cache_map[map_y] & bit_mask) == 0 ) dirty = true;
			image->cache_map[map_y] |= bit_mask;
		} else {
			image->cache_map[map_y] &= ~bit_mask;
		}
		pos += DNBD3_BLOCK_SIZE;
	}
	if ( dirty && image->crc32 != NULL ) {
		// If dirty is set, at least one of the blocks was not cached before, so queue all hash blocks
		// for checking, even though this might lead to checking some hash block again, if it was
		// already complete and the block range spanned at least two hash blocks.
		// First set start and end to borders of hash blocks
		start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1);
		end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
		pos = start;
		while ( pos < end ) {
			// The lock is dropped around integrity_check below, so the map has to be re-checked
			if ( image->cache_map == NULL ) break;
			const int block = pos / HASH_BLOCK_SIZE;
			if ( image_isHashBlockComplete( image->cache_map, block, image->filesize ) ) {
				spin_unlock( &image->lock );
				integrity_check( image, block );
				spin_lock( &image->lock );
			}
			pos += HASH_BLOCK_SIZE;
		}
	}
	spin_unlock( &image->lock );
}

/**
 * Mark image as complete by freeing the cache_map and deleting the map file on disk
 * DOES NOT LOCK ON THE IMAGE, DO SO BEFORE CALLING
 */
void image_markComplete(dnbd3_image_t *image)
{
	assert( image != NULL );
	if ( image->cache_map == NULL ) return;
	free( image->cache_map );
	image->cache_map = NULL;
	char mapfile[strlen( image->path ) + 4 + 1];
	snprintf( mapfile, sizeof(mapfile), "%s.map", image->path );
	remove( mapfile );
}

/**
 * Save cache map of every image
 */
void image_saveAllCacheMaps()
{
	for (int i = 0; i < _num_images; ++i) {
		image_saveCacheMap( _images[i] );
	}
}

/**
 * Saves the cache map of the given image to "<image path>.map".
 * Returns true on success, or if there is nothing to save (no image, no
 * cache map, image smaller than one block). Returns false if the copy of the
 * map could not be allocated, the map file could not be opened, or the map
 * could not be written completely.
 * Locks on: image.lock
 */
bool image_saveCacheMap(dnbd3_image_t *image)
{
	if ( image == NULL || image->cache_map == NULL ) return true;
	spin_lock( &image->lock );
	// Lock and get a copy of the cache map, as it could be freed by another thread that is just about to
	// figure out that this image's cache copy is complete
	if ( image->cache_map == NULL || image->filesize < DNBD3_BLOCK_SIZE ) {
		spin_unlock( &image->lock );
		return true;
	}
	const size_t size = IMGSIZE_TO_MAPBYTES(image->filesize);
	uint8_t *map = malloc( size );
	if ( map == NULL ) {
		spin_unlock( &image->lock );
		logadd( LOG_ERROR, "Out of memory while saving cache map of '%s'", image->path );
		return false;
	}
	memcpy( map, image->cache_map, size );
	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
	// cacheFd is written to and we don't hold a spinlock during I/O
	// By increasing the user count we make sure the image is not freed in the meantime
	image->users++;
	spin_unlock( &image->lock );
	assert( image->path != NULL );
	char mapfile[strlen( image->path ) + 4 + 1];
	snprintf( mapfile, sizeof(mapfile), "%s.map", image->path );

	bool success = false;
	const int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
	if ( fd >= 0 ) {
		// write() may transfer fewer bytes than requested, so loop until done or failed
		size_t done = 0;
		while ( done < size ) {
			const ssize_t ret = write( fd, map + done, size - done );
			if ( ret < 0 && errno == EINTR ) continue;
			if ( ret <= 0 ) break;
			done += (size_t)ret;
		}
		success = ( done == size );
		if ( image->cacheFd != -1 ) {
			fdatasync( image->cacheFd );
		}
		fdatasync( fd );
		close( fd );
	}
	free( map );

	spin_lock( &image->lock );
	image->users--;
	spin_unlock( &image->lock );
	return success;
}

/**
 * Get an image by name+rid. This function increases a reference counter,
 * so you HAVE TO CALL image_release for every image_get() call at some
 * point...
 * The passed name is converted to lower case in place. A revision of 0
 * selects the highest rid available for that name. Returns NULL if the name
 * is empty, starts or ends with a '/', or no matching image is found.
 * If checkIfWorking is set, an uplink is initialized for a non-working,
 * incomplete image whose file is writable, and an image whose uplink queue
 * exceeds SERVER_UPLINK_QUEUELEN_THRES is flagged as not working.
 * Locks on: _images_lock, _images[].lock
 */
dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
{
	int i;
	dnbd3_image_t *candidate = NULL;
	// Simple sanity check
	const int len = strlen( name );
	if ( len == 0 || name[len - 1] == '/' || name[0] == '/' ) return NULL ;
	// Always use lowercase name
	strtolower( name );
	// Go through array
	spin_lock( &_images_lock );
	for (i = 0; i < _num_images; ++i) {
		dnbd3_image_t * const image = _images[i];
		if ( image == NULL || strcmp( image->lower_name, name ) != 0 ) continue;
		if ( revision == image->rid ) {
			candidate = image;
			break;
		} else if ( revision == 0 && (candidate == NULL || candidate->rid < image->rid) ) {
			candidate = image;
		}
	}

	// Not found
	if ( candidate == NULL ) {
		spin_unlock( &_images_lock );
		return NULL ;
	}

	// Take the image lock before dropping the list lock, so the user count is
	// raised before anybody else can act on the image
	spin_lock( &candidate->lock );
	spin_unlock( &_images_lock );
	candidate->users++;
	spin_unlock( &candidate->lock );

	if ( !checkIfWorking ) return candidate;

	// Found, see if it works
	if ( !candidate->working && candidate->cache_map != NULL && candidate->uplink == NULL && file_isWritable( candidate->path ) ) {
		// Not working and has file + cache-map, try to init uplink (uplink_init will check if proxy mode is enabled)
		uplink_init( candidate, -1, NULL );
	} else if ( candidate->working && candidate->uplink != NULL && candidate->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
		// Too many pending uplink requests. We take that as a hint that the uplink is clogged or no working uplink server
		// exists, so "working" is changed to false for now. Should a new uplink server be found the uplink thread will
		// set this back to true some time.
		candidate->working = false;
	}
	return candidate; // Success :-)
}

/**
 * Lock the image by increasing its users count
 * Returns the image on success, NULL if it is not found in the image list
 * Every call to image_lock() needs to be followed by a call to image_release() at some point.
 * Locks on: _images_lock, _images[].lock
 */
dnbd3_image_t* image_lock(dnbd3_image_t *image)
{
	if ( image == NULL ) return NULL ;
	int i;
	spin_lock( &_images_lock );
	for (i = 0; i < _num_images; ++i) {
		if ( _images[i] == image ) {
			spin_lock( &image->lock );
			spin_unlock( &_images_lock );
			image->users++;
			spin_unlock( &image->lock );
			return image;
		}
	}
	spin_unlock( &_images_lock );
	return NULL ;
}

/**
 * Release given image. This will decrease the reference counter of the image.
* If the usage counter reaches 0 and the image is not in the images array * anymore, the image will be freed * Locks on: _images_lock, _images[].lock */ dnbd3_image_t* image_release(dnbd3_image_t *image) { if ( image == NULL ) return NULL; spin_lock( &image->lock ); assert( image->users > 0 ); image->users--; if ( image->users > 0 ) { // Still in use, do nothing spin_unlock( &image->lock ); return NULL; } spin_unlock( &image->lock ); // Getting here means we decreased the usage counter to zero // If the image is not in the images list anymore, we're // responsible for freeing it spin_lock( &_images_lock ); for (int i = 0; i < _num_images; ++i) { if ( _images[i] == image ) { // Found, do nothing spin_unlock( &_images_lock ); return NULL; } } spin_unlock( &_images_lock ); // So it wasn't in the images list anymore either, get rid of it, // but check usage count once again, since it might have been increased // after we unlocked above if ( image->users == 0 ) image_free( image ); return NULL; } /** * Remove image from images array. Only free it if it has * no active users * Locks on: _images_lock, image[].lock */ void image_remove(dnbd3_image_t *image) { spin_lock( &_images_lock ); spin_lock( &image->lock ); for (int i = _num_images - 1; i >= 0; --i) { if ( _images[i] != image ) continue; _images[i] = NULL; if ( i + 1 == _num_images ) _num_images--; } spin_unlock( &image->lock ); if ( image->users <= 0 ) image = image_free( image ); spin_unlock( &_images_lock ); } /** * Kill all uplinks */ void image_killUplinks() { int i; spin_lock( &_images_lock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == NULL ) continue; spin_lock( &_images[i]->lock ); if ( _images[i]->uplink != NULL ) { _images[i]->uplink->shutdown = true; signal_call( _images[i]->uplink->signal ); } spin_unlock( &_images[i]->lock ); } spin_unlock( &_images_lock ); } /** * Load all images in given path recursively. * Pass NULL to use path from config. 
*/ bool image_loadAll(char *path) { if ( path == NULL ) { return image_load_all_internal( _basePath, _basePath ); } return image_load_all_internal( path, path ); } /** * Free all images we have, but only if they're not in use anymore. * Locks on _images_lock * @return true if all images have been freed */ bool image_tryFreeAll() { spin_lock( &_images_lock ); for (int i = _num_images - 1; i >= 0; --i) { if ( _images[i] != NULL && _images[i]->users == 0 ) { _images[i] = image_free( _images[i] ); } if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--; } spin_unlock( &_images_lock ); return _num_images == 0; } /** * Free image. DOES NOT check if it's in use. * Indirectly locks on image.lock, uplink.queueLock */ static dnbd3_image_t* image_free(dnbd3_image_t *image) { assert( image != NULL ); // image_saveCacheMap( image ); uplink_shutdown( image ); spin_lock( &image->lock ); free( image->cache_map ); free( image->crc32 ); free( image->path ); free( image->lower_name ); spin_unlock( &image->lock ); if ( image->cacheFd != -1 ) close( image->cacheFd ); if ( image->readFd != -1 ) close( image->readFd ); spin_destroy( &image->lock ); // memset( image, 0, sizeof(*image) ); free( image ); return NULL ; } static bool image_isHashBlockComplete(uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize) { if ( cacheMap == NULL ) return true; const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; if ( end <= fileSize ) { for (uint64_t mapPos = block * HASH_BLOCK_SIZE; mapPos < end; mapPos += (DNBD3_BLOCK_SIZE * 8)) { if ( cacheMap[mapPos / (DNBD3_BLOCK_SIZE * 8)] != 0xff ) { return false; } } } else { for (uint64_t mapPos = block * HASH_BLOCK_SIZE; mapPos < fileSize; mapPos += DNBD3_BLOCK_SIZE ) { const int map_y = mapPos >> 15; const int map_x = (mapPos >> 12) & 7; // mod 8 const int mask = 1 << map_x; if ( (cacheMap[map_y] & mask) == 0 ) return false; } } return true; } /** * Load all images in the given path recursively, * consider *base the base path that 
is to be cut off */ static bool image_load_all_internal(char *base, char *path) { #define SUBDIR_LEN 120 assert( path != NULL ); assert( *path == '/' ); struct dirent *entry; DIR *dir = opendir( path ); if ( dir == NULL ) { logadd( LOG_ERROR, "Could not opendir '%s' for loading", path ); return false; } const int pathLen = strlen( path ); const int len = pathLen + SUBDIR_LEN + 1; char subpath[len]; struct stat st; while ( (entry = readdir( dir )) != NULL ) { if ( strcmp( entry->d_name, "." ) == 0 || strcmp( entry->d_name, ".." ) == 0 ) continue; if ( strlen( entry->d_name ) > SUBDIR_LEN ) { logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry->d_name, (int)SUBDIR_LEN ); continue; } if ( entry->d_name[0] == '/' || path[pathLen - 1] == '/' ) { snprintf( subpath, len, "%s%s", path, entry->d_name ); } else { snprintf( subpath, len, "%s/%s", path, entry->d_name ); } if ( stat( subpath, &st ) < 0 ) { logadd( LOG_WARNING, "stat() for '%s' failed. Ignoring....", subpath ); continue; } if ( S_ISDIR( st.st_mode )) { image_load_all_internal( base, subpath ); // Recurse } else { image_load( base, subpath, true ); // Load image if possible } } closedir( dir ); return true; #undef SUBDIR_LEN } static bool image_load(char *base, char *path, int withUplink) { static int imgIdCounter = 0; // Used to assign unique numeric IDs to images int i, revision; struct stat st; uint8_t *cache_map = NULL; uint32_t *crc32list = NULL; dnbd3_image_t *existing = NULL; int fdImage = -1; bool function_return = false; // Return false by default assert( base != NULL ); assert( path != NULL ); assert( *path == '/' ); assert( strncmp( path, base, strlen(base)) == 0 ); assert( base[strlen(base) - 1] != '/' ); assert( strlen(path) > strlen(base) ); char *lastSlash = strrchr( path, '/' ); char *fileName = lastSlash + 1; char imgName[strlen( path )]; const int fileNameLen = strlen( fileName ); // Copy virtual path (relative path in "base") char * const virtBase = path + strlen( base ) + 
1; assert( *virtBase != '/' ); char *src = virtBase, *dst = imgName; while ( src <= lastSlash ) { *dst++ = *src++; } *dst = '\0'; // Parse file name for revision if ( _vmdkLegacyMode && strend( fileName, ".vmdk" ) ) { // Easy - legacy mode, simply append full file name and set rid to 1 strcat( dst, fileName ); revision = 1; } else { // Try to parse *.r syntax for (i = fileNameLen - 1; i > 1; --i) { if ( fileName[i] < '0' || fileName[i] > '9' ) break; } if ( i == fileNameLen - 1 ) return false; if ( fileName[i] != 'r' ) return false; if ( fileName[i - 1] != '.' ) return false; revision = atoi( fileName + i + 1 ); src = fileName; while ( src < fileName + i - 1 ) { *dst++ = *src++; } *dst = '\0'; } if ( revision <= 0 ) { logadd( LOG_WARNING, "Image '%s' has invalid revision ID %d", path, revision ); goto load_error; } strtolower( imgName ); // Get pointer to already existing image if possible existing = image_get( imgName, revision, true ); // ### Now load the actual image related data ### fdImage = open( path, O_RDONLY ); if ( fdImage < 0 ) { logadd( LOG_ERROR, "Could not open '%s' for reading...", path ); goto load_error; } // Determine file size const off_t seekret = lseek( fdImage, 0, SEEK_END ); if ( seekret < 0 ) { logadd( LOG_ERROR, "Could not seek to end of file '%s'", path ); goto load_error; } else if ( seekret == 0 ) { logadd( LOG_WARNING, "Empty image file '%s'", path ); goto load_error; } uint64_t fileSize = (uint64_t)seekret; // Filesize must be multiple of 4096 if ( fileSize % DNBD3_BLOCK_SIZE != 0 ) { logadd( LOG_INFO, "Image size of '%s' is not a multiple of %d, fixing...", path, (int)DNBD3_BLOCK_SIZE ); fileSize = image_pad( path, fileSize ); if ( fileSize == 0 ) goto load_error; } // 1. Allocate memory for the cache map if the image is incomplete cache_map = image_loadCacheMap( path, fileSize ); // TODO: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) // 2. 
Load CRC-32 list of image uint32_t masterCrc; const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( fileSize ); crc32list = image_loadCrcList( path, fileSize, &masterCrc ); // Check CRC32 if ( crc32list != NULL ) { if ( !image_checkRandomBlocks( 4, fdImage, fileSize, crc32list, cache_map ) ) { logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path ); goto load_error; } } // Compare data just loaded to identical image we apparently already loaded if ( existing != NULL ) { if ( existing->filesize != fileSize ) { // Image will be replaced below logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->lower_name, (int)existing->rid ); } else if ( existing->crc32 != NULL && crc32list != NULL && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) { // Image will be replaced below logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->lower_name, (int)existing->rid ); logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." ); logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" 
); } else if ( existing->crc32 == NULL && crc32list != NULL ) { logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->lower_name, (int)existing->rid ); existing->crc32 = crc32list; existing->masterCrc32 = masterCrc; crc32list = NULL; function_return = true; goto load_error; } else if ( existing->cache_map != NULL && cache_map == NULL ) { // Just ignore that fact, if replication is really complete the cache map will be removed anyways logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->lower_name, (int)existing->rid ); function_return = true; goto load_error; } else { // Nothing changed about the existing image, so do nothing function_return = true; goto load_error; } // Remove image from images array image_release( existing ); image_remove( existing ); existing = NULL; } // Load fresh image dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); image->path = strdup( path ); image->lower_name = strdup( imgName ); image->cache_map = cache_map; image->crc32 = crc32list; image->masterCrc32 = masterCrc; image->uplink = NULL; image->filesize = fileSize; image->rid = revision; image->users = 0; image->readFd = -1; image->cacheFd = -1; image->working = (image->cache_map == NULL ); spin_init( &image->lock, PTHREAD_PROCESS_PRIVATE ); if ( stat( path, &st ) == 0 ) { image->atime = st.st_mtime; } else { image->atime = time( NULL ); } // Prevent freeing in cleanup cache_map = NULL; crc32list = NULL; // Get rid of cache map if image is complete if ( image->cache_map != NULL && image_isComplete( image ) ) { image_markComplete( image ); image->working = true; } // Image is definitely incomplete, open image file for writing, so we can update the cache if ( image->cache_map != NULL ) { image->working = false; image->cacheFd = open( path, O_WRONLY ); if ( image->cacheFd < 0 ) { // Proxy mode without disk caching is pointless, bail out image->cacheFd = -1; logadd( LOG_ERROR, "Could not open incomplete image %s for 
writing!", path ); image = image_free( image ); goto load_error; } if ( withUplink ) { uplink_init( image, -1, NULL ); } } // ### Reaching this point means loading succeeded // Add to images array spin_lock( &_images_lock ); // Now we're locked, assign unique ID to image (unique for this running server instance!) image->id = ++imgIdCounter; for (i = 0; i < _num_images; ++i) { if ( _images[i] != NULL ) continue; _images[i] = image; break; } if ( i >= _num_images ) { if ( _num_images >= SERVER_MAX_IMAGES ) { logadd( LOG_ERROR, "Cannot load image '%s': maximum number of images reached.", path ); spin_unlock( &_images_lock ); image = image_free( image ); goto load_error; } _images[_num_images++] = image; logadd( LOG_DEBUG1, "Loaded image '%s'\n", image->lower_name ); } // Keep fd for reading image->readFd = fdImage; fdImage = -1; spin_unlock( &_images_lock ); function_return = true; // Clean exit: load_error: ; if ( existing != NULL ) image_release( existing ); if ( crc32list != NULL ) free( crc32list ); if ( cache_map != NULL ) free( cache_map ); if ( fdImage != -1 ) close( fdImage ); return function_return; } static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) { uint8_t *retval = NULL; char mapFile[strlen( imagePath ) + 10 + 1]; sprintf( mapFile, "%s.map", imagePath ); int fdMap = open( mapFile, O_RDONLY ); if ( fdMap >= 0 ) { const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); retval = calloc( 1, map_size ); const ssize_t rd = read( fdMap, retval, map_size ); if ( map_size != rd ) { logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, fileSize ); // Could not read complete map, that means the rest of the image file will be considered incomplete } close( fdMap ); // Later on we check if the hash map says the image is complete } return retval; } static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc) { assert( masterCrc 
!= NULL ); uint32_t *retval = NULL; const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( fileSize ); // Currently this should only prevent accidental corruption (esp. regarding transparent proxy mode) // but maybe later on you want better security char hashFile[strlen( imagePath ) + 10 + 1]; sprintf( hashFile, "%s.crc", imagePath ); int fdHash = open( hashFile, O_RDONLY ); if ( fdHash >= 0 ) { off_t fs = lseek( fdHash, 0, SEEK_END ); if ( fs < (hashBlocks + 1) * 4 ) { logadd( LOG_WARNING, "Ignoring crc32 list for '%s' as it is too short", imagePath ); } else { if ( 0 != lseek( fdHash, 0, SEEK_SET ) ) { logadd( LOG_WARNING, "Could not seek back to beginning of '%s'", hashFile ); } else { if ( read( fdHash, masterCrc, sizeof(uint32_t) ) != 4 ) { logadd( LOG_WARNING, "Error reading first crc32 of '%s'", imagePath ); } else { retval = calloc( hashBlocks, sizeof(uint32_t) ); if ( read( fdHash, retval, hashBlocks * sizeof(uint32_t) ) != hashBlocks * (ssize_t)sizeof(uint32_t) ) { free( retval ); retval = NULL; logadd( LOG_WARNING, "Could not read crc32 list of '%s'", imagePath ); } else { uint32_t lists_crc = crc32( 0L, Z_NULL, 0 ); lists_crc = crc32( lists_crc, (Bytef*)retval, hashBlocks * sizeof(uint32_t) ); if ( lists_crc != *masterCrc ) { free( retval ); retval = NULL; logadd( LOG_WARNING, "CRC-32 of CRC-32 list mismatch. CRC-32 list of '%s' might be corrupted.", imagePath ); } } } } } close( fdHash ); } return retval; } static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map) { // This checks the first block and (up to) count - 1 random blocks for corruption // via the known crc32 list. This is very sloppy and is merely supposed to detect // accidental corruption due to broken dnbd3-proxy functionality or file system // corruption. 
assert( count > 0 ); const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( fileSize ); int blocks[count + 1]; int index = 0, j; int block; if ( image_isHashBlockComplete( cache_map, 0, fileSize ) ) blocks[index++] = 0; int tries = count * 5; // Try only so many times to find a non-duplicate complete block while ( index + 1 < count && --tries > 0 ) { block = rand() % hashBlocks; // Random block for ( j = 0; j < index; ++j ) { // Random block already in list? if ( blocks[j] == block ) goto while_end; } // Block complete? If yes, add to list if ( image_isHashBlockComplete( cache_map, block, fileSize ) ) blocks[index++] = block; while_end: ; } blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1 return image_checkBlocksCrc32( fdImage, crc32list, blocks, fileSize ); // Return result of check } /** * Create a new image with the given image name and revision id in _basePath * Returns true on success, false otherwise */ bool image_create(char *image, int revision, uint64_t size) { assert( image != NULL ); assert( size >= DNBD3_BLOCK_SIZE ); if ( revision <= 0 ) { logadd( LOG_ERROR, "revision id invalid: %d", revision ); return false; } const int PATHLEN = 2000; char path[PATHLEN], cache[PATHLEN]; char *lastSlash = strrchr( image, '/' ); if ( lastSlash == NULL ) { snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); } else { *lastSlash = '\0'; snprintf( path, PATHLEN, "%s/%s", _basePath, image ); mkdir_p( path ); *lastSlash = '/'; snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); } if ( file_isReadable( path ) ) { logadd( LOG_ERROR, "Image %s with rid %d already exists!", image, revision ); return false; } snprintf( cache, PATHLEN, "%s.map", path ); size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); const int mapsize = IMGSIZE_TO_MAPBYTES(size); // Write files int fdImage = -1, fdCache = -1; fdImage = open( path, O_RDWR | O_TRUNC | O_CREAT, 0644 ); fdCache = open( cache, O_RDWR | O_TRUNC | O_CREAT, 0644 ); 
if ( fdImage < 0 ) { logadd( LOG_ERROR, "Could not open %s for writing.", path ); goto failure_cleanup; } if ( fdCache < 0 ) { logadd( LOG_ERROR, "Could not open %s for writing.", cache ); goto failure_cleanup; } // Try cache map first if ( !file_alloc( fdCache, 0, mapsize ) ) { const int err = errno; logadd( LOG_ERROR, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err ); goto failure_cleanup; } // Now write image if ( !file_alloc( fdImage, 0, size ) ) { const int err = errno; logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, err ); goto failure_cleanup; } close( fdImage ); close( fdCache ); return true; // failure_cleanup: ; if ( fdImage >= 0 ) close( fdImage ); if ( fdCache >= 0 ) close( fdCache ); remove( path ); remove( cache ); return false; } /** * Does the same as image_get, but if the image is not found locally, * it will try to clone it from an authoritative dnbd3 server and return the * image. If the return value is not NULL, image_release needs to be called * on the image at some point. * Locks on: remoteCloneLock, _images_lock, _images[].lock */ dnbd3_image_t* image_getOrClone(char *name, uint16_t revision) { if ( !_isProxy ) return image_get( name, revision, true ); int i; const size_t len = strlen( name ); // Sanity check if ( len == 0 || name[len - 1] == '/' || name[0] == '/' ) return NULL ; // Already existing locally? dnbd3_image_t *image = image_get( name, revision, true ); if ( image != NULL ) return image; // Doesn't exist, try remote if not already tried it recently const time_t now = time( NULL ); char *cmpname = name; int useIndex = -1; if ( len >= NAMELEN ) cmpname += 1 + len - NAMELEN; pthread_mutex_lock( &remoteCloneLock ); for (i = 0; i < CACHELEN; ++i) { if ( remoteCloneCache[i].rid == revision && strcmp( cmpname, remoteCloneCache[i].name ) == 0 ) { useIndex = i; if ( remoteCloneCache[i].deadline < now ) break; pthread_mutex_unlock( &remoteCloneLock ); // Was recently checked... 
return image_get( name, revision, true ); } } // Re-check to prevent two clients at the same time triggering this image = image_get( name, revision, true ); if ( image != NULL ) { pthread_mutex_unlock( &remoteCloneLock ); return image; } // Reaching this point means we should contact an authority server serialized_buffer_t serialized; // Mark as recently checked if ( useIndex == -1 ) { useIndex = remoteCloneCacheIndex = (remoteCloneCacheIndex + 1) % CACHELEN; } remoteCloneCache[useIndex].deadline = now + SERVER_REMOTE_IMAGE_CHECK_CACHETIME; snprintf( remoteCloneCache[useIndex].name, NAMELEN, "%s", cmpname ); remoteCloneCache[useIndex].rid = revision; // Get some alt servers and try to get the image from there dnbd3_host_t servers[4]; int uplinkSock = -1; dnbd3_host_t *uplinkServer = NULL; const int count = altservers_get( servers, 4, false ); uint16_t remoteVersion, remoteRid; uint64_t remoteImageSize; for (i = 0; i < count; ++i) { int sock = sock_connect( &servers[i], 750, _uplinkTimeout ); if ( sock < 0 ) continue; if ( !dnbd3_select_image( sock, name, revision, FLAGS8_SERVER ) ) goto server_fail; char *remoteName; if ( !dnbd3_select_image_reply( &serialized, sock, &remoteVersion, &remoteName, &remoteRid, &remoteImageSize ) ) goto server_fail; if ( remoteVersion < MIN_SUPPORTED_SERVER ) goto server_fail; if ( revision != 0 && remoteRid != revision ) goto server_fail; if ( remoteImageSize < DNBD3_BLOCK_SIZE || remoteName == NULL || strcmp( name, remoteName ) != 0 ) goto server_fail; if ( remoteImageSize > SERVER_MAX_PROXY_IMAGE_SIZE ) goto server_fail; if ( !image_ensureDiskSpace( remoteImageSize ) ) goto server_fail; if ( !image_clone( sock, name, remoteRid, remoteImageSize ) ) goto server_fail; // Cloning worked :-) uplinkSock = sock; uplinkServer = &servers[i]; break; server_fail: ; close( sock ); } pthread_mutex_unlock( &remoteCloneLock ); // If everything worked out, this call should now actually return the image image = image_get( name, remoteRid, false ); 
if ( image != NULL && uplinkSock != -1 && uplinkServer != NULL ) { // If so, init the uplink and pass it the socket if ( !uplink_init( image, uplinkSock, uplinkServer ) ) close( uplinkSock ); i = 0; while ( !image->working && ++i < 100 ) usleep( 1000 ); } else if ( uplinkSock >= 0 ) { close( uplinkSock ); } return image; } /** * Prepare a cloned image: * 1. Allocate empty image file and its cache map * 2. Use passed socket to request the crc32 list and save it to disk * 3. Load the image from disk * Returns: true on success, false otherwise */ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize) { // Allocate disk space and create cache map if ( !image_create( name, revision, imageSize ) ) return false; // CRC32 const size_t len = strlen( _basePath ) + strlen( name ) + 20; char crcFile[len]; snprintf( crcFile, len, "%s/%s.r%d.crc", _basePath, name, (int)revision ); if ( !file_isReadable( crcFile ) ) { // Get crc32list from remote server size_t crc32len = IMGSIZE_TO_HASHBLOCKS(imageSize) * sizeof(uint32_t); uint32_t masterCrc; uint8_t *crc32list = malloc( crc32len ); if ( !dnbd3_get_crc32( sock, &masterCrc, crc32list, &crc32len ) ) { free( crc32list ); return false; } if ( crc32len != 0 ) { uint32_t lists_crc = crc32( 0L, Z_NULL, 0 ); lists_crc = crc32( lists_crc, (Bytef*)crc32list, crc32len ); if ( lists_crc != masterCrc ) { logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name ); } else { int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 ); write( fd, &lists_crc, sizeof(uint32_t) ); write( fd, crc32list, crc32len ); close( fd ); } } free( crc32list ); } // HACK: Chop of ".crc" to get the image file name crcFile[strlen( crcFile ) - 4] = '\0'; return image_load( _basePath, crcFile, false ); } /** * Generate the crc32 block list file for the given file. * This function wants a plain file name instead of a dnbd3_image_t, * as it can be used directly from the command line. 
*/ bool image_generateCrcFile(char *image) { int fdImage = open( image, O_RDONLY ); if ( fdImage < 0 ) { logadd( LOG_ERROR, "Could not open %s.", image ); return false; } // force size to be multiple of DNBD3_BLOCK_SIZE int64_t fileLen = lseek( fdImage, 0, SEEK_END ); if ( fileLen <= 0 ) { logadd( LOG_ERROR, "Error seeking to end, or file is empty." ); close( fdImage ); return false; } if ( fileLen % DNBD3_BLOCK_SIZE != 0 ) { logadd( LOG_WARNING, "File length is not a multiple of DNBD3_BLOCK_SIZE" ); const int64_t ret = image_pad( image, fileLen ); if ( ret < fileLen ) { logadd( LOG_ERROR, "Error appending to file in order to make it block aligned." ); close( fdImage ); return false; } logadd( LOG_INFO, "...fixed!" ); fileLen = ret; } if ( lseek( fdImage, 0, SEEK_SET ) != 0 ) { logadd( LOG_ERROR, "Seeking back to start failed." ); close( fdImage ); return false; } char crcFile[strlen( image ) + 4 + 1]; sprintf( crcFile, "%s.crc", image ); struct stat sst; if ( stat( crcFile, &sst ) == 0 ) { logadd( LOG_ERROR, "CRC File for %s already exists! Delete it first if you want to regen.", image ); close( fdImage ); return false; } int fdCrc = open( crcFile, O_RDWR | O_CREAT, 0644 ); if ( fdCrc < 0 ) { logadd( LOG_ERROR, "Could not open CRC File %s for writing..", crcFile ); close( fdImage ); return false; } // CRC of all CRCs goes first. Don't know it yet, write 4 bytes dummy data. if ( write( fdCrc, crcFile, 4 ) != 4 ) { logadd( LOG_ERROR, "Write error" ); close( fdImage ); close( fdCrc ); return false; } char buffer[80000]; // Read buffer from image bool finished = false; // end of file reached int hasSum; // unwritten (unfinished?) 
crc32 exists int blocksToGo = 0; // Count number of checksums written printf( "Generating CRC32" ); fflush( stdout ); do { // Start of a block - init uint32_t crc = crc32( 0L, Z_NULL, 0 ); int remaining = HASH_BLOCK_SIZE; hasSum = false; while ( remaining > 0 ) { const int blockSize = MIN(remaining, (int)sizeof(buffer)); const int ret = read( fdImage, buffer, blockSize ); if ( ret < 0 ) { // Error printf( "Read error\n" ); close( fdImage ); close( fdCrc ); return false; } else if ( ret == 0 ) { // EOF finished = true; break; } else { // Read something hasSum = true; crc = crc32( crc, (Bytef*)buffer, ret ); remaining -= ret; } } // Write to file if ( hasSum ) { if ( write( fdCrc, &crc, 4 ) != 4 ) { printf( "Write error\n" ); close( fdImage ); close( fdCrc ); return false; } putchar( '.' ); fflush( stdout ); blocksToGo++; } } while ( !finished ); close( fdImage ); printf( "done!" ); logadd( LOG_INFO, "Generating master-crc..." ); fflush( stdout ); // File is written - read again to calc master crc if ( lseek( fdCrc, 4, SEEK_SET ) != 4 ) { logadd( LOG_ERROR, "Could not seek to beginning of crc list in file" ); close( fdCrc ); return false; } uint32_t crc = crc32( 0L, Z_NULL, 0 ); while ( blocksToGo > 0 ) { const int numBlocks = MIN(1000, blocksToGo); if ( read( fdCrc, buffer, numBlocks * 4 ) != numBlocks * 4 ) { logadd( LOG_ERROR, "Could not re-read from crc32 file" ); close( fdCrc ); return false; } crc = crc32( crc, (Bytef*)buffer, numBlocks * 4 ); blocksToGo -= numBlocks; } if ( lseek( fdCrc, 0, SEEK_SET ) != 0 ) { logadd( LOG_ERROR, "Could not seek back to beginning of crc32 file" ); close( fdCrc ); return false; } if ( write( fdCrc, &crc, 4 ) != 4 ) { logadd( LOG_ERROR, "Could not write master crc to file" ); close( fdCrc ); return false; } logadd( LOG_INFO, "CRC-32 file successfully generated." 
); fflush( stdout ); return true; } json_t* image_fillJson() { json_t *imagesJson = json_array(); json_t *image; int i; char buffer[100] = { 0 }; spin_lock( &_images_lock ); for (i = 0; i < _num_images; ++i) { if ( _images[i] == NULL ) continue; spin_lock( &_images[i]->lock ); image = json_pack( "{sisssisIsi}", "id", _images[i]->id, "name", _images[i]->lower_name, "rid", (int) _images[i]->rid, "users", (json_int_t) _images[i]->users, "complete", image_getCompletenessEstimate( _images[i] ) ); if ( _images[i]->uplink != NULL ) { host_to_string( &_images[i]->uplink->currentServer, buffer, sizeof(buffer) ); json_object_set_new( image, "uplinkServer", json_string( buffer ) ); json_object_set_new( image, "receivedBytes", json_integer( (json_int_t) _images[i]->uplink->bytesReceived ) ); } json_array_append_new( imagesJson, image ); spin_unlock( &_images[i]->lock ); } spin_unlock( &_images_lock ); return imagesJson; } /** * Get completeness of an image in percent. Only estimated, not exact. * Returns: 0-100 * DOES NOT LOCK, so make sure to do so before calling */ int image_getCompletenessEstimate(const dnbd3_image_t * const image) { assert( image != NULL ); if ( image->cache_map == NULL ) return image->working ? 100 : 0; int i; int percent = 0; const int len = IMGSIZE_TO_MAPBYTES(image->filesize); if ( len == 0 ) return 0; for ( i = 0; i < len; ++i ) { if ( image->cache_map[i] == 0xff ) { percent += 100; } else if ( image->cache_map[i] > 0 ) { percent += 50; } } return percent / len; } /** * Check the CRC-32 of the given blocks. The array blocks is of variable length. * !! pass -1 as the last block so the function knows when to stop !! 
* Returns true or false */ bool image_checkBlocksCrc32(int fd, uint32_t *crc32list, const int *blocks, const uint64_t fileSize) { char buffer[40000]; while ( *blocks != -1 ) { uint32_t crc = crc32( 0L, Z_NULL, 0 ); int bytes = 0; const int bytesToGo = MIN( HASH_BLOCK_SIZE, fileSize - ((int64_t)*blocks * HASH_BLOCK_SIZE) ); off_t readPos = (int64_t)*blocks * HASH_BLOCK_SIZE; while ( bytes < bytesToGo ) { const int n = MIN( (int)sizeof(buffer), bytesToGo - bytes ); const int r = pread( fd, buffer, n, readPos ); if ( r <= 0 ) { logadd( LOG_WARNING, "CRC-Check: Read error (errno=%d)", errno ); return false; } crc = crc32( crc, (Bytef*)buffer, r ); bytes += r; readPos += r; } if ( crc != crc32list[*blocks] ) { logadd( LOG_WARNING, "Block %d is %x, should be %x", *blocks, crc, crc32list[*blocks] ); return false; } blocks++; } return true; } static int64_t image_pad(const char *path, const int64_t currentSize) { const int missing = DNBD3_BLOCK_SIZE - (currentSize % DNBD3_BLOCK_SIZE ); char buffer[missing]; memset( buffer, 0, missing ); int tmpFd = open( path, O_WRONLY | O_APPEND ); bool success = false; if ( tmpFd < 0 ) { logadd( LOG_WARNING, "Can't open image for writing, can't fix %s", path ); } else if ( lseek( tmpFd, currentSize, SEEK_SET ) != currentSize ) { logadd( LOG_WARNING, "lseek() failed, can't fix %s", path ); } else if ( write( tmpFd, buffer, missing ) != missing ) { logadd( LOG_WARNING, "write() failed, can't fix %s", path ); } else { success = true; } if ( tmpFd >= 0 ) close( tmpFd ); if ( success ) { return currentSize + missing; } else { return 0; } } /** * Make sure at least size bytes are available in _basePath. * Will delete old images to make room for new ones. * TODO: Store last access time of images. Currently the * last access time is reset on server restart. Thus it will * currently only delete images if server uptime is > 10 hours * Return true iff enough space is available. 
* false in random other cases, namely if the server uptime is below
 * 10 hours, if no unused image with a last access older than 24 hours
 * exists, or if the image picked for deletion cannot be locked anymore.
 * If the free disk space cannot be determined, true is returned.
 */
static bool image_ensureDiskSpace(uint64_t size)
{
	for (;;) {
		const int64_t available = file_freeDiskSpace( _basePath );
		if ( available == -1 ) {
			const int e = errno;
			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e );
			return true;
		}
		if ( (uint64_t)available > size ) return true;
		if ( dnbd3_serverUptime() < 10 * 3600 ) {
			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)),
					(int)(size / (1024 * 1024)) );
			return false;
		}
		logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
				(int)(size / (1024 * 1024)) );
		// Find least recently used image
		dnbd3_image_t *oldest = NULL;
		// Copy of the candidate's access time, taken while we hold a reference
		// to it, so the candidate is not dereferenced after image_release()
		time_t oldestAtime = 0;
		int i;
		for (i = 0; i < _num_images; ++i) {
			if ( _images[i] == NULL ) continue;
			dnbd3_image_t *current = image_lock( _images[i] );
			if ( current == NULL ) continue;
			if ( current->atime != 0 && current->users == 1 ) { // Just from the lock above
				if ( oldest == NULL || oldestAtime > current->atime ) {
					// Oldest access time so far
					oldest = current;
					oldestAtime = current->atime;
				}
			}
			image_release( current );
		}
		if ( oldest == NULL || time( NULL ) - oldestAtime < 86400 ) {
			logadd( LOG_DEBUG1, "No image is old enough :-(\n" );
			return false;
		}
		// NOTE(review): relies on image_lock() returning NULL if the image
		// went away in the meantime - confirm against its implementation
		oldest = image_lock( oldest );
		if ( oldest == NULL ) return false;
		logadd( LOG_INFO, "'%s:%d' has to go!", oldest->lower_name, (int)oldest->rid );
		// Remove image file and its companion files; best effort, then re-check free space
		unlink( oldest->path );
		size_t len = strlen( oldest->path ) + 5 + 1; // Longest suffix is ".meta"
		char buffer[len];
		snprintf( buffer, len, "%s.map", oldest->path );
		unlink( buffer );
		snprintf( buffer, len, "%s.crc", oldest->path );
		unlink( buffer );
		snprintf( buffer, len, "%s.meta", oldest->path );
		unlink( buffer );
		image_remove( oldest );
		image_release( oldest );
	}
	return false;
}

/*
 void image_find_latest()
 {
 // Not in array or most recent rid is requested, try file system
 if (revision != 0) {
 // Easy case - specific RID
 char
 } else {
 // Determine base directory where the image in question has to reside.
 // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04"
 // Then searchPath has to be set to "/srv/rz/ubuntu"
 char searchPath[strlen(_basePath) + len + 1];
 char *lastSlash = strrchr(name, '/');
 char *baseName; // Name of the image. In the example above, it will be "default-13.04"
 if ( lastSlash == NULL ) {
 *searchPath = '\0';
 baseName = name;
 } else {
 char *from = name, *to = searchPath;
 while (from < lastSlash) *to++ = *from++;
 *to = '\0';
 baseName = lastSlash + 1;
 }
 // Now we have the search path in our real file system and the expected image name.
 // The revision naming scheme is .r, so if we're looking for revision 13,
 // our example image has to be named default-13.04.r13
 }
 }
 */