Diffstat (limited to 'src/fuse/cowfile.c')
-rw-r--r-- | src/fuse/cowfile.c | 928
1 file changed, 487 insertions, 441 deletions
diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c index 8e816a2..a53b101 100644 --- a/src/fuse/cowfile.c +++ b/src/fuse/cowfile.c @@ -10,10 +10,13 @@ #include <curl/curl.h> #define UUID_STRLEN 36 +// Maximum assumed page size, in case the cow data gets transferred between different architectures +// 16k should be the largest minimum in existence (Itanium) +#define MAX_PAGE_SIZE 16384 extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ); -static const int CURRENT_COW_VERSION = 1; +static const int CURRENT_COW_VERSION = 2; static bool statStdout; static bool statFile; @@ -30,18 +33,51 @@ atomic_bool uploadLoopDone = false; // Upload loop has finished all work? static struct cow { - pthread_mutex_t l2CreateLock; - int fhm; - int fhd; - int fhs; char *metadata_mmap; l1 *l1; - l2 *firstL2; - size_t maxImageSize; - size_t l1Size; //size of l1 array - + l2 *l2; + int fdMeta; + int fdData; + int fdStats; + pthread_mutex_t l2CreateLock; } cow; +static int countOneBits( atomic_uchar *bf, int numBytes ) +{ + int bitCount = 0; + for ( int i = 0; i < numBytes; ++i ) { + unsigned char value = bf[i]; + while ( value > 0 ) { + if ( ( value & 1 ) == 1 ) { + bitCount++; + } + value >>= 1; + } + } + return bitCount; +} + +#define IS_4K_ALIGNED(v) ( ( (uint64_t)(v) & DNBD3_BLOCK_MASK ) == 0 ) + +static bool writeAll( int fd, const char *buf, size_t count, off_t offset ) +{ + while ( count > 0 ) { + ssize_t ret = pwrite( fd, buf, count, offset ); + if ( ret == (ssize_t)count ) + return true; + if ( ret == -1 ) { + if ( errno == EINTR ) + continue; + return false; + } + if ( ret == 0 ) + return false; + count -= ret; + buf += ret; + } + return true; +} + /** * @brief Computes the l1 index for an absolute file offset * @@ -83,7 +119,7 @@ static int getBitfieldOffsetBit( size_t offset ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBits( atomic_char *byte, int from, int to, bool value ) +static void setBits( atomic_uchar *byte, int64_t from, int64_t to, bool value ) { char mask = (char)( ( 255 >> ( 7 - ( to - from ) ) ) << from ); if ( value ) { @@ -101,13 +137,13 @@ static void setBits( atomic_char *byte, int from, int to, bool value ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool value ) +static void setBitsInBitfield( atomic_uchar *bitfield, int64_t from, int64_t to, bool value ) { - assert( from >= 0 || to < COW_BITFIELD_SIZE * 8 ); - int start = from / 8; - int end = to / 8; + assert( from >= 0 && to < COW_BITFIELD_SIZE * 8 ); + int64_t start = from / 8; + int64_t end = to / 8; - for ( int i = start; i <= end; i++ ) { + for ( int64_t i = start; i <= end; i++ ) { setBits( ( bitfield + i ), from - i * 8, MIN( 7, to - i * 8 ), value ); from = ( i + 1 ) * 8; } @@ -119,9 +155,9 @@ static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool val * @param bitfield of a cow_l2_entry * @param n the bit which should be checked */ -static bool checkBit( atomic_char *bitfield, int n ) +static bool checkBit( atomic_uchar *bitfield, int64_t n ) { - return ( atomic_load( ( bitfield + ( n / 8 ) ) ) >> ( n % 8 ) ) & 1; + return ( bitfield[n / 8] >> ( n % 8 ) ) & 1; } @@ -225,32 +261,50 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void * cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata; size_t len = 0; // Check if we're still in the bitfield - if ( uploadBlock->position < (size_t)metadata->bitfieldSize 
) { - size_t lenCpy = MIN( metadata->bitfieldSize - uploadBlock->position, size * nmemb ); - memcpy( ptr, uploadBlock->block->bitfield + uploadBlock->position, lenCpy ); + if ( uploadBlock->position < COW_BITFIELD_SIZE ) { + size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb ); + memcpy( ptr + uploadBlock->position, uploadBlock->bitfield + uploadBlock->position, + lenCpy ); uploadBlock->position += lenCpy; len += lenCpy; } // No elseif here, might just have crossed over... - if ( uploadBlock->position >= (size_t)metadata->bitfieldSize ) { - ssize_t wantRead = (ssize_t)MIN( - COW_DATA_CLUSTER_SIZE - ( uploadBlock->position - ( metadata->bitfieldSize ) ), - ( size * nmemb ) - len ); - off_t inClusterOffset = uploadBlock->position - metadata->bitfieldSize; - ssize_t lengthRead = pread( cow.fhd, ( ptr + len ), wantRead, uploadBlock->block->offset + inClusterOffset ); - if ( lengthRead == -1 ) { - logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); - return CURL_READFUNC_ABORT; - } - - if ( wantRead > lengthRead ) { - // fill up since last block may not be a full block - memset( ptr + len + lengthRead, 0, wantRead - lengthRead ); - // TODO what about partial read? We should know how much data there actually is... - lengthRead = wantRead; + if ( uploadBlock->position >= COW_BITFIELD_SIZE ) { + // Subtract the bitfield size from everything first + off_t inClusterOffset = uploadBlock->position - COW_BITFIELD_SIZE; + ssize_t spaceLeft = ( size * nmemb ) - len; + // Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since + // we do a lot of 4k/32k reads, but it's not that performance critical I guess... + while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) { + int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE ); + size_t readSize; + // Small performance hack: All bits one in a byte, do a 32k instead of 4k read + if ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE * 8 + && bitNumber % 8 == 0 + && uploadBlock->bitfield[bitNumber / 8] == 0xff ) { + readSize = DNBD3_BLOCK_SIZE * 8; + } else { + readSize = DNBD3_BLOCK_SIZE; + } + // Check bits in our copy, as global bitfield could change + if ( checkBit( uploadBlock->bitfield, bitNumber ) ) { + ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize, + uploadBlock->block->offset + inClusterOffset ); + if ( lengthRead == -1 ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); + return CURL_READFUNC_ABORT; + } + if ( lengthRead != (ssize_t)readSize ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with short read (%d/%d)", + (int)lengthRead, (int)readSize ); + return CURL_READFUNC_ABORT; + } + len += lengthRead; + spaceLeft -= lengthRead; + } + inClusterOffset += readSize; + uploadBlock->position += readSize; } - uploadBlock->position += lengthRead; - len += lengthRead; } return len; } @@ -280,7 +334,7 @@ bool mergeRequest() part = curl_mime_addpart( mime ); curl_mime_name( part, "originalFileSize" ); char buf[21]; - snprintf( buf, sizeof buf, "%" PRIu64, metadata->originalImageSize ); + snprintf( buf, sizeof buf, "%" PRIu64, metadata->validRemoteSize ); curl_mime_data( part, buf, CURL_ZERO_TERMINATED ); part = curl_mime_addpart( mime ); @@ -341,15 +395,15 @@ int progress_callback( void *clientp, __attribute__((unused)) curl_off_t dlTotal __attribute__((unused)) curl_off_t dlNow, __attribute__((unused)) curl_off_t ulTotal, curl_off_t ulNow ) { CURL *eh = (CURL 
*)clientp; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; - res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &curlUploadBlock ); + res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &uploadingCluster ); if ( res != CURLE_OK ) { logadd( LOG_ERROR, "ERROR" ); return 0; } - bytesUploaded += ( ulNow - curlUploadBlock->ulLast ); - curlUploadBlock->ulLast = ulNow; + bytesUploaded += ( ulNow - uploadingCluster->ulLast ); + uploadingCluster->ulLast = ulNow; return 0; } @@ -381,7 +435,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha "modifiedClusters=%" PRIu64 "\n" "idleClusters=%" PRIu64 "\n" "totalClustersUploaded=%" PRIu64 "\n" - "activeUploads=:%i\n" + "activeUploads=%i\n" "%s%s", state, inQueue, modified, idle, totalBlocksUploaded, activeUploads, COW_SHOW_UL_SPEED ? "ulspeed=" : "", @@ -398,9 +452,10 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha if ( statFile ) { // Pad with a bunch of newlines so we don't change the file size all the time - ssize_t extra = MIN( 20, sizeof(buffer) - len - 1 ); + ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 ); memset( buffer + len, '\n', extra ); - if ( pwrite( cow.fhs, buffer, len + extra, 43 ) != len ) { + lseek( cow.fdStats, 43, SEEK_SET ); + if ( write( cow.fdStats, buffer, len + extra ) != len ) { logadd( LOG_WARNING, "Could not update cow status file" ); } #ifdef COW_DUMP_BLOCK_UPLOADS @@ -412,7 +467,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha } int cmpfunc( const void *a, const void *b ) { - return (int)( ( (cow_block_upload_statistics_t *)b )->uploads - ( (cow_block_upload_statistics_t *)a )->uploads ); + return (int)( ( (cow_cluster_statistics_t *)b )->uploads - ( (cow_cluster_statistics_t *)a )->uploads ); } /** * @brief Writes all block numbers sorted by the number of uploads into the statsfile. 
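The helpers added at the top of this patch (countOneBits(), checkBit(), setBitsInBitfield(), writeAll()) and the reworked curlReadCallbackUploadBlock() make cluster uploads sparse: only the 4 KiB blocks whose bit is set in the cluster's bitfield are read from the data file and transmitted, and addUpload() further down sizes the POST body as COW_BITFIELD_SIZE plus DNBD3_BLOCK_SIZE times the number of set bits. The standalone sketch below illustrates that accounting; it is not part of the patch, and BLOCK_SIZE/BITFIELD_BYTES are illustrative stand-ins for the real header definitions.

/* Sketch only: how the per-cluster bitfield sizes a sparse upload.
 * BLOCK_SIZE and BITFIELD_BYTES are illustrative stand-ins for
 * DNBD3_BLOCK_SIZE and COW_BITFIELD_SIZE from the cow headers. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE     4096u                                 /* one dnbd3 block        */
#define BITFIELD_BYTES 1024u                                 /* one bit per block      */
#define CLUSTER_SIZE   ( BLOCK_SIZE * BITFIELD_BYTES * 8u )  /* data bytes per cluster */

/* Same idea as countOneBits() in the patch: number of locally written blocks. */
static unsigned countOneBits( const unsigned char *bf, size_t numBytes )
{
	unsigned bits = 0;
	for ( size_t i = 0; i < numBytes; ++i ) {
		unsigned char v = bf[i];
		while ( v ) {
			bits += v & 1;
			v >>= 1;
		}
	}
	return bits;
}

/* Same idea as checkBit(): was block n of this cluster written locally? */
static bool checkBit( const unsigned char *bf, size_t n )
{
	return ( bf[n / 8] >> ( n % 8 ) ) & 1;
}

int main( void )
{
	unsigned char bitfield[BITFIELD_BYTES] = { 0 };
	/* Pretend blocks 0..7 and block 100 were written to the COW data file. */
	bitfield[0] = 0xff;
	bitfield[100 / 8] |= (unsigned char)( 1u << ( 100 % 8 ) );

	/* Mirrors what addUpload() passes as CURLOPT_POSTFIELDSIZE_LARGE:
	 * the bitfield itself plus only the blocks whose bit is set. */
	uint64_t postSize = BITFIELD_BYTES + (uint64_t)BLOCK_SIZE * countOneBits( bitfield, BITFIELD_BYTES );
	printf( "upload body: %llu bytes instead of %llu\n",
			(unsigned long long)postSize, (unsigned long long)( BITFIELD_BYTES + CLUSTER_SIZE ) );

	/* The read callback walks the cluster block by block, skipping unset bits;
	 * eight consecutive set bits allow one 32 KiB read instead of eight 4 KiB reads. */
	for ( size_t bit = 0; bit < (size_t)BITFIELD_BYTES * 8; ++bit ) {
		if ( !checkBit( bitfield, bit ) )
			continue; /* never read from disk, never transmitted */
		/* a pread() from cluster->offset + bit * BLOCK_SIZE would happen here */
	}
	return 0;
}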
@@ -422,26 +477,25 @@ void dumpBlockUploads() { long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); - cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; + cow_cluster_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; uint64_t currentBlock = 0; for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { if ( cow.l1[l1Index] == -1 ) { continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); blockUploads[currentBlock].uploads = block->uploads; - blockUploads[currentBlock].blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + blockUploads[currentBlock].clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); currentBlock++; } } - qsort( blockUploads, currentBlock, sizeof( cow_block_upload_statistics_t ), cmpfunc ); - lseek( cow.fhs, 0, SEEK_END ); + qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc ); - dprintf( cow.fhs, "\n\nblocknumber: uploads\n==Block Upload Dump===\n" ); + dprintf( cow.fdStats, "\n\nclusterNumber: uploads\n==Block Upload Dump===\n" ); for ( uint64_t i = 0; i < currentBlock; i++ ) { - dprintf( cow.fhs, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].blocknumber, blockUploads[i].uploads ); + dprintf( cow.fdStats, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].clusterNumber, blockUploads[i].uploads ); } } @@ -449,29 +503,32 @@ void dumpBlockUploads() * @brief Starts the upload of a given block. * * @param cm Curl_multi - * @param curlUploadBlock containing the data for the block to upload. + * @param uploadingCluster containing the data for the block to upload. */ -bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_slist *headers ) +bool addUpload( CURLM *cm, cow_curl_read_upload_t *uploadingCluster, struct curl_slist *headers ) { CURL *eh = curl_easy_init(); char url[COW_URL_STRING_SIZE]; - snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, curlUploadBlock->blocknumber ); + snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, uploadingCluster->clusterNumber ); curl_easy_setopt( eh, CURLOPT_URL, url ); curl_easy_setopt( eh, CURLOPT_POST, 1L ); curl_easy_setopt( eh, CURLOPT_READFUNCTION, curlReadCallbackUploadBlock ); - curl_easy_setopt( eh, CURLOPT_READDATA, (void *)curlUploadBlock ); - curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)curlUploadBlock ); + curl_easy_setopt( eh, CURLOPT_READDATA, (void *)uploadingCluster ); + curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)uploadingCluster ); // min upload speed of 1kb/s over 10 sec otherwise the upload is canceled. 
curl_easy_setopt( eh, CURLOPT_LOW_SPEED_TIME, 10L ); curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L ); - curl_easy_setopt( - eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_DATA_CLUSTER_SIZE ) ); + curl_easy_setopt( eh, CURLOPT_POSTFIELDSIZE_LARGE, + (long)( COW_BITFIELD_SIZE + + DNBD3_BLOCK_SIZE * countOneBits( uploadingCluster->bitfield, COW_BITFIELD_SIZE ) ) + ); + if ( COW_SHOW_UL_SPEED ) { - curlUploadBlock->ulLast = 0; + uploadingCluster->ulLast = 0; curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L ); curl_easy_setopt( eh, CURLOPT_XFERINFOFUNCTION, progress_callback ); curl_easy_setopt( eh, CURLOPT_XFERINFODATA, eh ); @@ -495,35 +552,35 @@ bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_ bool finishUpload( CURLM *cm, CURLMsg *msg, struct curl_slist *headers ) { bool status = true; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; CURLcode res2; - res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &curlUploadBlock ); + res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &uploadingCluster ); long http_code = 0; res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code ); if ( res != CURLE_OK || res2 != CURLE_OK || http_code < 200 || http_code >= 300 || msg->msg != CURLMSG_DONE ) { - curlUploadBlock->fails++; - logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", curlUploadBlock->fails, + uploadingCluster->fails++; + logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", uploadingCluster->fails, curl_easy_strerror( msg->data.result ) ); - if ( curlUploadBlock->fails <= 5 ) { - addUpload( cm, curlUploadBlock, headers ); + if ( uploadingCluster->fails < 5 ) { + addUpload( cm, uploadingCluster, headers ); goto CLEANUP; } - free( curlUploadBlock ); + free( uploadingCluster ); status = false; goto CLEANUP; } // everything went ok, update timeChanged - atomic_compare_exchange_strong( &curlUploadBlock->block->timeChanged, &curlUploadBlock->time, 0 ); + atomic_compare_exchange_strong( &uploadingCluster->block->timeChanged, &uploadingCluster->time, 0 ); - curlUploadBlock->block->uploads++; + uploadingCluster->block->uploads++; totalBlocksUploaded++; - free( curlUploadBlock ); + free( uploadingCluster ); CLEANUP: curl_multi_remove_handle( cm, msg->easy_handle ); curl_easy_cleanup( msg->easy_handle ); @@ -593,7 +650,7 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) } // Now all L2 blocks for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; // Not allocated } @@ -613,10 +670,15 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) && activeUploads > 0 ); cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) ); b->block = block; - b->blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + b->clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); b->fails = 0; b->position = 0; b->time = block->timeChanged; + // Copy, so it doesn't change during upload + // when we assemble the data in curlReadCallbackUploadBlock() + for ( int i = 0; i < COW_BITFIELD_SIZE; ++i ) { + b->bitfield[i] = block->bitfield[i]; + } addUpload( cm, b, headers ); if ( !ignoreMinUploadDelay && !uploadLoop ) { goto DONE; @@ -637,7 +699,7 @@ DONE: * */ -void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) +void *cowfile_statUpdater( 
__attribute__((unused)) void *something ) { uint64_t lastUpdateTime = time( NULL ); @@ -653,7 +715,7 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; } @@ -738,12 +800,12 @@ static bool createCowStatsFile( char *path ) logadd( LOG_INFO, "%s", buffer ); } if ( statFile ) { - if ( ( cow.fhs = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdStats = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow status file. Bye.\n" ); return false; } - if ( pwrite( cow.fhs, buffer, len, 0 ) != len ) { + if ( pwrite( cow.fdStats, buffer, len, 0 ) != len ) { logadd( LOG_ERROR, "Could not write to cow status file. Bye.\n" ); return false; } @@ -770,67 +832,72 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" ); snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow meta file. Bye.\n %s \n", pathMeta ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow data file. Bye.\n" ); return false; } - - int maxPageSize = 8192; + struct stat fs; + if ( fstat( cow.fdData, &fs ) == -1 || fs.st_size != 0 ) { + logadd( LOG_ERROR, "/data file already exists and is not empty" ); + return false; + } size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t ); - cow.maxImageSize = COW_MAX_IMAGE_SIZE; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); + // Calculate how many full l2 tables we need to address COW_MAX_IMAGE_SIZE + size_t l1NumEntries = ( ( COW_MAX_IMAGE_SIZE + COW_FULL_L2_TABLE_DATA_SIZE - 1 ) + / COW_FULL_L2_TABLE_DATA_SIZE ); + // Make sure l1 and l2 are aligned to struct size + size_t sizeL1 = sizeof(cow.l1[0]); + size_t sizeL2 = sizeof(cow.l2[0]); + size_t startL1 = ( ( metaDataSizeHeader + sizeL1 - 1 ) / sizeL1 ) * sizeL1; + size_t startL2 = ( ( startL1 + l1NumEntries * sizeL1 + sizeL2 - 1 ) / sizeL2 ) * sizeL2; // size of l1 array + number of l2's * size of l2 - size_t metadata_size = cow.l1Size * sizeof( l1 ) + cow.l1Size * sizeof( l2 ); + size_t ps = getpagesize(); + size_t metaSize = ( ( startL2 + l1NumEntries * sizeof( l2 ) + ps - 1 ) / ps ) * ps; - // compute next fitting multiple of getpagesize() - size_t meta_data_start = ( ( metaDataSizeHeader + maxPageSize - 1 ) / maxPageSize ) * maxPageSize; - - size_t metadataFileSize = meta_data_start + metadata_size; - if ( ftruncate( cow.fhm, metadataFileSize ) != 0 ) { + if ( ftruncate( cow.fdMeta, metaSize ) != 0 ) { logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). 
Bye.\n", errno ); return false; } - cow.metadata_mmap = mmap( NULL, metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mmap()ing meta data, errno=%d", errno ); return false; } metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); metadata->magicValue = COW_FILE_META_MAGIC_VALUE; + metadata->imageSize = **imageSizePtr; metadata->version = CURRENT_COW_VERSION; - metadata->dataFileSize = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); - metadata->metadataFileSize = ATOMIC_VAR_INIT( metadataFileSize ); - metadata->blocksize = DNBD3_BLOCK_SIZE; - metadata->originalImageSize = **imageSizePtr; - metadata->imageSize = metadata->originalImageSize; - metadata->creationTime = time( NULL ); - *imageSizePtr = &metadata->imageSize; - metadata->metaDataStart = meta_data_start; + metadata->validRemoteSize = **imageSizePtr; + metadata->startL1 = (uint32_t)startL1; + metadata->startL2 = (uint32_t)startL2; metadata->bitfieldSize = COW_BITFIELD_SIZE; - metadata->maxImageSize = cow.maxImageSize; - snprintf( metadata->imageName, 200, "%s", image_Name ); - cow.l1 = (l1 *)( cow.metadata_mmap + meta_data_start ); metadata->nextL2 = 0; + metadata->metaSize = ATOMIC_VAR_INIT( metaSize ); + metadata->nextClusterOffset = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); + metadata->maxImageSize = COW_MAX_IMAGE_SIZE; + metadata->creationTime = time( NULL ); + snprintf( metadata->imageName, 200, "%s", image_Name ); - for ( size_t i = 0; i < cow.l1Size; i++ ) { + cow.l1 = (l1 *)( cow.metadata_mmap + startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + startL2 ); + for ( size_t i = 0; i < l1NumEntries; i++ ) { cow.l1[i] = -1; } - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); // write header to data file uint64_t header = COW_FILE_DATA_MAGIC_VALUE; - if ( pwrite( cow.fhd, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + if ( pwrite( cow.fdData, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" ); return false; } @@ -848,6 +915,7 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, return false; } createCowStatsFile( path ); + *imageSizePtr = &metadata->imageSize; return true; } @@ -871,11 +939,11 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow data file. 
Bye.\n" ); return false; } @@ -885,7 +953,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server size_t sizeToRead = sizeof( cowfile_metadata_header_t ); size_t readBytes = 0; while ( readBytes < sizeToRead ) { - ssize_t bytes = pread( cow.fhm, ( ( &header ) + readBytes ), sizeToRead, 0 ); + ssize_t bytes = pread( cow.fdMeta, ( ( &header ) + readBytes ), sizeToRead - readBytes, 0 ); if ( bytes <= 0 ) { logadd( LOG_ERROR, "Error while reading meta file header. Bye.\n" ); return false; @@ -902,44 +970,55 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server logadd( LOG_ERROR, "cow meta file of unkown format. Bye.\n" ); return false; } + + if ( header.bitfieldSize != COW_BITFIELD_SIZE ) { + logadd( LOG_ERROR, "cow meta file has unexpected bitfield size %d", (int)header.bitfieldSize ); + return false; + } + if ( header.startL1 >= header.startL2 || header.startL2 >= header.metaSize ) { + logadd( LOG_ERROR, "l1/l2 offset messed up in metadata." ); + return false; + } + struct stat st; - fstat( cow.fhm, &st ); - if ( st.st_size < (off_t)( header.metaDataStart + header.nextL2 * sizeof( l2 ) ) ) { - logadd( LOG_ERROR, "cow meta file to small. Bye.\n" ); + fstat( cow.fdMeta, &st ); + if ( st.st_size < (off_t)header.metaSize ) { + logadd( LOG_ERROR, "cow meta file too small. Bye." ); return false; } } { uint64_t magicValueDataFile; - if ( pread( cow.fhd, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { - logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye.\n" ); + if ( pread( cow.fdData, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye." ); return false; } if ( magicValueDataFile != COW_FILE_DATA_MAGIC_VALUE ) { if ( __builtin_bswap64( magicValueDataFile ) == COW_FILE_DATA_MAGIC_VALUE ) { - logadd( LOG_ERROR, "cow data file of wrong endianess. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of wrong endianess. Bye." ); return false; } - logadd( LOG_ERROR, "cow data file of unkown format. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of unkown format. Bye." ); return false; } struct stat st; - fstat( cow.fhd, &st ); - if ( (off_t)header.dataFileSize > st.st_size ) { - logadd( LOG_ERROR, "cow data file to small. Bye.\n" ); + fstat( cow.fdData, &st ); // add cluster size, since we don't preallocate + if ( header.nextClusterOffset > st.st_size + (int)COW_DATA_CLUSTER_SIZE ) { + logadd( LOG_ERROR, "cow data file too small. Expected=%jd, Is=%jd.", + (intmax_t)header.nextClusterOffset, (intmax_t)st.st_size ); return false; } } - cow.metadata_mmap = mmap( NULL, header.metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, header.metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mapping mmap, errno=%d.", errno ); return false; } if ( header.version != CURRENT_COW_VERSION ) { - logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.\n", + logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. 
Bye.", metadata->version, CURRENT_COW_VERSION ); return false; } @@ -948,11 +1027,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); *imageSizePtr = &metadata->imageSize; - cow.l1 = (l1 *)( cow.metadata_mmap + metadata->metaDataStart ); - cow.maxImageSize = metadata->maxImageSize; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); - - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); + cow.l1 = (l1 *)( cow.metadata_mmap + metadata->startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + metadata->startL2 ); pthread_mutex_init( &cow.l2CreateLock, NULL ); createCowStatsFile( path ); return true; @@ -961,8 +1037,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server * @brief Starts the cow BackgroundThreads which are needed for stats and data upload * */ -bool cowfile_startBackgroundThreads() { - +bool cowfile_startBackgroundThreads() +{ if( pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL ) != 0 ) { logadd( LOG_ERROR, "Could not create cow uploader thread"); return false; @@ -977,55 +1053,15 @@ bool cowfile_startBackgroundThreads() { } /** - * @brief writes the given data in the data file - * - * @param buffer containing the data - * @param size of the buffer - * @param netSize which actually contributes to the fuse write request (can be different from size if partial full blocks are written) - * @param cowRequest <---- !???? TODO - * @param block block being written to - * @param inClusterOffset offset in this cluster to be written to - */ -static void writeData( const char *buffer, ssize_t size, size_t netSize, atomic_int *errorCode, - atomic_size_t *bytesWorkedOn, cow_l2_entry_t *block, off_t inClusterOffset ) -{ - // TODO: Assert that size + inClusterOffset <= COW_DATA_CLUSTER_SIZE? - ssize_t totalBytesWritten = 0; - while ( totalBytesWritten < size ) { - ssize_t bytesWritten = pwrite( cow.fhd, ( buffer + totalBytesWritten ), size - totalBytesWritten, - block->offset + inClusterOffset + totalBytesWritten ); - if ( bytesWritten == -1 ) { - *errorCode = errno; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } else if ( bytesWritten == 0 ) { - *errorCode = EIO; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } - totalBytesWritten += bytesWritten; - } - atomic_fetch_add( bytesWorkedOn, netSize ); - setBitsInBitfield( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), - (int)( ( inClusterOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 ); - - block->timeChanged = time( NULL ); -} - -/** - * @brief Increases the metadata->dataFileSize by COW_DATA_CLUSTER_SIZE. - * The space is not reserved on disk. - * - * @param block for which the space should be reserved. + * Check if block at given offset is local, i.e. has been modified. 
+ * @param meta The cow_l2_entry for the according cluster MUST be provided + * @param offset offset of data, can be absolute image offset as it will be transformed into cluster offset */ -static bool allocateMetaBlockData( cow_l2_entry_t *block ) +static bool isBlockLocal( cow_l2_entry_t *meta, off_t offset ) { - block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_DATA_CLUSTER_SIZE ); - return true; + if ( meta == NULL ) + return false; + return checkBit( meta->bitfield, ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE ); } /** @@ -1036,34 +1072,38 @@ static bool allocateMetaBlockData( cow_l2_entry_t *block ) * @param l2Index * @return cow_l2_entry_t* */ -static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index ) +static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index, bool create ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + if ( cow.l1[l1Index] == -1 ) + return NULL; + cow_l2_entry_t *block = cow.l2[cow.l1[l1Index]] + l2Index; if ( block->offset == -1 ) { - allocateMetaBlockData( block ); + if ( !create ) + return NULL; + block->offset = atomic_fetch_add( &metadata->nextClusterOffset, COW_DATA_CLUSTER_SIZE ); } return block; } /** - * @brief creates an new L2 Block and initializes the containing cow_l2_entry_t blocks + * @brief creates an new L2 table and initializes the containing cow_l2_entry_t * * @param l1Index */ -static bool createL2Block( int l1Index ) +static bool createL2Table( int l1Index ) { pthread_mutex_lock( &cow.l2CreateLock ); if ( cow.l1[l1Index] == -1 ) { + int idx = metadata->nextL2++; for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) { - cow.firstL2[metadata->nextL2][i].offset = -1; - cow.firstL2[metadata->nextL2][i].timeChanged = ATOMIC_VAR_INIT( 0 ); - cow.firstL2[metadata->nextL2][i].uploads = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].offset = -1; + cow.l2[idx][i].timeChanged = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].uploads = ATOMIC_VAR_INIT( 0 ); for ( int j = 0; j < COW_BITFIELD_SIZE; j++ ) { - cow.firstL2[metadata->nextL2][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); } } - cow.l1[l1Index] = metadata->nextL2; - metadata->nextL2 += 1; + cow.l1[l1Index] = idx; } pthread_mutex_unlock( &cow.l2CreateLock ); return true; @@ -1080,13 +1120,19 @@ static bool createL2Block( int l1Index ) static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) { + if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) != 1 ) + return; // More sub-requests are pending, bail out if ( cowRequest->errorCode != 0 ) { fuse_reply_err( req, cowRequest->errorCode ); - } else { - uint64_t oldSize = metadata->imageSize; - uint64_t ns = MAX( oldSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset ); - atomic_compare_exchange_strong( &metadata->imageSize, &oldSize, ns ); + uint64_t newSize = cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset; + if ( newSize > metadata->imageSize ) { + uint64_t oldSize; + do { + oldSize = metadata->imageSize; + newSize = MAX( oldSize, newSize ); + } while ( !atomic_compare_exchange_weak( &metadata->imageSize, &oldSize, newSize ) ); + } fuse_reply_write( req, cowRequest->bytesWorkedOn ); } free( cowRequest ); @@ -1100,67 +1146,104 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) */ static void writePaddedBlock( cow_sub_request_t *sRequest ) { - //copy write Data - // TODO Assert that we have enough space in writeBuffer at that offset - memcpy( ( sRequest->writeBuffer + ( sRequest->inClusterOffset % 
DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc, - sRequest->size ); - writeData( sRequest->writeBuffer, DNBD3_BLOCK_SIZE, (ssize_t)sRequest->size, &sRequest->cowRequest->errorCode, - &sRequest->cowRequest->bytesWorkedOn, sRequest->block, - ( sRequest->inClusterOffset - ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ) ); - - - if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); + assert( ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) + sRequest->size <= DNBD3_BLOCK_SIZE ); + // Here, we again check if the block is written locally - there might have been a second write + // that wrote the full block, hence didn't have to wait for remote data and finished faster. + // In that case, don't pad from remote as we'd overwrite newer data. + if ( isBlockLocal( sRequest->block, sRequest->inClusterOffset ) ) { + logadd( LOG_INFO, "It happened!" ); + } else { + // copy write Data + // writeBuffer is the received data, patch data from fuse write into it + memcpy( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ), sRequest->writeSrc, + sRequest->size ); + if ( !writeAll( cow.fdData, sRequest->writeBuffer, DNBD3_BLOCK_SIZE, + sRequest->block->offset + ( sRequest->inClusterOffset & ~DNBD3_BLOCK_MASK ) ) ) { + sRequest->cowRequest->errorCode = errno; + } else { + sRequest->cowRequest->bytesWorkedOn += sRequest->size; + int64_t bit = sRequest->inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( sRequest->block->bitfield, bit, bit, true ); + sRequest->block->timeChanged = time( NULL ); + } } + + finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); free( sRequest ); } /** * @brief If a block does not start or finish on an multiple of DNBD3_BLOCK_SIZE, the blocks need to be * padded. If this block is inside the original image size, the padding data will be read from the server. - * Otherwise it will be padded with 0 since the it must be the block at the end of the image. - * TODO: Properly document the arguments and what value range they can be, i.e. see below for the 4k case - * + * Otherwise it will be padded with 0 since the it must be a block after the end of the image. + * @param req fuse_req_t + * @param cowRequest cow_request_t + * @param startOffset Absolute offset where the real data starts + * @param endOffset Absolute offset where the real data ends + * @param srcBuffer pointer to the data that needs to be padded, ie. data from user space. */ -static void padBlockFromRemote( fuse_req_t req, off_t offset, cow_request_t *cowRequest, const char *buffer, - size_t size, cow_l2_entry_t *block, off_t inClusterOffset ) +static bool padBlockForWrite( fuse_req_t req, cow_request_t *cowRequest, + off_t startOffset, off_t endOffset, const char *srcBuffer ) { - // TODO: Is this *guaranteed* to be the case on the caller site? 
Add comment to ^ - assert( ( offset % DNBD3_BLOCK_SIZE ) + size <= DNBD3_BLOCK_SIZE ); - if ( offset >= (off_t)metadata->originalImageSize ) { - // Writing past the end of the image - inClusterOffset -= inClusterOffset % DNBD3_BLOCK_SIZE; - char buf[DNBD3_BLOCK_SIZE] = { 0 }; - memcpy( buf + ( offset % DNBD3_BLOCK_SIZE ), buffer, size ); - // At this point we should have a 4k block with user-space data to write, and possibly - // zero-padding at start and/or end - - writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, - block, inClusterOffset ); - return; - } - // Need to fetch padding from upstream - cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, 1 ); - sRequest->callback = writePaddedBlock; - sRequest->inClusterOffset = inClusterOffset; - sRequest->block = block; - sRequest->size = size; - sRequest->writeSrc = buffer; - sRequest->cowRequest = cowRequest; - - sRequest->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, metadata->originalImageSize - offset ); - sRequest->dRequest.offset = offset - ( offset % DNBD3_BLOCK_SIZE ); - sRequest->dRequest.fuse_req = req; - - atomic_fetch_add( &cowRequest->workCounter, 1 ); - if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; - if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); + // Make sure we pad exactly one block + endOffset = MIN( (uint64_t)endOffset, ( startOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK ); + assert( startOffset < endOffset ); + size_t size = (size_t)( endOffset - startOffset ); + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + off_t inClusterOffset = startOffset % COW_DATA_CLUSTER_SIZE; + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + if ( isBlockLocal( cluster, startOffset ) ) { + // No padding at all, keep existing data + bool ret = writeAll( cow.fdData, srcBuffer, size, cluster->offset + inClusterOffset ); + if ( ret ) { + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); } - free( sRequest ); - return; + return ret; + } + // Not local, need some form of padding + createL2Table( l1Index ); + if ( cluster == NULL ) { + cluster = getL2Entry( l1Index, l2Index, true ); + } + uint64_t validImageSize = metadata->validRemoteSize; // As we don't lock + if ( startOffset >= (off_t)validImageSize ) { + // After end of remote valid data, pad with zeros entirely + char buf[DNBD3_BLOCK_SIZE] = {0}; + off_t start = startOffset % DNBD3_BLOCK_SIZE; + assert( start + size <= DNBD3_BLOCK_SIZE ); + memcpy( buf + start, srcBuffer, size ); + bool ret = writeAll( cow.fdData, buf, DNBD3_BLOCK_SIZE, + cluster->offset + ( inClusterOffset & ~DNBD3_BLOCK_MASK ) ); + if ( ret ) { + int64_t bit = inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, bit, bit, true ); + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); + } + return ret; + } + // Need to fetch padding from upstream, allocate struct plus one block + cow_sub_request_t *sub = calloc( sizeof( *sub ) + DNBD3_BLOCK_SIZE, 1 ); + sub->callback = writePaddedBlock; + sub->inClusterOffset = inClusterOffset; + sub->block = cluster; + sub->size = size; + sub->writeSrc = srcBuffer; + sub->cowRequest = cowRequest; + + sub->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, validImageSize - startOffset ); + sub->dRequest.offset = startOffset & 
~DNBD3_BLOCK_MASK; + sub->dRequest.fuse_req = req; + + if ( !connection_read( &sub->dRequest ) ) { + free( sub ); + errno = ENOTSOCK; + return false; } + atomic_fetch_add( &cowRequest->workCounter, 1 ); + return true; } /** @@ -1187,15 +1270,19 @@ void readRemoteData( cow_sub_request_t *sRequest ) atomic_fetch_add( &sRequest->cowRequest->bytesWorkedOn, sRequest->dRequest.length ); if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - if ( sRequest->cowRequest->bytesWorkedOn < sRequest->cowRequest->fuseRequestSize ) { - // TODO: Is this a logic bug somewhere, reagarding accounting? + if ( sRequest->cowRequest->bytesWorkedOn != sRequest->cowRequest->fuseRequestSize ) { // Because connection_read() will always return exactly as many bytes as requested, // or simply never finish. - // Otherwise, we should return EIO... - logadd( LOG_ERROR, "pad read to small\n" ); + logadd( LOG_ERROR, "BUG? Pad read has invalid size. worked on: %"PRIu64", request size: %" + PRIu64", offset: %"PRIu64, + (uint64_t)sRequest->cowRequest->bytesWorkedOn, + (uint64_t)sRequest->cowRequest->fuseRequestSize, + (uint64_t)sRequest->cowRequest->fuseRequestOffset ); + fuse_reply_err( sRequest->dRequest.fuse_req, EIO ); + } else { + fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, + sRequest->cowRequest->bytesWorkedOn ); } - fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, - sRequest->cowRequest->bytesWorkedOn ); free( sRequest->cowRequest->readBuffer ); free( sRequest->cowRequest ); } @@ -1213,69 +1300,61 @@ void readRemoteData( cow_sub_request_t *sRequest ) void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi ) { - // decrease if ( size < metadata->imageSize ) { - if ( size < metadata->originalImageSize ) { - metadata->originalImageSize = size; + // truncate file + if ( size < metadata->validRemoteSize ) { + metadata->validRemoteSize = size; } - // TODO.... so.... - // originalImageSize = smallest we have seen - // imageSize = current - // ? - - // increase } else if ( size > metadata->imageSize ) { + // grow file, pad with zeroes off_t offset = metadata->imageSize; int l1Index = offsetToL1Index( offset ); int l2Index = offsetToL2Index( offset ); int l1EndIndex = offsetToL1Index( size ); int l2EndIndex = offsetToL2Index( size ); - // special case first block TODO: What is the special case? What is happening here? 
- if ( cow.l1[l1Index] != -1 ) { - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - if ( metadata->imageSize % DNBD3_BLOCK_SIZE != 0 ) { - off_t inClusterOffset = metadata->imageSize % COW_DATA_CLUSTER_SIZE; + // Special case, first cluster through which the size change passes + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + // if the new size is inside a DNBD3_BLOCK it might still contain old data before a truncate + if ( !IS_4K_ALIGNED( metadata->imageSize ) ) { size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE ); - if ( checkBit( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - char buf[sizeToWrite]; - memset( buf, 0, sizeToWrite ); - - ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inClusterOffset ); + if ( checkBit( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE ) ) { + char buf[DNBD3_BLOCK_SIZE] = {0}; + ssize_t bytesWritten = pwrite( cow.fdData, buf, sizeToWrite, cluster->offset + inClusterOffset ); if ( bytesWritten < (ssize_t)sizeToWrite ) { fuse_reply_err( req, bytesWritten == -1 ? errno : EIO ); return; } - block->timeChanged = time( NULL ); + cluster->timeChanged = time( NULL ); offset += sizeToWrite; } } - // rest of block set bits 0 - l1Index = offsetToL1Index( offset ); - l2Index = offsetToL2Index( offset ); - block = getL2Entry( l1Index, l2Index ); - off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; - setBitsInBitfield( - block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + // all remaining bits in cluster will get set to 0 + inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + setBitsInBitfield( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE, + ( COW_BITFIELD_SIZE * 8 ) - 1, false ); + cluster->timeChanged = time( NULL ); l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; l1Index++; } } - // null all bitfields - while ( !( l1Index > l1EndIndex || ( l1Index == l1EndIndex && l2EndIndex < l2Index ) ) ) { + // normal case, if clusters exist, null bitfields + while ( l1Index < l1EndIndex || ( l1Index == l1EndIndex && l2Index <= l2EndIndex ) ) { if ( cow.l1[l1Index] == -1 ) { l1Index++; l2Index = 0; continue; } - - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - setBitsInBitfield( block->bitfield, 0, ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + memset( cluster->bitfield, 0, COW_BITFIELD_SIZE ); + cluster->timeChanged = time( NULL ); + } l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; @@ -1308,98 +1387,82 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz off_t currentOffset = offset; off_t endOffset = offset + size; + if ( !IS_4K_ALIGNED( currentOffset ) ) { + // Handle case where start is not 4k aligned + if ( !padBlockForWrite( req, cowRequest, currentOffset, endOffset, cowRequest->writeBuffer ) ) { + goto fail; + } + // Move forward to next block border + currentOffset = ( currentOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK; + } + if ( currentOffset < endOffset && !IS_4K_ALIGNED( endOffset ) ) { + // Handle case where end is not 4k aligned + off_t lastBlockStart = endOffset & ~DNBD3_BLOCK_MASK; + if ( !padBlockForWrite( req, cowRequest, lastBlockStart, endOffset, + cowRequest->writeBuffer + ( lastBlockStart - 
offset ) ) ) { + goto fail; + } + endOffset = lastBlockStart; + } + + // From here on start and end are block-aligned int l1Index = offsetToL1Index( currentOffset ); int l2Index = offsetToL2Index( currentOffset ); while ( currentOffset < endOffset ) { if ( cow.l1[l1Index] == -1 ) { - createL2Block( l1Index ); + createL2Table( l1Index ); } //loop over L2 array (metadata) while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) { - cow_l2_entry_t *metaBlock = getL2Entry( l1Index, l2Index ); - - // Calc absolute offset in image corresponding to current cluster - size_t clusterAbsoluteStartOffset = l1Index * COW_FULL_L2_TABLE_DATA_SIZE + l2Index * COW_DATA_CLUSTER_SIZE; - - size_t inClusterOffset = currentOffset - clusterAbsoluteStartOffset; - // How many bytes we can write to this cluster before crossing a boundary, or before the write request is completed + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + size_t inClusterOffset = currentOffset % COW_DATA_CLUSTER_SIZE; + // How many bytes we can write to this cluster before crossing a boundary, + // or before the write request is complete size_t bytesToWriteToCluster = MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset ); - ///////////////////////// - // lock for the half block probably needed - if ( currentOffset % DNBD3_BLOCK_SIZE != 0 - && !checkBit( metaBlock->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block has not been written locally before, and write does not start on block boundary. - // Need to fetch the first couple bytes of the block from remote before writing the block to disk. - size_t writeSize = MIN( bytesToWriteToCluster, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) ); - const char *sbuf = cowRequest->writeBuffer + ( ( currentOffset - offset ) ); - padBlockFromRemote( req, currentOffset, cowRequest, sbuf, writeSize, metaBlock, (off_t)inClusterOffset ); - currentOffset += writeSize; - continue; + if ( !writeAll( cow.fdData, cowRequest->writeBuffer + ( currentOffset - offset ), + bytesToWriteToCluster, cluster->offset + inClusterOffset ) ) { + goto fail; } - - size_t endPaddedSize = 0; // In case we need to skip over a pending pad request to remote - if ( ( currentOffset + bytesToWriteToCluster ) % DNBD3_BLOCK_SIZE != 0 - && metadata->originalImageSize > currentOffset + bytesToWriteToCluster ) { - // Write request does not end on block boundary, and ends before end of image - // End offset of this write - off_t clusterEndOffset = currentOffset + bytesToWriteToCluster; - // Start of last block of write, i.e. 
start of the last, incomplete block - off_t lastBlockStartOffset = clusterEndOffset - ( clusterEndOffset % DNBD3_BLOCK_SIZE ); - // Where that last block starts relative to its cluster - off_t inClusterBlockOffset = lastBlockStartOffset - clusterAbsoluteStartOffset; - if ( !checkBit( metaBlock->bitfield, (int)( inClusterBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block indeed not modified before, need to fetch - const char *sbuf = cowRequest->writeBuffer + ( ( lastBlockStartOffset - offset ) ); - padBlockFromRemote( req, lastBlockStartOffset, cowRequest, sbuf, clusterEndOffset - lastBlockStartOffset, metaBlock, - inClusterBlockOffset ); - - - bytesToWriteToCluster -= clusterEndOffset - lastBlockStartOffset; - endPaddedSize = clusterEndOffset - lastBlockStartOffset; - } - } - writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)bytesToWriteToCluster, - bytesToWriteToCluster, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inClusterOffset ); - + int64_t f = inClusterOffset / DNBD3_BLOCK_SIZE; + int64_t t = ( inClusterOffset + bytesToWriteToCluster - 1 ) / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, f, t, true ); + cowRequest->bytesWorkedOn += bytesToWriteToCluster; currentOffset += bytesToWriteToCluster; - // Account for skipped-over bytes - currentOffset += endPaddedSize; - - + cluster->timeChanged = time( NULL ); l2Index++; } l1Index++; l2Index = 0; } - if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( req, cowRequest ); + goto success; + +fail: + if ( cowRequest->errorCode == 0 ) { + cowRequest->errorCode = errno != 0 ? errno : EIO; } +success: + finishWriteRequest( req, cowRequest ); } /** * @brief Request data, that is not available locally, via the network. * - * @param req fuse_req_t + * @param req fuse_req_t * @param offset from the start of the file * @param size of data to request * @param buffer into which the data is to be written - * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. TODO There is no such param, but cowRequest.. + * @param cowRequest cow_request_t */ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest ) { - // edgecase: Image size got reduced before on a non block border - if ( offset + size > (long int) metadata->originalImageSize ) { // TODO How does this check if it's a non block border? - size_t padZeroSize = ( offset + size ) - metadata->originalImageSize; - off_t padZeroOffset = metadata->originalImageSize - offset; - assert( offset > 0 ); // TODO Should this be padZeroOffset? - // ... But isn't it possible that offset > originalImageSize, in which case it would be negative? - memset( ( buffer + padZeroOffset ), 0, padZeroSize ); - - atomic_fetch_add( &cowRequest->bytesWorkedOn, padZeroSize ); - } + assert( offset < (off_t)metadata->validRemoteSize ); + assert( offset + size <= (off_t)metadata->validRemoteSize ); + if ( size == 0 ) + return; + assert( size > 0 ); cow_sub_request_t *sRequest = malloc( sizeof( cow_sub_request_t ) ); sRequest->callback = readRemoteData; sRequest->dRequest.length = (uint32_t)size; @@ -1410,35 +1473,33 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer atomic_fetch_add( &cowRequest->workCounter, 1 ); if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; // TODO We set an error... 
+ cowRequest->errorCode = EIO; free( sRequest ); if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - // .... but would still report success if this happens to be the last pending sub-request!? - fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); + fuse_reply_err( req, EIO ); + free( cowRequest->readBuffer ); + free( cowRequest ); } - free( cowRequest->readBuffer ); - free( cowRequest ); - return; } } /** * @brief Get the Block Data Source object * - * @param block - * @param bitfieldOffset - * @param offset - * @return enum dataSource + * @param block + * @param bitfieldOffset + * @param offset + * @return enum dataSource */ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset ) { - if ( block != NULL && checkBit( block->bitfield, (int)bitfieldOffset ) ) { - return local; + if ( block != NULL && checkBit( block->bitfield, bitfieldOffset ) ) { + return ds_local; } - if ( offset >= (off_t)metadata->originalImageSize ) { - return zero; + if ( offset >= (off_t)metadata->validRemoteSize ) { + return ds_zero; } - return remote; + return ds_remote; } /** @@ -1450,124 +1511,109 @@ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, * @param offset offset where the read starts. * @return uint64_t Number of bytes read. */ -void cowfile_read( fuse_req_t req, size_t size, off_t offset ) +void cowfile_read( fuse_req_t req, size_t size, off_t startOffset ) { cow_request_t *cowRequest = malloc( sizeof( cow_request_t ) ); cowRequest->fuseRequestSize = size; cowRequest->bytesWorkedOn = ATOMIC_VAR_INIT( 0 ); cowRequest->workCounter = ATOMIC_VAR_INIT( 1 ); cowRequest->errorCode = ATOMIC_VAR_INIT( 0 ); - cowRequest->readBuffer = malloc( size ); - cowRequest->fuseRequestOffset = offset; - off_t lastReadOffset = offset; - off_t endOffset = offset + size; - off_t searchOffset = offset; - int l1Index = offsetToL1Index( offset ); - int l2Index = offsetToL2Index( offset ); - int bitfieldOffset = getBitfieldOffsetBit( offset ); - enum dataSource dataState; - cow_l2_entry_t *cluster = NULL; - - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } + cowRequest->readBuffer = calloc( size, 1 ); + cowRequest->fuseRequestOffset = startOffset; + off_t lastReadOffset = -1; + off_t endOffset = startOffset + size; + off_t searchOffset = startOffset; + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + int bitfieldOffset = getBitfieldOffsetBit( startOffset ); + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + enum dataSource dataState = ds_invalid; + bool flushCurrentSpan = false; // Set if we need to read the current span and start the next one + bool newSourceType = true; // Set if we're starting a new span, and the source type needs to be determined - bool doRead = false; - bool firstLoop = true; - bool updateBlock = false; while ( searchOffset < endOffset ) { - if ( firstLoop ) { - firstLoop = false; + if ( newSourceType ) { + newSourceType = false; lastReadOffset = searchOffset; - // TODO: Why is this only set on first iteration and not for every block/cluster? dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset ); } else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) { - // TODO So data source changed, but we don't update the dataState var... How can this possibly work? 
- doRead = true; + // Source type changed, obviously need to flush current span + flushCurrentSpan = true; } else { bitfieldOffset++; - } - - if ( bitfieldOffset >= COW_BITFIELD_SIZE * 8 ) { - // Advance to next cluster in current l2 table - bitfieldOffset = 0; - l2Index++; - if ( l2Index >= COW_L2_TABLE_SIZE ) { - // Advance to next l1 entry, reset l2 index - l2Index = 0; - l1Index++; - } - // Also set flag that we need to update the 'cluster' struct at the end of this iteration - // TODO: Why do we update all the values above, but not the cluster struct? We access those - // variables in the code below, so we have updated offset and index, but operate on the - // old cluster struct. How does that make sense? - updateBlock = true; - if ( dataState == local ) { - doRead = true; + // If reading from local cow file, crossing a cluster border means we need to flush + // since the next cluster might be somewhere else in the data file + if ( dataState == ds_local && bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + flushCurrentSpan = true; } } - // compute the original file offset from bitfieldOffset, l2Index and l1Index - // TODO ??? As stated above, this is using the updated values, so isn't this the next - // offset tather than original offset? - searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Index * COW_DATA_CLUSTER_SIZE + + // compute the absolute image offset from bitfieldOffset, l2Index and l1Index + // bitfieldOffset might be out of bounds here, but that doesn't matter for the calculation + searchOffset = DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE; - if ( doRead || searchOffset >= endOffset ) { - ssize_t sizeToRead = MIN( searchOffset, endOffset ); - if ( dataState == remote ) { - if ( sizeToRead > (ssize_t) metadata->originalImageSize ) { - //pad rest with 0 - memset( cowRequest->readBuffer - + ( ( lastReadOffset - offset ) + ( metadata->originalImageSize - offset ) ), - 0, sizeToRead - metadata->originalImageSize ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead - metadata->originalImageSize ); - sizeToRead = metadata->originalImageSize; + if ( flushCurrentSpan || searchOffset >= endOffset ) { + ssize_t spanEndOffset = MIN( searchOffset, endOffset ); + if ( dataState == ds_remote ) { + if ( spanEndOffset > (ssize_t)metadata->validRemoteSize ) { + // Account for bytes we leave zero, because they're beyond the (truncated) original image size + atomic_fetch_add( &cowRequest->bytesWorkedOn, spanEndOffset - metadata->validRemoteSize ); + spanEndOffset = metadata->validRemoteSize; } - sizeToRead -= lastReadOffset; - readRemote( - req, lastReadOffset, sizeToRead, cowRequest->readBuffer + ( lastReadOffset - offset ), cowRequest ); - } else if ( dataState == zero ) { - sizeToRead -= lastReadOffset; - memset( cowRequest->readBuffer + ( lastReadOffset - offset ), 0, sizeToRead ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead ); - } else { - sizeToRead -= lastReadOffset; - // Compute the offset in the data file where the read starts - off_t localRead = - cluster->offset + ( ( lastReadOffset % COW_FULL_L2_TABLE_DATA_SIZE ) % COW_DATA_CLUSTER_SIZE ); + readRemote( req, lastReadOffset, spanEndOffset - lastReadOffset, + cowRequest->readBuffer + ( lastReadOffset - startOffset ), cowRequest ); + } else if ( dataState == ds_zero ) { + // Past end of image, account for leaving them zero + ssize_t numBytes = spanEndOffset - lastReadOffset; + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else if 
( dataState == ds_local ) { + ssize_t numBytes = spanEndOffset - lastReadOffset; + // Compute the startOffset in the data file where the read starts + off_t localRead = cluster->offset + ( lastReadOffset % COW_DATA_CLUSTER_SIZE ); ssize_t totalBytesRead = 0; - while ( totalBytesRead < sizeToRead ) { - ssize_t bytesRead = - pread( cow.fhd, cowRequest->readBuffer + ( lastReadOffset - offset ), sizeToRead, localRead ); + while ( totalBytesRead < numBytes ) { + ssize_t bytesRead = pread( cow.fdData, cowRequest->readBuffer + ( lastReadOffset - startOffset ), + numBytes - totalBytesRead, localRead + totalBytesRead ); if ( bytesRead == -1 ) { cowRequest->errorCode = errno; goto fail; - } else if ( bytesRead <= 0 ) { + } else if ( bytesRead == 0 ) { + logadd( LOG_ERROR, "EOF for read at localRead=%"PRIu64", totalBR=%"PRIu64, + (uint64_t)localRead, (uint64_t)totalBytesRead ); + logadd( LOG_ERROR, "searchOffset=%"PRIu64", endOffset=%"PRIu64", imageSize=%"PRIu64, + searchOffset, endOffset, metadata->imageSize ); cowRequest->errorCode = EIO; goto fail; } totalBytesRead += bytesRead; } - atomic_fetch_add( &cowRequest->bytesWorkedOn, totalBytesRead ); + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else { + assert( 4 == 6 ); } lastReadOffset = searchOffset; - doRead = false; - firstLoop = true; + flushCurrentSpan = false; + // Since the source type changed, reset + newSourceType = true; } - - if ( updateBlock ) { - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } else { - cluster = NULL; + if ( bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + // Advance to next cluster in current l2 table + bitfieldOffset = 0; + l2Index++; + if ( l2Index >= COW_L2_TABLE_SIZE ) { + // Advance to next l1 entry, reset l2 index + l2Index = 0; + l1Index++; } - updateBlock = false; + cluster = getL2Entry( l1Index, l2Index, false ); } } fail:; if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn < size ) { - logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d)", cowRequest->errorCode ); + if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn != size ) { + logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d, workedOn: %"PRIu64", size: %"PRIu64")", + cowRequest->errorCode, (uint64_t)cowRequest->bytesWorkedOn, (uint64_t)size ); fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO ); } else { fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); |
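Both the write and the read path rely on translating an absolute image offset into an L1 index, an L2 index and a bit in the cluster bitfield (offsetToL1Index(), offsetToL2Index() and getBitfieldOffsetBit() are used throughout but not shown in this diff). The read loop reconstructs the absolute offset as DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE, so the forward mapping is plain integer division. A self-contained sketch follows, with illustrative constants standing in for the real header definitions:

/* Sketch: absolute image offset -> (l1 index, l2 index, bitfield bit).
 * All constant values are illustrative; the real ones are defined in the cow headers. */
#include <stdint.h>
#include <stdio.h>

#define DNBD3_BLOCK_SIZE            4096ull
#define COW_L2_TABLE_SIZE           1024ull                  /* clusters per L2 table          */
#define COW_BITFIELD_SIZE           1024ull                  /* bytes, one bit per block       */
#define COW_DATA_CLUSTER_SIZE       ( DNBD3_BLOCK_SIZE * COW_BITFIELD_SIZE * 8 )
#define COW_FULL_L2_TABLE_DATA_SIZE ( COW_DATA_CLUSTER_SIZE * COW_L2_TABLE_SIZE )

static int offsetToL1Index( uint64_t offset )
{
	return (int)( offset / COW_FULL_L2_TABLE_DATA_SIZE );
}

static int offsetToL2Index( uint64_t offset )
{
	return (int)( ( offset % COW_FULL_L2_TABLE_DATA_SIZE ) / COW_DATA_CLUSTER_SIZE );
}

static int getBitfieldOffsetBit( uint64_t offset )
{
	return (int)( ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE );
}

int main( void )
{
	uint64_t offset = 5ull * 1024 * 1024 * 1024 + 123456; /* some absolute image offset */
	int l1 = offsetToL1Index( offset );
	int l2 = offsetToL2Index( offset );
	int bit = getBitfieldOffsetBit( offset );
	/* Invariant used by the span loop in cowfile_read():
	 * offset == l1 * FULL_L2 + l2 * CLUSTER + bit * BLOCK + (offset % BLOCK) */
	uint64_t back = (uint64_t)l1 * COW_FULL_L2_TABLE_DATA_SIZE
			+ (uint64_t)l2 * COW_DATA_CLUSTER_SIZE
			+ (uint64_t)bit * DNBD3_BLOCK_SIZE
			+ offset % DNBD3_BLOCK_SIZE;
	printf( "l1=%d l2=%d bit=%d roundtrip=%s\n", l1, l2, bit, back == offset ? "ok" : "BUG" );
	return 0;
}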
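finishWriteRequest() now runs its reply logic only once the last sub-request finishes (workCounter reaching zero) and grows metadata->imageSize with a retrying compare-and-exchange loop, so racing writers can never shrink the size. A minimal sketch of that grow-only update pattern, using generic names rather than the project's:

/* Sketch of the lock-free, grow-only size update: the stored size may only
 * ever increase, even when several writers race. Names are generic. */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t imageSize = 1000;

static void growImageSize( uint64_t newSize )
{
	uint64_t oldSize = atomic_load( &imageSize );
	while ( oldSize < newSize ) {
		/* On failure, oldSize is reloaded with the current value and the
		 * condition is re-checked; on success we are done. */
		if ( atomic_compare_exchange_weak( &imageSize, &oldSize, newSize ) )
			break;
	}
}

int main( void )
{
	growImageSize( 5000 ); /* grows */
	growImageSize( 2000 ); /* ignored, smaller than the current size */
	printf( "imageSize=%" PRIu64 "\n", atomic_load( &imageSize ) );
	return 0;
}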