From 9bf6fbce6dfccd16bb54a9801ca287bac6950a09 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 14 Apr 2023 17:10:02 +0200 Subject: [FUSE] cow: More fixes and refactoring Don't allocate a data cluster in data file for empty l2 entries when resizing the image file. Calculating l2 offset in metadata file was broken and overlapping l1. Delete unneeded entries from cow struct. Rename a few more variables. Fix a few possible race conditions. Only upload modified blocks from cluster. Simplify cow_write() function by handling misaligned start/end first. Try to also simplify cow_read() a bit. TODO: Documentation, update the cow merger service. --- inc/dnbd3/config.h | 1 + inc/dnbd3/config/cow.h | 2 +- src/cowtest/main.c | 50 ++- src/fuse/cowfile.c | 928 ++++++++++++++++++++++++++----------------------- src/fuse/cowfile.h | 52 +-- src/fuse/main.c | 17 +- 6 files changed, 560 insertions(+), 490 deletions(-) diff --git a/inc/dnbd3/config.h b/inc/dnbd3/config.h index eb4b8b1..482bd29 100644 --- a/inc/dnbd3/config.h +++ b/inc/dnbd3/config.h @@ -39,5 +39,6 @@ // +++++ Block Device +++++ #define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END! +#define DNBD3_BLOCK_MASK ((uint64_t)4095) // NEVER CHANGE THIS OR THE WORLD WILL END! 
#endif /* CONFIG_H_ */ diff --git a/inc/dnbd3/config/cow.h b/inc/dnbd3/config/cow.h index 9ed59a0..a7f3615 100644 --- a/inc/dnbd3/config/cow.h +++ b/inc/dnbd3/config/cow.h @@ -11,7 +11,7 @@ #define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted #define COW_URL_STRING_SIZE 500 // Max string size for an url #define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file -#define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb) +#define COW_MAX_IMAGE_SIZE (1000LL * 1000LL * 1000LL * 1000LL) // Maximum size an image can have(tb*gb*mb*kb) // +++++ COW API Endpoints +++++ #define COW_API_CREATE "%s/api/file/create" #define COW_API_UPDATE "%s/api/file/update?guid=%s&clusterindex=%lu" diff --git a/src/cowtest/main.c b/src/cowtest/main.c index 38d0f16..c7da4ac 100644 --- a/src/cowtest/main.c +++ b/src/cowtest/main.c @@ -43,7 +43,7 @@ atomic_bool randomTestLoop = true; #define RND_MAX_WRITE_SIZE 4096 * 320 #define RND_TRUNCATE_PROBABILITY 5 -#define RND_UNALIGNED_WRITE_PROBABILITY 5 +#define RND_UNALIGNED_WRITE_PROBABILITY 80 #define RND_DEFAULT_MIN_SIZE_PERCENT 0.9f #define RND_DEFAULT_MAX_SIZE_PERCENT 1.1f #define BASE_DATA (char)42 @@ -97,10 +97,29 @@ bool generateTestFile( char *path, size_t size ) * @param str * @param len */ -void printCharInHexadecimal( const char *str, int len ) +void printCharInHexadecimal( const char *str, const char *got, int len ) { + int pr = 0; for ( int i = 0; i < len; ++i ) { - printf( "0x%02x ", (int)str[i] ); + if ( pr > 0 ) { + pr--; + if ( str[i] != got[i] ) { + printf( "[%02x/%02x] ", (int)str[i], (int)got[i] ); + } else { + printf( "%02x ", (int)str[i] ); + } + if ( pr == 0 ) { + printf( " .." ); + } + } else { + if ( str[i] != got[i] ) { + pr = 4; + i = MAX( -1, i - 4 ); + if ( i != -1 ) { + printf(".. 
" ); + } + } + } } printf( "\n" ); } @@ -118,12 +137,10 @@ void printCharInHexadecimal( const char *str, int len ) bool compare( char buff[], char expected[], size_t size, char errorMessage[] ) { if ( memcmp( buff, expected, size ) != 0 ) { - printf( "%s", errorMessage ); + printf( "%s\n", errorMessage ); if ( printOnError ) { - printf( "Expected: \n" ); - printCharInHexadecimal( expected, (int)size ); - printf( "Got: \n " ); - printCharInHexadecimal( buff, (int)size ); + printf( "Diff [want/got]: \n" ); + printCharInHexadecimal( expected, buff, (int)size ); } return false; } @@ -247,14 +264,14 @@ bool verifySingleBit() expected[0] = 1; if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE, 0, "SingleBit test Failed: first read to small" ) ) return false; - if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: first write not as expected" ) ) + if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: first read not as expected" ) ) return false; expected[0] = BASE_DATA; expected[DNBD3_BLOCK_SIZE / 2] = 1; if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read to small" ) ) return false; - if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second write not as expected" ) ) + if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read not as expected" ) ) return false; printf( "testSingleBit successful!\n" ); return true; @@ -426,7 +443,7 @@ bool verifyLongNonAlignedPattern() if ( !readSizeTested( fh, buffer, sizeToRead, offset, "writeLongNonAlignedPattern test Failed: read failed" ) ) { return false; } - if ( !compare( buffer, expected, sizeToRead, "writeLongNonAlignedPattern test Failed: read failed" ) ) + if ( !compare( buffer, expected, sizeToRead, "writeLongNonAlignedPattern test Failed: compare failed" ) ) return false; offset += sizeToRead; } @@ -755,7 +772,7 @@ bool verifyFinalFile( char *path ) size_t fileSize = testFileSize + 2 * 
l2Capacity; struct stat st; - stat( path, &st ); + fstat( fh, &st ); size_t size = st.st_size; if ( size != fileSize ) { printf( "verify Failed, wrong file size\n expectedSize: %zu\n got: %zu\n", fileSize, size ); @@ -1023,7 +1040,7 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS return (void*) false; } // RANDOM WRITE LOOP - printf( "Press any key to cancel\n" ); + printf( "Press Ctrl-C to stop and compare\n" ); while ( randomTestLoop ) { //select test int r = rand() % 100; @@ -1042,7 +1059,10 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS } else { // write test off_t offset = rand() % maxOffset; - size_t size = rand() % RND_MAX_WRITE_SIZE; + size_t size = ( rand() + offset ) % RND_MAX_WRITE_SIZE; + if ( size < RND_MAX_WRITE_SIZE / 2 ) { + size /= rand() % 8192; + } size = MAX( size, 1 ); if ( r > RND_TRUNCATE_PROBABILITY + RND_UNALIGNED_WRITE_PROBABILITY ) { // align to block @@ -1051,7 +1071,7 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS } generateRandomData( fhr, buf, size ); - printf( "write offset: %zu size: %zu\n", offset, size ); + printf( "write offset: %zu size: %zu r: %d\n", offset, size, r ); if ( !writeSizeTested( fhm, buf, size, offset, "failed to write on mounted image" ) ) return false; if ( !writeSizeTested( fhn, buf, size, offset, "failed to write on normal image" ) ) diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c index 8e816a2..a53b101 100644 --- a/src/fuse/cowfile.c +++ b/src/fuse/cowfile.c @@ -10,10 +10,13 @@ #include #define UUID_STRLEN 36 +// Maximum assumed page size, in case the cow data gets transferred between different architectures +// 16k should be the largest minimum in existence (Itanium) +#define MAX_PAGE_SIZE 16384 extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ); -static const int CURRENT_COW_VERSION = 1; +static const int CURRENT_COW_VERSION = 2; static bool statStdout; static 
bool statFile; @@ -30,18 +33,51 @@ atomic_bool uploadLoopDone = false; // Upload loop has finished all work? static struct cow { - pthread_mutex_t l2CreateLock; - int fhm; - int fhd; - int fhs; char *metadata_mmap; l1 *l1; - l2 *firstL2; - size_t maxImageSize; - size_t l1Size; //size of l1 array - + l2 *l2; + int fdMeta; + int fdData; + int fdStats; + pthread_mutex_t l2CreateLock; } cow; +static int countOneBits( atomic_uchar *bf, int numBytes ) +{ + int bitCount = 0; + for ( int i = 0; i < numBytes; ++i ) { + unsigned char value = bf[i]; + while ( value > 0 ) { + if ( ( value & 1 ) == 1 ) { + bitCount++; + } + value >>= 1; + } + } + return bitCount; +} + +#define IS_4K_ALIGNED(v) ( ( (uint64_t)(v) & DNBD3_BLOCK_MASK ) == 0 ) + +static bool writeAll( int fd, const char *buf, size_t count, off_t offset ) +{ + while ( count > 0 ) { + ssize_t ret = pwrite( fd, buf, count, offset ); + if ( ret == (ssize_t)count ) + return true; + if ( ret == -1 ) { + if ( errno == EINTR ) + continue; + return false; + } + if ( ret == 0 ) + return false; + count -= ret; + buf += ret; + } + return true; +} + /** * @brief Computes the l1 index for an absolute file offset * @@ -83,7 +119,7 @@ static int getBitfieldOffsetBit( size_t offset ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBits( atomic_char *byte, int from, int to, bool value ) +static void setBits( atomic_uchar *byte, int64_t from, int64_t to, bool value ) { char mask = (char)( ( 255 >> ( 7 - ( to - from ) ) ) << from ); if ( value ) { @@ -101,13 +137,13 @@ static void setBits( atomic_char *byte, int from, int to, bool value ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool value ) +static void setBitsInBitfield( atomic_uchar *bitfield, int64_t from, int64_t to, bool value ) { - assert( from >= 0 || to < COW_BITFIELD_SIZE * 8 ); - int start = from / 8; - int end = to / 8; + assert( from >= 0 && to < 
COW_BITFIELD_SIZE * 8 ); + int64_t start = from / 8; + int64_t end = to / 8; - for ( int i = start; i <= end; i++ ) { + for ( int64_t i = start; i <= end; i++ ) { setBits( ( bitfield + i ), from - i * 8, MIN( 7, to - i * 8 ), value ); from = ( i + 1 ) * 8; } @@ -119,9 +155,9 @@ static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool val * @param bitfield of a cow_l2_entry * @param n the bit which should be checked */ -static bool checkBit( atomic_char *bitfield, int n ) +static bool checkBit( atomic_uchar *bitfield, int64_t n ) { - return ( atomic_load( ( bitfield + ( n / 8 ) ) ) >> ( n % 8 ) ) & 1; + return ( bitfield[n / 8] >> ( n % 8 ) ) & 1; } @@ -225,32 +261,50 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void * cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata; size_t len = 0; // Check if we're still in the bitfield - if ( uploadBlock->position < (size_t)metadata->bitfieldSize ) { - size_t lenCpy = MIN( metadata->bitfieldSize - uploadBlock->position, size * nmemb ); - memcpy( ptr, uploadBlock->block->bitfield + uploadBlock->position, lenCpy ); + if ( uploadBlock->position < COW_BITFIELD_SIZE ) { + size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb ); + memcpy( ptr + uploadBlock->position, uploadBlock->bitfield + uploadBlock->position, + lenCpy ); uploadBlock->position += lenCpy; len += lenCpy; } // No elseif here, might just have crossed over... 
- if ( uploadBlock->position >= (size_t)metadata->bitfieldSize ) { - ssize_t wantRead = (ssize_t)MIN( - COW_DATA_CLUSTER_SIZE - ( uploadBlock->position - ( metadata->bitfieldSize ) ), - ( size * nmemb ) - len ); - off_t inClusterOffset = uploadBlock->position - metadata->bitfieldSize; - ssize_t lengthRead = pread( cow.fhd, ( ptr + len ), wantRead, uploadBlock->block->offset + inClusterOffset ); - if ( lengthRead == -1 ) { - logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); - return CURL_READFUNC_ABORT; - } - - if ( wantRead > lengthRead ) { - // fill up since last block may not be a full block - memset( ptr + len + lengthRead, 0, wantRead - lengthRead ); - // TODO what about partial read? We should know how much data there actually is... - lengthRead = wantRead; + if ( uploadBlock->position >= COW_BITFIELD_SIZE ) { + // Subtract the bitfield size from everything first + off_t inClusterOffset = uploadBlock->position - COW_BITFIELD_SIZE; + ssize_t spaceLeft = ( size * nmemb ) - len; + // Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since + // we do a lot of 4k/32k reads, but it's not that performance critical I guess... 
+ while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) { + int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE ); + size_t readSize; + // Small performance hack: All bits one in a byte, do a 32k instead of 4k read + if ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE * 8 + && bitNumber % 8 == 0 + && uploadBlock->bitfield[bitNumber / 8] == 0xff ) { + readSize = DNBD3_BLOCK_SIZE * 8; + } else { + readSize = DNBD3_BLOCK_SIZE; + } + // Check bits in our copy, as global bitfield could change + if ( checkBit( uploadBlock->bitfield, bitNumber ) ) { + ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize, + uploadBlock->block->offset + inClusterOffset ); + if ( lengthRead == -1 ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); + return CURL_READFUNC_ABORT; + } + if ( lengthRead != (ssize_t)readSize ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with short read (%d/%d)", + (int)lengthRead, (int)readSize ); + return CURL_READFUNC_ABORT; + } + len += lengthRead; + spaceLeft -= lengthRead; + } + inClusterOffset += readSize; + uploadBlock->position += readSize; } - uploadBlock->position += lengthRead; - len += lengthRead; } return len; } @@ -280,7 +334,7 @@ bool mergeRequest() part = curl_mime_addpart( mime ); curl_mime_name( part, "originalFileSize" ); char buf[21]; - snprintf( buf, sizeof buf, "%" PRIu64, metadata->originalImageSize ); + snprintf( buf, sizeof buf, "%" PRIu64, metadata->validRemoteSize ); curl_mime_data( part, buf, CURL_ZERO_TERMINATED ); part = curl_mime_addpart( mime ); @@ -341,15 +395,15 @@ int progress_callback( void *clientp, __attribute__((unused)) curl_off_t dlTotal __attribute__((unused)) curl_off_t dlNow, __attribute__((unused)) curl_off_t ulTotal, curl_off_t ulNow ) { CURL *eh = (CURL *)clientp; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; - res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, 
&curlUploadBlock ); + res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &uploadingCluster ); if ( res != CURLE_OK ) { logadd( LOG_ERROR, "ERROR" ); return 0; } - bytesUploaded += ( ulNow - curlUploadBlock->ulLast ); - curlUploadBlock->ulLast = ulNow; + bytesUploaded += ( ulNow - uploadingCluster->ulLast ); + uploadingCluster->ulLast = ulNow; return 0; } @@ -381,7 +435,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha "modifiedClusters=%" PRIu64 "\n" "idleClusters=%" PRIu64 "\n" "totalClustersUploaded=%" PRIu64 "\n" - "activeUploads=:%i\n" + "activeUploads=%i\n" "%s%s", state, inQueue, modified, idle, totalBlocksUploaded, activeUploads, COW_SHOW_UL_SPEED ? "ulspeed=" : "", @@ -398,9 +452,10 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha if ( statFile ) { // Pad with a bunch of newlines so we don't change the file size all the time - ssize_t extra = MIN( 20, sizeof(buffer) - len - 1 ); + ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 ); memset( buffer + len, '\n', extra ); - if ( pwrite( cow.fhs, buffer, len + extra, 43 ) != len ) { + lseek( cow.fdStats, 43, SEEK_SET ); + if ( write( cow.fdStats, buffer, len + extra ) != len ) { logadd( LOG_WARNING, "Could not update cow status file" ); } #ifdef COW_DUMP_BLOCK_UPLOADS @@ -412,7 +467,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha } int cmpfunc( const void *a, const void *b ) { - return (int)( ( (cow_block_upload_statistics_t *)b )->uploads - ( (cow_block_upload_statistics_t *)a )->uploads ); + return (int)( ( (cow_cluster_statistics_t *)b )->uploads - ( (cow_cluster_statistics_t *)a )->uploads ); } /** * @brief Writes all block numbers sorted by the number of uploads into the statsfile. 
@@ -422,26 +477,25 @@ void dumpBlockUploads() { long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); - cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; + cow_cluster_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; uint64_t currentBlock = 0; for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { if ( cow.l1[l1Index] == -1 ) { continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); blockUploads[currentBlock].uploads = block->uploads; - blockUploads[currentBlock].blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + blockUploads[currentBlock].clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); currentBlock++; } } - qsort( blockUploads, currentBlock, sizeof( cow_block_upload_statistics_t ), cmpfunc ); - lseek( cow.fhs, 0, SEEK_END ); + qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc ); - dprintf( cow.fhs, "\n\nblocknumber: uploads\n==Block Upload Dump===\n" ); + dprintf( cow.fdStats, "\n\nclusterNumber: uploads\n==Block Upload Dump===\n" ); for ( uint64_t i = 0; i < currentBlock; i++ ) { - dprintf( cow.fhs, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].blocknumber, blockUploads[i].uploads ); + dprintf( cow.fdStats, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].clusterNumber, blockUploads[i].uploads ); } } @@ -449,29 +503,32 @@ void dumpBlockUploads() * @brief Starts the upload of a given block. * * @param cm Curl_multi - * @param curlUploadBlock containing the data for the block to upload. + * @param uploadingCluster containing the data for the block to upload. 
*/ -bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_slist *headers ) +bool addUpload( CURLM *cm, cow_curl_read_upload_t *uploadingCluster, struct curl_slist *headers ) { CURL *eh = curl_easy_init(); char url[COW_URL_STRING_SIZE]; - snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, curlUploadBlock->blocknumber ); + snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, uploadingCluster->clusterNumber ); curl_easy_setopt( eh, CURLOPT_URL, url ); curl_easy_setopt( eh, CURLOPT_POST, 1L ); curl_easy_setopt( eh, CURLOPT_READFUNCTION, curlReadCallbackUploadBlock ); - curl_easy_setopt( eh, CURLOPT_READDATA, (void *)curlUploadBlock ); - curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)curlUploadBlock ); + curl_easy_setopt( eh, CURLOPT_READDATA, (void *)uploadingCluster ); + curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)uploadingCluster ); // min upload speed of 1kb/s over 10 sec otherwise the upload is canceled. 
curl_easy_setopt( eh, CURLOPT_LOW_SPEED_TIME, 10L ); curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L ); - curl_easy_setopt( - eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_DATA_CLUSTER_SIZE ) ); + curl_easy_setopt( eh, CURLOPT_POSTFIELDSIZE_LARGE, + (long)( COW_BITFIELD_SIZE + + DNBD3_BLOCK_SIZE * countOneBits( uploadingCluster->bitfield, COW_BITFIELD_SIZE ) ) + ); + if ( COW_SHOW_UL_SPEED ) { - curlUploadBlock->ulLast = 0; + uploadingCluster->ulLast = 0; curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L ); curl_easy_setopt( eh, CURLOPT_XFERINFOFUNCTION, progress_callback ); curl_easy_setopt( eh, CURLOPT_XFERINFODATA, eh ); @@ -495,35 +552,35 @@ bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_ bool finishUpload( CURLM *cm, CURLMsg *msg, struct curl_slist *headers ) { bool status = true; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; CURLcode res2; - res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &curlUploadBlock ); + res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &uploadingCluster ); long http_code = 0; res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code ); if ( res != CURLE_OK || res2 != CURLE_OK || http_code < 200 || http_code >= 300 || msg->msg != CURLMSG_DONE ) { - curlUploadBlock->fails++; - logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", curlUploadBlock->fails, + uploadingCluster->fails++; + logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", uploadingCluster->fails, curl_easy_strerror( msg->data.result ) ); - if ( curlUploadBlock->fails <= 5 ) { - addUpload( cm, curlUploadBlock, headers ); + if ( uploadingCluster->fails < 5 ) { + addUpload( cm, uploadingCluster, headers ); goto CLEANUP; } - free( curlUploadBlock ); + free( uploadingCluster ); status = false; goto CLEANUP; } // everything went ok, update timeChanged - atomic_compare_exchange_strong( &curlUploadBlock->block->timeChanged, 
&curlUploadBlock->time, 0 ); + atomic_compare_exchange_strong( &uploadingCluster->block->timeChanged, &uploadingCluster->time, 0 ); - curlUploadBlock->block->uploads++; + uploadingCluster->block->uploads++; totalBlocksUploaded++; - free( curlUploadBlock ); + free( uploadingCluster ); CLEANUP: curl_multi_remove_handle( cm, msg->easy_handle ); curl_easy_cleanup( msg->easy_handle ); @@ -593,7 +650,7 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) } // Now all L2 blocks for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; // Not allocated } @@ -613,10 +670,15 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) && activeUploads > 0 ); cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) ); b->block = block; - b->blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + b->clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); b->fails = 0; b->position = 0; b->time = block->timeChanged; + // Copy, so it doesn't change during upload + // when we assemble the data in curlReadCallbackUploadBlock() + for ( int i = 0; i < COW_BITFIELD_SIZE; ++i ) { + b->bitfield[i] = block->bitfield[i]; + } addUpload( cm, b, headers ); if ( !ignoreMinUploadDelay && !uploadLoop ) { goto DONE; @@ -637,7 +699,7 @@ DONE: * */ -void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) +void *cowfile_statUpdater( __attribute__((unused)) void *something ) { uint64_t lastUpdateTime = time( NULL ); @@ -653,7 +715,7 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; } @@ -738,12 +800,12 @@ 
static bool createCowStatsFile( char *path ) logadd( LOG_INFO, "%s", buffer ); } if ( statFile ) { - if ( ( cow.fhs = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdStats = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow status file. Bye.\n" ); return false; } - if ( pwrite( cow.fhs, buffer, len, 0 ) != len ) { + if ( pwrite( cow.fdStats, buffer, len, 0 ) != len ) { logadd( LOG_ERROR, "Could not write to cow status file. Bye.\n" ); return false; } @@ -770,67 +832,72 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" ); snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow meta file. Bye.\n %s \n", pathMeta ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow data file. 
Bye.\n" ); return false; } - - int maxPageSize = 8192; + struct stat fs; + if ( fstat( cow.fdData, &fs ) == -1 || fs.st_size != 0 ) { + logadd( LOG_ERROR, "/data file already exists and is not empty" ); + return false; + } size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t ); - cow.maxImageSize = COW_MAX_IMAGE_SIZE; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); + // Calculate how many full l2 tables we need to address COW_MAX_IMAGE_SIZE + size_t l1NumEntries = ( ( COW_MAX_IMAGE_SIZE + COW_FULL_L2_TABLE_DATA_SIZE - 1 ) + / COW_FULL_L2_TABLE_DATA_SIZE ); + // Make sure l1 and l2 are aligned to struct size + size_t sizeL1 = sizeof(cow.l1[0]); + size_t sizeL2 = sizeof(cow.l2[0]); + size_t startL1 = ( ( metaDataSizeHeader + sizeL1 - 1 ) / sizeL1 ) * sizeL1; + size_t startL2 = ( ( startL1 + l1NumEntries * sizeL1 + sizeL2 - 1 ) / sizeL2 ) * sizeL2; // size of l1 array + number of l2's * size of l2 - size_t metadata_size = cow.l1Size * sizeof( l1 ) + cow.l1Size * sizeof( l2 ); + size_t ps = getpagesize(); + size_t metaSize = ( ( startL2 + l1NumEntries * sizeof( l2 ) + ps - 1 ) / ps ) * ps; - // compute next fitting multiple of getpagesize() - size_t meta_data_start = ( ( metaDataSizeHeader + maxPageSize - 1 ) / maxPageSize ) * maxPageSize; - - size_t metadataFileSize = meta_data_start + metadata_size; - if ( ftruncate( cow.fhm, metadataFileSize ) != 0 ) { + if ( ftruncate( cow.fdMeta, metaSize ) != 0 ) { logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). 
Bye.\n", errno ); return false; } - cow.metadata_mmap = mmap( NULL, metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mmap()ing meta data, errno=%d", errno ); return false; } metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); metadata->magicValue = COW_FILE_META_MAGIC_VALUE; + metadata->imageSize = **imageSizePtr; metadata->version = CURRENT_COW_VERSION; - metadata->dataFileSize = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); - metadata->metadataFileSize = ATOMIC_VAR_INIT( metadataFileSize ); - metadata->blocksize = DNBD3_BLOCK_SIZE; - metadata->originalImageSize = **imageSizePtr; - metadata->imageSize = metadata->originalImageSize; - metadata->creationTime = time( NULL ); - *imageSizePtr = &metadata->imageSize; - metadata->metaDataStart = meta_data_start; + metadata->validRemoteSize = **imageSizePtr; + metadata->startL1 = (uint32_t)startL1; + metadata->startL2 = (uint32_t)startL2; metadata->bitfieldSize = COW_BITFIELD_SIZE; - metadata->maxImageSize = cow.maxImageSize; - snprintf( metadata->imageName, 200, "%s", image_Name ); - cow.l1 = (l1 *)( cow.metadata_mmap + meta_data_start ); metadata->nextL2 = 0; + metadata->metaSize = ATOMIC_VAR_INIT( metaSize ); + metadata->nextClusterOffset = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); + metadata->maxImageSize = COW_MAX_IMAGE_SIZE; + metadata->creationTime = time( NULL ); + snprintf( metadata->imageName, 200, "%s", image_Name ); - for ( size_t i = 0; i < cow.l1Size; i++ ) { + cow.l1 = (l1 *)( cow.metadata_mmap + startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + startL2 ); + for ( size_t i = 0; i < l1NumEntries; i++ ) { cow.l1[i] = -1; } - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); // write header to data file uint64_t header = 
COW_FILE_DATA_MAGIC_VALUE; - if ( pwrite( cow.fhd, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + if ( pwrite( cow.fdData, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" ); return false; } @@ -848,6 +915,7 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, return false; } createCowStatsFile( path ); + *imageSizePtr = &metadata->imageSize; return true; } @@ -871,11 +939,11 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow data file. Bye.\n" ); return false; } @@ -885,7 +953,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server size_t sizeToRead = sizeof( cowfile_metadata_header_t ); size_t readBytes = 0; while ( readBytes < sizeToRead ) { - ssize_t bytes = pread( cow.fhm, ( ( &header ) + readBytes ), sizeToRead, 0 ); + ssize_t bytes = pread( cow.fdMeta, ( ( &header ) + readBytes ), sizeToRead - readBytes, 0 ); if ( bytes <= 0 ) { logadd( LOG_ERROR, "Error while reading meta file header. Bye.\n" ); return false; @@ -902,44 +970,55 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server logadd( LOG_ERROR, "cow meta file of unkown format. 
Bye.\n" ); return false; } + + if ( header.bitfieldSize != COW_BITFIELD_SIZE ) { + logadd( LOG_ERROR, "cow meta file has unexpected bitfield size %d", (int)header.bitfieldSize ); + return false; + } + if ( header.startL1 >= header.startL2 || header.startL2 >= header.metaSize ) { + logadd( LOG_ERROR, "l1/l2 offset messed up in metadata." ); + return false; + } + struct stat st; - fstat( cow.fhm, &st ); - if ( st.st_size < (off_t)( header.metaDataStart + header.nextL2 * sizeof( l2 ) ) ) { - logadd( LOG_ERROR, "cow meta file to small. Bye.\n" ); + fstat( cow.fdMeta, &st ); + if ( st.st_size < (off_t)header.metaSize ) { + logadd( LOG_ERROR, "cow meta file too small. Bye." ); return false; } } { uint64_t magicValueDataFile; - if ( pread( cow.fhd, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { - logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye.\n" ); + if ( pread( cow.fdData, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye." ); return false; } if ( magicValueDataFile != COW_FILE_DATA_MAGIC_VALUE ) { if ( __builtin_bswap64( magicValueDataFile ) == COW_FILE_DATA_MAGIC_VALUE ) { - logadd( LOG_ERROR, "cow data file of wrong endianess. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of wrong endianess. Bye." ); return false; } - logadd( LOG_ERROR, "cow data file of unkown format. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of unkown format. Bye." ); return false; } struct stat st; - fstat( cow.fhd, &st ); - if ( (off_t)header.dataFileSize > st.st_size ) { - logadd( LOG_ERROR, "cow data file to small. Bye.\n" ); + fstat( cow.fdData, &st ); // add cluster size, since we don't preallocate + if ( header.nextClusterOffset > st.st_size + (int)COW_DATA_CLUSTER_SIZE ) { + logadd( LOG_ERROR, "cow data file too small. 
Expected=%jd, Is=%jd.", + (intmax_t)header.nextClusterOffset, (intmax_t)st.st_size ); return false; } } - cow.metadata_mmap = mmap( NULL, header.metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, header.metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mapping mmap, errno=%d.", errno ); return false; } if ( header.version != CURRENT_COW_VERSION ) { - logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.\n", + logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.", metadata->version, CURRENT_COW_VERSION ); return false; } @@ -948,11 +1027,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); *imageSizePtr = &metadata->imageSize; - cow.l1 = (l1 *)( cow.metadata_mmap + metadata->metaDataStart ); - cow.maxImageSize = metadata->maxImageSize; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); - - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); + cow.l1 = (l1 *)( cow.metadata_mmap + metadata->startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + metadata->startL2 ); pthread_mutex_init( &cow.l2CreateLock, NULL ); createCowStatsFile( path ); return true; @@ -961,8 +1037,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server * @brief Starts the cow BackgroundThreads which are needed for stats and data upload * */ -bool cowfile_startBackgroundThreads() { - +bool cowfile_startBackgroundThreads() +{ if( pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL ) != 0 ) { logadd( LOG_ERROR, "Could not create cow uploader thread"); return false; @@ -977,55 +1053,15 @@ bool cowfile_startBackgroundThreads() { } /** - * @brief writes the 
given data in the data file - * - * @param buffer containing the data - * @param size of the buffer - * @param netSize which actually contributes to the fuse write request (can be different from size if partial full blocks are written) - * @param cowRequest <---- !???? TODO - * @param block block being written to - * @param inClusterOffset offset in this cluster to be written to - */ -static void writeData( const char *buffer, ssize_t size, size_t netSize, atomic_int *errorCode, - atomic_size_t *bytesWorkedOn, cow_l2_entry_t *block, off_t inClusterOffset ) -{ - // TODO: Assert that size + inClusterOffset <= COW_DATA_CLUSTER_SIZE? - ssize_t totalBytesWritten = 0; - while ( totalBytesWritten < size ) { - ssize_t bytesWritten = pwrite( cow.fhd, ( buffer + totalBytesWritten ), size - totalBytesWritten, - block->offset + inClusterOffset + totalBytesWritten ); - if ( bytesWritten == -1 ) { - *errorCode = errno; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } else if ( bytesWritten == 0 ) { - *errorCode = EIO; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } - totalBytesWritten += bytesWritten; - } - atomic_fetch_add( bytesWorkedOn, netSize ); - setBitsInBitfield( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), - (int)( ( inClusterOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 ); - - block->timeChanged = time( NULL ); -} - -/** - * @brief Increases the metadata->dataFileSize by COW_DATA_CLUSTER_SIZE. - * The space is not reserved on disk. - * - * @param block for which the space should be reserved. + * Check if block at given offset is local, i.e. has been modified. 
+ * @param meta The cow_l2_entry for the according cluster MUST be provided + * @param offset offset of data, can be absolute image offset as it will be transformed into cluster offset */ -static bool allocateMetaBlockData( cow_l2_entry_t *block ) +static bool isBlockLocal( cow_l2_entry_t *meta, off_t offset ) { - block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_DATA_CLUSTER_SIZE ); - return true; + if ( meta == NULL ) + return false; + return checkBit( meta->bitfield, ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE ); } /** @@ -1036,34 +1072,38 @@ static bool allocateMetaBlockData( cow_l2_entry_t *block ) * @param l2Index * @return cow_l2_entry_t* */ -static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index ) +static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index, bool create ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + if ( cow.l1[l1Index] == -1 ) + return NULL; + cow_l2_entry_t *block = cow.l2[cow.l1[l1Index]] + l2Index; if ( block->offset == -1 ) { - allocateMetaBlockData( block ); + if ( !create ) + return NULL; + block->offset = atomic_fetch_add( &metadata->nextClusterOffset, COW_DATA_CLUSTER_SIZE ); } return block; } /** - * @brief creates an new L2 Block and initializes the containing cow_l2_entry_t blocks + * @brief creates an new L2 table and initializes the containing cow_l2_entry_t * * @param l1Index */ -static bool createL2Block( int l1Index ) +static bool createL2Table( int l1Index ) { pthread_mutex_lock( &cow.l2CreateLock ); if ( cow.l1[l1Index] == -1 ) { + int idx = metadata->nextL2++; for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) { - cow.firstL2[metadata->nextL2][i].offset = -1; - cow.firstL2[metadata->nextL2][i].timeChanged = ATOMIC_VAR_INIT( 0 ); - cow.firstL2[metadata->nextL2][i].uploads = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].offset = -1; + cow.l2[idx][i].timeChanged = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].uploads = ATOMIC_VAR_INIT( 0 ); for ( int j = 0; j < 
COW_BITFIELD_SIZE; j++ ) { - cow.firstL2[metadata->nextL2][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); } } - cow.l1[l1Index] = metadata->nextL2; - metadata->nextL2 += 1; + cow.l1[l1Index] = idx; } pthread_mutex_unlock( &cow.l2CreateLock ); return true; @@ -1080,13 +1120,19 @@ static bool createL2Block( int l1Index ) static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) { + if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) != 1 ) + return; // More sub-requests are pending, bail out if ( cowRequest->errorCode != 0 ) { fuse_reply_err( req, cowRequest->errorCode ); - } else { - uint64_t oldSize = metadata->imageSize; - uint64_t ns = MAX( oldSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset ); - atomic_compare_exchange_strong( &metadata->imageSize, &oldSize, ns ); + uint64_t newSize = cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset; + if ( newSize > metadata->imageSize ) { + uint64_t oldSize; + do { + oldSize = metadata->imageSize; + newSize = MAX( oldSize, newSize ); + } while ( !atomic_compare_exchange_weak( &metadata->imageSize, &oldSize, newSize ) ); + } fuse_reply_write( req, cowRequest->bytesWorkedOn ); } free( cowRequest ); @@ -1100,67 +1146,104 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) */ static void writePaddedBlock( cow_sub_request_t *sRequest ) { - //copy write Data - // TODO Assert that we have enough space in writeBuffer at that offset - memcpy( ( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc, - sRequest->size ); - writeData( sRequest->writeBuffer, DNBD3_BLOCK_SIZE, (ssize_t)sRequest->size, &sRequest->cowRequest->errorCode, - &sRequest->cowRequest->bytesWorkedOn, sRequest->block, - ( sRequest->inClusterOffset - ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ) ); - - - if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( 
sRequest->dRequest.fuse_req, sRequest->cowRequest ); + assert( ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) + sRequest->size <= DNBD3_BLOCK_SIZE ); + // Here, we again check if the block is written locally - there might have been a second write + // that wrote the full block, hence didn't have to wait for remote data and finished faster. + // In that case, don't pad from remote as we'd overwrite newer data. + if ( isBlockLocal( sRequest->block, sRequest->inClusterOffset ) ) { + logadd( LOG_INFO, "It happened!" ); + } else { + // copy write Data + // writeBuffer is the received data, patch data from fuse write into it + memcpy( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ), sRequest->writeSrc, + sRequest->size ); + if ( !writeAll( cow.fdData, sRequest->writeBuffer, DNBD3_BLOCK_SIZE, + sRequest->block->offset + ( sRequest->inClusterOffset & ~DNBD3_BLOCK_MASK ) ) ) { + sRequest->cowRequest->errorCode = errno; + } else { + sRequest->cowRequest->bytesWorkedOn += sRequest->size; + int64_t bit = sRequest->inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( sRequest->block->bitfield, bit, bit, true ); + sRequest->block->timeChanged = time( NULL ); + } } + + finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); free( sRequest ); } /** * @brief If a block does not start or finish on an multiple of DNBD3_BLOCK_SIZE, the blocks need to be * padded. If this block is inside the original image size, the padding data will be read from the server. - * Otherwise it will be padded with 0 since the it must be the block at the end of the image. - * TODO: Properly document the arguments and what value range they can be, i.e. see below for the 4k case - * + * Otherwise it will be padded with 0 since the it must be a block after the end of the image. 
+ * @param req fuse_req_t + * @param cowRequest cow_request_t + * @param startOffset Absolute offset where the real data starts + * @param endOffset Absolute offset where the real data ends + * @param srcBuffer pointer to the data that needs to be padded, ie. data from user space. */ -static void padBlockFromRemote( fuse_req_t req, off_t offset, cow_request_t *cowRequest, const char *buffer, - size_t size, cow_l2_entry_t *block, off_t inClusterOffset ) +static bool padBlockForWrite( fuse_req_t req, cow_request_t *cowRequest, + off_t startOffset, off_t endOffset, const char *srcBuffer ) { - // TODO: Is this *guaranteed* to be the case on the caller site? Add comment to ^ - assert( ( offset % DNBD3_BLOCK_SIZE ) + size <= DNBD3_BLOCK_SIZE ); - if ( offset >= (off_t)metadata->originalImageSize ) { - // Writing past the end of the image - inClusterOffset -= inClusterOffset % DNBD3_BLOCK_SIZE; - char buf[DNBD3_BLOCK_SIZE] = { 0 }; - memcpy( buf + ( offset % DNBD3_BLOCK_SIZE ), buffer, size ); - // At this point we should have a 4k block with user-space data to write, and possibly - // zero-padding at start and/or end - - writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, - block, inClusterOffset ); - return; - } - // Need to fetch padding from upstream - cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, 1 ); - sRequest->callback = writePaddedBlock; - sRequest->inClusterOffset = inClusterOffset; - sRequest->block = block; - sRequest->size = size; - sRequest->writeSrc = buffer; - sRequest->cowRequest = cowRequest; - - sRequest->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, metadata->originalImageSize - offset ); - sRequest->dRequest.offset = offset - ( offset % DNBD3_BLOCK_SIZE ); - sRequest->dRequest.fuse_req = req; - - atomic_fetch_add( &cowRequest->workCounter, 1 ); - if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; - if ( atomic_fetch_sub( 
&sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); + // Make sure we pad exactly one block + endOffset = MIN( (uint64_t)endOffset, ( startOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK ); + assert( startOffset < endOffset ); + size_t size = (size_t)( endOffset - startOffset ); + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + off_t inClusterOffset = startOffset % COW_DATA_CLUSTER_SIZE; + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + if ( isBlockLocal( cluster, startOffset ) ) { + // No padding at all, keep existing data + bool ret = writeAll( cow.fdData, srcBuffer, size, cluster->offset + inClusterOffset ); + if ( ret ) { + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); } - free( sRequest ); - return; + return ret; + } + // Not local, need some form of padding + createL2Table( l1Index ); + if ( cluster == NULL ) { + cluster = getL2Entry( l1Index, l2Index, true ); + } + uint64_t validImageSize = metadata->validRemoteSize; // As we don't lock + if ( startOffset >= (off_t)validImageSize ) { + // After end of remote valid data, pad with zeros entirely + char buf[DNBD3_BLOCK_SIZE] = {0}; + off_t start = startOffset % DNBD3_BLOCK_SIZE; + assert( start + size <= DNBD3_BLOCK_SIZE ); + memcpy( buf + start, srcBuffer, size ); + bool ret = writeAll( cow.fdData, buf, DNBD3_BLOCK_SIZE, + cluster->offset + ( inClusterOffset & ~DNBD3_BLOCK_MASK ) ); + if ( ret ) { + int64_t bit = inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, bit, bit, true ); + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); + } + return ret; + } + // Need to fetch padding from upstream, allocate struct plus one block + cow_sub_request_t *sub = calloc( sizeof( *sub ) + DNBD3_BLOCK_SIZE, 1 ); + sub->callback = writePaddedBlock; + sub->inClusterOffset = inClusterOffset; + sub->block = cluster; 
+ sub->size = size; + sub->writeSrc = srcBuffer; + sub->cowRequest = cowRequest; + + sub->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, validImageSize - startOffset ); + sub->dRequest.offset = startOffset & ~DNBD3_BLOCK_MASK; + sub->dRequest.fuse_req = req; + + if ( !connection_read( &sub->dRequest ) ) { + free( sub ); + errno = ENOTSOCK; + return false; } + atomic_fetch_add( &cowRequest->workCounter, 1 ); + return true; } /** @@ -1187,15 +1270,19 @@ void readRemoteData( cow_sub_request_t *sRequest ) atomic_fetch_add( &sRequest->cowRequest->bytesWorkedOn, sRequest->dRequest.length ); if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - if ( sRequest->cowRequest->bytesWorkedOn < sRequest->cowRequest->fuseRequestSize ) { - // TODO: Is this a logic bug somewhere, reagarding accounting? + if ( sRequest->cowRequest->bytesWorkedOn != sRequest->cowRequest->fuseRequestSize ) { // Because connection_read() will always return exactly as many bytes as requested, // or simply never finish. - // Otherwise, we should return EIO... - logadd( LOG_ERROR, "pad read to small\n" ); + logadd( LOG_ERROR, "BUG? Pad read has invalid size. 
worked on: %"PRIu64", request size: %" + PRIu64", offset: %"PRIu64, + (uint64_t)sRequest->cowRequest->bytesWorkedOn, + (uint64_t)sRequest->cowRequest->fuseRequestSize, + (uint64_t)sRequest->cowRequest->fuseRequestOffset ); + fuse_reply_err( sRequest->dRequest.fuse_req, EIO ); + } else { + fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, + sRequest->cowRequest->bytesWorkedOn ); } - fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, - sRequest->cowRequest->bytesWorkedOn ); free( sRequest->cowRequest->readBuffer ); free( sRequest->cowRequest ); } @@ -1213,69 +1300,61 @@ void readRemoteData( cow_sub_request_t *sRequest ) void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi ) { - // decrease if ( size < metadata->imageSize ) { - if ( size < metadata->originalImageSize ) { - metadata->originalImageSize = size; + // truncate file + if ( size < metadata->validRemoteSize ) { + metadata->validRemoteSize = size; } - // TODO.... so.... - // originalImageSize = smallest we have seen - // imageSize = current - // ? - - // increase } else if ( size > metadata->imageSize ) { + // grow file, pad with zeroes off_t offset = metadata->imageSize; int l1Index = offsetToL1Index( offset ); int l2Index = offsetToL2Index( offset ); int l1EndIndex = offsetToL1Index( size ); int l2EndIndex = offsetToL2Index( size ); - // special case first block TODO: What is the special case? What is happening here? 
- if ( cow.l1[l1Index] != -1 ) { - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - if ( metadata->imageSize % DNBD3_BLOCK_SIZE != 0 ) { - off_t inClusterOffset = metadata->imageSize % COW_DATA_CLUSTER_SIZE; + // Special case, first cluster through which the size change passes + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + // if the new size is inside a DNBD3_BLOCK it might still contain old data before a truncate + if ( !IS_4K_ALIGNED( metadata->imageSize ) ) { size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE ); - if ( checkBit( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - char buf[sizeToWrite]; - memset( buf, 0, sizeToWrite ); - - ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inClusterOffset ); + if ( checkBit( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE ) ) { + char buf[DNBD3_BLOCK_SIZE] = {0}; + ssize_t bytesWritten = pwrite( cow.fdData, buf, sizeToWrite, cluster->offset + inClusterOffset ); if ( bytesWritten < (ssize_t)sizeToWrite ) { fuse_reply_err( req, bytesWritten == -1 ? 
errno : EIO ); return; } - block->timeChanged = time( NULL ); + cluster->timeChanged = time( NULL ); offset += sizeToWrite; } } - // rest of block set bits 0 - l1Index = offsetToL1Index( offset ); - l2Index = offsetToL2Index( offset ); - block = getL2Entry( l1Index, l2Index ); - off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; - setBitsInBitfield( - block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + // all remaining bits in cluster will get set to 0 + inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + setBitsInBitfield( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE, + ( COW_BITFIELD_SIZE * 8 ) - 1, false ); + cluster->timeChanged = time( NULL ); l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; l1Index++; } } - // null all bitfields - while ( !( l1Index > l1EndIndex || ( l1Index == l1EndIndex && l2EndIndex < l2Index ) ) ) { + // normal case, if clusters exist, null bitfields + while ( l1Index < l1EndIndex || ( l1Index == l1EndIndex && l2Index <= l2EndIndex ) ) { if ( cow.l1[l1Index] == -1 ) { l1Index++; l2Index = 0; continue; } - - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - setBitsInBitfield( block->bitfield, 0, ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + memset( cluster->bitfield, 0, COW_BITFIELD_SIZE ); + cluster->timeChanged = time( NULL ); + } l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; @@ -1308,98 +1387,82 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz off_t currentOffset = offset; off_t endOffset = offset + size; + if ( !IS_4K_ALIGNED( currentOffset ) ) { + // Handle case where start is not 4k aligned + if ( !padBlockForWrite( req, cowRequest, currentOffset, endOffset, cowRequest->writeBuffer ) ) { + goto fail; + } + // Move forward to next block border + currentOffset = 
( currentOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK; + } + if ( currentOffset < endOffset && !IS_4K_ALIGNED( endOffset ) ) { + // Handle case where end is not 4k aligned + off_t lastBlockStart = endOffset & ~DNBD3_BLOCK_MASK; + if ( !padBlockForWrite( req, cowRequest, lastBlockStart, endOffset, + cowRequest->writeBuffer + ( lastBlockStart - offset ) ) ) { + goto fail; + } + endOffset = lastBlockStart; + } + + // From here on start and end are block-aligned int l1Index = offsetToL1Index( currentOffset ); int l2Index = offsetToL2Index( currentOffset ); while ( currentOffset < endOffset ) { if ( cow.l1[l1Index] == -1 ) { - createL2Block( l1Index ); + createL2Table( l1Index ); } //loop over L2 array (metadata) while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) { - cow_l2_entry_t *metaBlock = getL2Entry( l1Index, l2Index ); - - // Calc absolute offset in image corresponding to current cluster - size_t clusterAbsoluteStartOffset = l1Index * COW_FULL_L2_TABLE_DATA_SIZE + l2Index * COW_DATA_CLUSTER_SIZE; - - size_t inClusterOffset = currentOffset - clusterAbsoluteStartOffset; - // How many bytes we can write to this cluster before crossing a boundary, or before the write request is completed + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + size_t inClusterOffset = currentOffset % COW_DATA_CLUSTER_SIZE; + // How many bytes we can write to this cluster before crossing a boundary, + // or before the write request is complete size_t bytesToWriteToCluster = MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset ); - ///////////////////////// - // lock for the half block probably needed - if ( currentOffset % DNBD3_BLOCK_SIZE != 0 - && !checkBit( metaBlock->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block has not been written locally before, and write does not start on block boundary. - // Need to fetch the first couple bytes of the block from remote before writing the block to disk. 
- size_t writeSize = MIN( bytesToWriteToCluster, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) ); - const char *sbuf = cowRequest->writeBuffer + ( ( currentOffset - offset ) ); - padBlockFromRemote( req, currentOffset, cowRequest, sbuf, writeSize, metaBlock, (off_t)inClusterOffset ); - currentOffset += writeSize; - continue; + if ( !writeAll( cow.fdData, cowRequest->writeBuffer + ( currentOffset - offset ), + bytesToWriteToCluster, cluster->offset + inClusterOffset ) ) { + goto fail; } - - size_t endPaddedSize = 0; // In case we need to skip over a pending pad request to remote - if ( ( currentOffset + bytesToWriteToCluster ) % DNBD3_BLOCK_SIZE != 0 - && metadata->originalImageSize > currentOffset + bytesToWriteToCluster ) { - // Write request does not end on block boundary, and ends before end of image - // End offset of this write - off_t clusterEndOffset = currentOffset + bytesToWriteToCluster; - // Start of last block of write, i.e. start of the last, incomplete block - off_t lastBlockStartOffset = clusterEndOffset - ( clusterEndOffset % DNBD3_BLOCK_SIZE ); - // Where that last block starts relative to its cluster - off_t inClusterBlockOffset = lastBlockStartOffset - clusterAbsoluteStartOffset; - if ( !checkBit( metaBlock->bitfield, (int)( inClusterBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block indeed not modified before, need to fetch - const char *sbuf = cowRequest->writeBuffer + ( ( lastBlockStartOffset - offset ) ); - padBlockFromRemote( req, lastBlockStartOffset, cowRequest, sbuf, clusterEndOffset - lastBlockStartOffset, metaBlock, - inClusterBlockOffset ); - - - bytesToWriteToCluster -= clusterEndOffset - lastBlockStartOffset; - endPaddedSize = clusterEndOffset - lastBlockStartOffset; - } - } - writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)bytesToWriteToCluster, - bytesToWriteToCluster, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inClusterOffset ); - + int64_t f = inClusterOffset / 
DNBD3_BLOCK_SIZE; + int64_t t = ( inClusterOffset + bytesToWriteToCluster - 1 ) / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, f, t, true ); + cowRequest->bytesWorkedOn += bytesToWriteToCluster; currentOffset += bytesToWriteToCluster; - // Account for skipped-over bytes - currentOffset += endPaddedSize; - - + cluster->timeChanged = time( NULL ); l2Index++; } l1Index++; l2Index = 0; } - if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( req, cowRequest ); + goto success; + +fail: + if ( cowRequest->errorCode == 0 ) { + cowRequest->errorCode = errno != 0 ? errno : EIO; } +success: + finishWriteRequest( req, cowRequest ); } /** * @brief Request data, that is not available locally, via the network. * - * @param req fuse_req_t + * @param req fuse_req_t * @param offset from the start of the file * @param size of data to request * @param buffer into which the data is to be written - * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. TODO There is no such param, but cowRequest.. + * @param cowRequest cow_request_t */ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest ) { - // edgecase: Image size got reduced before on a non block border - if ( offset + size > (long int) metadata->originalImageSize ) { // TODO How does this check if it's a non block border? - size_t padZeroSize = ( offset + size ) - metadata->originalImageSize; - off_t padZeroOffset = metadata->originalImageSize - offset; - assert( offset > 0 ); // TODO Should this be padZeroOffset? - // ... But isn't it possible that offset > originalImageSize, in which case it would be negative? 
- memset( ( buffer + padZeroOffset ), 0, padZeroSize ); - - atomic_fetch_add( &cowRequest->bytesWorkedOn, padZeroSize ); - } + assert( offset < (off_t)metadata->validRemoteSize ); + assert( offset + size <= (off_t)metadata->validRemoteSize ); + if ( size == 0 ) + return; + assert( size > 0 ); cow_sub_request_t *sRequest = malloc( sizeof( cow_sub_request_t ) ); sRequest->callback = readRemoteData; sRequest->dRequest.length = (uint32_t)size; @@ -1410,35 +1473,33 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer atomic_fetch_add( &cowRequest->workCounter, 1 ); if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; // TODO We set an error... + cowRequest->errorCode = EIO; free( sRequest ); if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - // .... but would still report success if this happens to be the last pending sub-request!? - fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); + fuse_reply_err( req, EIO ); + free( cowRequest->readBuffer ); + free( cowRequest ); } - free( cowRequest->readBuffer ); - free( cowRequest ); - return; } } /** * @brief Get the Block Data Source object * - * @param block - * @param bitfieldOffset - * @param offset - * @return enum dataSource + * @param block + * @param bitfieldOffset + * @param offset + * @return enum dataSource */ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset ) { - if ( block != NULL && checkBit( block->bitfield, (int)bitfieldOffset ) ) { - return local; + if ( block != NULL && checkBit( block->bitfield, bitfieldOffset ) ) { + return ds_local; } - if ( offset >= (off_t)metadata->originalImageSize ) { - return zero; + if ( offset >= (off_t)metadata->validRemoteSize ) { + return ds_zero; } - return remote; + return ds_remote; } /** @@ -1450,124 +1511,109 @@ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, * @param offset offset where the read starts. 
* @return uint64_t Number of bytes read. */ -void cowfile_read( fuse_req_t req, size_t size, off_t offset ) +void cowfile_read( fuse_req_t req, size_t size, off_t startOffset ) { cow_request_t *cowRequest = malloc( sizeof( cow_request_t ) ); cowRequest->fuseRequestSize = size; cowRequest->bytesWorkedOn = ATOMIC_VAR_INIT( 0 ); cowRequest->workCounter = ATOMIC_VAR_INIT( 1 ); cowRequest->errorCode = ATOMIC_VAR_INIT( 0 ); - cowRequest->readBuffer = malloc( size ); - cowRequest->fuseRequestOffset = offset; - off_t lastReadOffset = offset; - off_t endOffset = offset + size; - off_t searchOffset = offset; - int l1Index = offsetToL1Index( offset ); - int l2Index = offsetToL2Index( offset ); - int bitfieldOffset = getBitfieldOffsetBit( offset ); - enum dataSource dataState; - cow_l2_entry_t *cluster = NULL; - - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } + cowRequest->readBuffer = calloc( size, 1 ); + cowRequest->fuseRequestOffset = startOffset; + off_t lastReadOffset = -1; + off_t endOffset = startOffset + size; + off_t searchOffset = startOffset; + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + int bitfieldOffset = getBitfieldOffsetBit( startOffset ); + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + enum dataSource dataState = ds_invalid; + bool flushCurrentSpan = false; // Set if we need to read the current span and start the next one + bool newSourceType = true; // Set if we're starting a new span, and the source type needs to be determined - bool doRead = false; - bool firstLoop = true; - bool updateBlock = false; while ( searchOffset < endOffset ) { - if ( firstLoop ) { - firstLoop = false; + if ( newSourceType ) { + newSourceType = false; lastReadOffset = searchOffset; - // TODO: Why is this only set on first iteration and not for every block/cluster? 
dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset ); } else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) { - // TODO So data source changed, but we don't update the dataState var... How can this possibly work? - doRead = true; + // Source type changed, obviously need to flush current span + flushCurrentSpan = true; } else { bitfieldOffset++; - } - - if ( bitfieldOffset >= COW_BITFIELD_SIZE * 8 ) { - // Advance to next cluster in current l2 table - bitfieldOffset = 0; - l2Index++; - if ( l2Index >= COW_L2_TABLE_SIZE ) { - // Advance to next l1 entry, reset l2 index - l2Index = 0; - l1Index++; - } - // Also set flag that we need to update the 'cluster' struct at the end of this iteration - // TODO: Why do we update all the values above, but not the cluster struct? We access those - // variables in the code below, so we have updated offset and index, but operate on the - // old cluster struct. How does that make sense? - updateBlock = true; - if ( dataState == local ) { - doRead = true; + // If reading from local cow file, crossing a cluster border means we need to flush + // since the next cluster might be somewhere else in the data file + if ( dataState == ds_local && bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + flushCurrentSpan = true; } } - // compute the original file offset from bitfieldOffset, l2Index and l1Index - // TODO ??? As stated above, this is using the updated values, so isn't this the next - // offset tather than original offset? 
- searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Index * COW_DATA_CLUSTER_SIZE + + // compute the absolute image offset from bitfieldOffset, l2Index and l1Index + // bitfieldOffset might be out of bounds here, but that doesn't matter for the calculation + searchOffset = DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE; - if ( doRead || searchOffset >= endOffset ) { - ssize_t sizeToRead = MIN( searchOffset, endOffset ); - if ( dataState == remote ) { - if ( sizeToRead > (ssize_t) metadata->originalImageSize ) { - //pad rest with 0 - memset( cowRequest->readBuffer - + ( ( lastReadOffset - offset ) + ( metadata->originalImageSize - offset ) ), - 0, sizeToRead - metadata->originalImageSize ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead - metadata->originalImageSize ); - sizeToRead = metadata->originalImageSize; + if ( flushCurrentSpan || searchOffset >= endOffset ) { + ssize_t spanEndOffset = MIN( searchOffset, endOffset ); + if ( dataState == ds_remote ) { + if ( spanEndOffset > (ssize_t)metadata->validRemoteSize ) { + // Account for bytes we leave zero, because they're beyond the (truncated) original image size + atomic_fetch_add( &cowRequest->bytesWorkedOn, spanEndOffset - metadata->validRemoteSize ); + spanEndOffset = metadata->validRemoteSize; } - sizeToRead -= lastReadOffset; - readRemote( - req, lastReadOffset, sizeToRead, cowRequest->readBuffer + ( lastReadOffset - offset ), cowRequest ); - } else if ( dataState == zero ) { - sizeToRead -= lastReadOffset; - memset( cowRequest->readBuffer + ( lastReadOffset - offset ), 0, sizeToRead ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead ); - } else { - sizeToRead -= lastReadOffset; - // Compute the offset in the data file where the read starts - off_t localRead = - cluster->offset + ( ( lastReadOffset % COW_FULL_L2_TABLE_DATA_SIZE ) % COW_DATA_CLUSTER_SIZE ); + readRemote( req, lastReadOffset, spanEndOffset - 
lastReadOffset, + cowRequest->readBuffer + ( lastReadOffset - startOffset ), cowRequest ); + } else if ( dataState == ds_zero ) { + // Past end of image, account for leaving them zero + ssize_t numBytes = spanEndOffset - lastReadOffset; + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else if ( dataState == ds_local ) { + ssize_t numBytes = spanEndOffset - lastReadOffset; + // Compute the startOffset in the data file where the read starts + off_t localRead = cluster->offset + ( lastReadOffset % COW_DATA_CLUSTER_SIZE ); ssize_t totalBytesRead = 0; - while ( totalBytesRead < sizeToRead ) { - ssize_t bytesRead = - pread( cow.fhd, cowRequest->readBuffer + ( lastReadOffset - offset ), sizeToRead, localRead ); + while ( totalBytesRead < numBytes ) { + ssize_t bytesRead = pread( cow.fdData, cowRequest->readBuffer + ( lastReadOffset - startOffset ), + numBytes - totalBytesRead, localRead + totalBytesRead ); if ( bytesRead == -1 ) { cowRequest->errorCode = errno; goto fail; - } else if ( bytesRead <= 0 ) { + } else if ( bytesRead == 0 ) { + logadd( LOG_ERROR, "EOF for read at localRead=%"PRIu64", totalBR=%"PRIu64, + (uint64_t)localRead, (uint64_t)totalBytesRead ); + logadd( LOG_ERROR, "searchOffset=%"PRIu64", endOffset=%"PRIu64", imageSize=%"PRIu64, + searchOffset, endOffset, metadata->imageSize ); cowRequest->errorCode = EIO; goto fail; } totalBytesRead += bytesRead; } - atomic_fetch_add( &cowRequest->bytesWorkedOn, totalBytesRead ); + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else { + assert( 4 == 6 ); } lastReadOffset = searchOffset; - doRead = false; - firstLoop = true; + flushCurrentSpan = false; + // Since the source type changed, reset + newSourceType = true; } - - if ( updateBlock ) { - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } else { - cluster = NULL; + if ( bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + // Advance to next cluster in current l2 table + bitfieldOffset = 0; + l2Index++; + if ( 
l2Index >= COW_L2_TABLE_SIZE ) { + // Advance to next l1 entry, reset l2 index + l2Index = 0; + l1Index++; } - updateBlock = false; + cluster = getL2Entry( l1Index, l2Index, false ); } } fail:; if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn < size ) { - logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d)", cowRequest->errorCode ); + if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn != size ) { + logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d, workedOn: %"PRIu64", size: %"PRIu64")", + cowRequest->errorCode, (uint64_t)cowRequest->bytesWorkedOn, (uint64_t)size ); fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO ); } else { fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); diff --git a/src/fuse/cowfile.h b/src/fuse/cowfile.h index a07469d..0f395de 100644 --- a/src/fuse/cowfile.h +++ b/src/fuse/cowfile.h @@ -24,9 +24,10 @@ _Static_assert( sizeof( atomic_int_least64_t ) == 8, "atomic_int_least64_t not 8 enum dataSource { - local, - remote, - zero + ds_invalid, + ds_local, + ds_remote, + ds_zero }; #define COW_METADATA_HEADER_SIZE 320 @@ -36,19 +37,20 @@ typedef struct cowfile_metadata_header atomic_uint_least64_t imageSize; // 8byte int32_t version; // 4byte int32_t blocksize; // 4byte - uint64_t originalImageSize; // 8byte - the name implies this is the size of the image on the server, but apparently it changes if we truncate the image etc. better name? 
- uint64_t metaDataStart; // 8byte + uint64_t validRemoteSize; // 8byte + uint32_t startL1; // 4byte + uint32_t startL2; // 4byte int32_t bitfieldSize; // 4byte int32_t nextL2; // 4byte - atomic_uint_least64_t metadataFileSize; // 8byte - atomic_uint_least64_t dataFileSize; // 8byte + atomic_int_least64_t metaSize; // 8byte + atomic_int_least64_t nextClusterOffset; // 8byte uint64_t maxImageSize; // 8byte uint64_t creationTime; // 8byte char uuid[40]; // 40byte char imageName[200]; // 200byte } cowfile_metadata_header_t; -_Static_assert( - sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE, "cowfile_metadata_header is messed up" ); +_Static_assert( sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE, + "cowfile_metadata_header is messed up" ); #define COW_L2_ENTRY_SIZE 64 typedef struct cow_l2_entry @@ -56,25 +58,25 @@ typedef struct cow_l2_entry atomic_int_least64_t offset; atomic_uint_least64_t timeChanged; atomic_uint_least64_t uploads; - atomic_char bitfield[COW_BITFIELD_SIZE]; + atomic_uchar bitfield[COW_BITFIELD_SIZE]; } cow_l2_entry_t; _Static_assert( sizeof( cow_l2_entry_t ) == COW_L2_ENTRY_SIZE, "cow_l2_entry_t is messed up" ); /** * Open request for reading/writing the virtual image we expose. - * TODO Please verify field comments */ typedef struct cow_request { size_t fuseRequestSize; // Number of bytes to be read/written off_t fuseRequestOffset; // Absolute offset into the image, as seen by user space - char *readBuffer; // Used only in read case? - const char *writeBuffer; // Used only in write case? - atomic_size_t bytesWorkedOn; // Used for ??? + char *readBuffer; // Used only in read case + const char *writeBuffer; // Used only in write case + atomic_size_t bytesWorkedOn; // Used for tracking how many bytes we have touched (excluding padding etc) atomic_int workCounter; // How many pending sub requests (see below) atomic_int errorCode; // For reporting back to fuse - fuse_ino_t ino; // Inode of file, used for ??? 
- struct fuse_file_info *fi; // Used for ??? + fuse_ino_t ino; // Inode of file, used for ??? (For reporting back to fuse, don't know if needed?) + struct fuse_file_info *fi; // Used for ??? (For reporting back to fuse, don't know if needed?) + //fuse_req_t req; // Fuse request } cow_request_t; typedef struct cow_sub_request cow_sub_request_t; @@ -88,14 +90,14 @@ typedef void ( *cow_callback )( cow_sub_request_t *sRequest ); typedef struct cow_sub_request { size_t size; // size of this sub-request - off_t inClusterOffset; // offset relative to!? cow-block? DNBD3 block? cluster? - const char *writeSrc; // ??? - char *buffer; // ??? + off_t inClusterOffset; // offset relative to the beginning of the cluster + const char *writeSrc; // pointer to the data of a write request which needs padding + char *buffer; // Points into the original read buffer, at the place where the sub read request should be copied to. cow_l2_entry_t *block; // the cluster inClusterOffset refers to cow_callback callback; // Callback when we're done handling this cow_request_t *cowRequest; // parent request dnbd3_async_t dRequest; // Probably request to dnbd3-server for non-aligned writes (wrt 4k dnbd3 block) - char writeBuffer[]; // ??? + char writeBuffer[]; // buffer for a padding write request, gets filled from a remote read, then the writeSrc data gets copied into it. 
} cow_sub_request_t; typedef struct cow_curl_read_upload @@ -103,18 +105,18 @@ typedef struct cow_curl_read_upload atomic_uint_least64_t time; cow_l2_entry_t *block; size_t position; - long unsigned int blocknumber; + long unsigned int clusterNumber; int fails; int64_t ulLast; + atomic_uchar bitfield[COW_BITFIELD_SIZE]; } cow_curl_read_upload_t; -typedef struct cow_block_upload_statistics +typedef struct cow_cluster_statistics { - uint64_t blocknumber; + uint64_t clusterNumber; uint64_t uploads; -} cow_block_upload_statistics_t; - +} cow_cluster_statistics_t; typedef int32_t l1; typedef cow_l2_entry_t l2[COW_L2_TABLE_SIZE]; diff --git a/src/fuse/main.c b/src/fuse/main.c index 6e7977c..96d8f5c 100644 --- a/src/fuse/main.c +++ b/src/fuse/main.c @@ -329,7 +329,7 @@ static void printUsage( char *argv0, int exitCode ) printf( "\n" ); printf( "Usage: %s [--debug] [--option mountOpts] --host --image [--rid revision] \n", argv0 ); printf( "Or: %s [-d] [-o mountOpts] -h -i [-r revision] \n", argv0 ); - printf( "For cow: %s [-d] [-o mountOpts] -h -i [-r revision] -c -C -m [--cowStatStdout] [--cowStatFile] \n", argv0 ); + printf( "For cow: %s [-d] [-o mountOpts] -h -i [-r revision] -c -C -m [--cow-stats-stdout] [--cow-stats-file] \n", argv0 ); printf( " -d --debug Don't fork, write stats file, and print debug output (fuse -> stderr, dnbd3 -> stdout)\n" ); printf( " -f Don't fork (dnbd3 -> stdout)\n" ); printf( " -h --host List of space separated hosts to use\n" ); @@ -342,12 +342,13 @@ static void printUsage( char *argv0, int exitCode ) printf( " -c Enables cow, creates the cow files at given location\n" ); printf( " -L Loads the cow files from the given location\n" ); printf( " -C Host address of the cow server\n" ); - printf( " --cowStatStdout prints the cow status in stdout\n" ); - printf( " --cowStatFile creates and updates the cow status file\n" ); + printf( "--cow-stats-stdout prints the cow status in stdout\n" ); + printf( "--cow-stats-file creates and updates the cow 
status file\n" ); + printf( " -m --merge tell server to merge and create new revision on exit\n" ); exit( exitCode ); } -static const char *optString = "dfHh:i:l:o:r:SsVvc:L:C:mxy"; +static const char *optString = "dfHh:i:l:o:r:SsVvc:L:C:m"; static const struct option longOpts[] = { { "debug", no_argument, NULL, 'd' }, { "help", no_argument, NULL, 'H' }, @@ -362,8 +363,8 @@ static const struct option longOpts[] = { { "loadcow", required_argument, NULL, 'L' }, { "cowServer", required_argument, NULL, 'C' }, { "merge", no_argument, NULL, 'm' }, - { "cowStatStdout", no_argument, NULL, 'x' }, - { "cowStatFile", no_argument, NULL, 'y' }, + { "cow-stats-stdout", no_argument, NULL, 'sout' }, + { "cow-stats-file", no_argument, NULL, 'sfil' }, { 0, 0, 0, 0 } }; @@ -467,10 +468,10 @@ int main( int argc, char *argv[] ) useCow = true; loadCow = true; break; - case 'x': + case 'sout': sStdout = true; break; - case 'y': + case 'sfil': sFile = true; break; default: -- cgit v1.2.3-55-g7522