-rw-r--r--  inc/dnbd3/config.h        1
-rw-r--r--  inc/dnbd3/config/cow.h    2
-rw-r--r--  src/cowtest/main.c       50
-rw-r--r--  src/fuse/cowfile.c      928
-rw-r--r--  src/fuse/cowfile.h       52
-rw-r--r--  src/fuse/main.c          17
6 files changed, 560 insertions, 490 deletions
diff --git a/inc/dnbd3/config.h b/inc/dnbd3/config.h
index eb4b8b1..482bd29 100644
--- a/inc/dnbd3/config.h
+++ b/inc/dnbd3/config.h
@@ -39,5 +39,6 @@
// +++++ Block Device +++++
#define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END!
+#define DNBD3_BLOCK_MASK ((uint64_t)4095) // NEVER CHANGE THIS OR THE WORLD WILL END!
#endif /* CONFIG_H_ */
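
Editor's note: the new mask complements DNBD3_BLOCK_SIZE for alignment arithmetic; later hunks use it via IS_4K_ALIGNED and "& ~DNBD3_BLOCK_MASK". A minimal standalone sketch of the usual idioms (everything except the two macros is illustrative):

    #include <inttypes.h>
    #include <stdio.h>

    #define DNBD3_BLOCK_SIZE ((uint64_t)4096)
    #define DNBD3_BLOCK_MASK ((uint64_t)4095)

    int main( void )
    {
        uint64_t offset = 10000;
        // Multiple of 4096? Same test as the IS_4K_ALIGNED macro further down in this patch.
        int aligned = ( offset & DNBD3_BLOCK_MASK ) == 0;
        // Round down to the containing block border.
        uint64_t blockStart = offset & ~DNBD3_BLOCK_MASK;
        // Advance to the strictly next block border, as cowfile_write does for unaligned starts.
        uint64_t nextBlock = ( offset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK;
        printf( "%d %" PRIu64 " %" PRIu64 "\n", aligned, blockStart, nextBlock );
        return 0;
    }
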
diff --git a/inc/dnbd3/config/cow.h b/inc/dnbd3/config/cow.h
index 9ed59a0..a7f3615 100644
--- a/inc/dnbd3/config/cow.h
+++ b/inc/dnbd3/config/cow.h
@@ -11,7 +11,7 @@
#define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted
#define COW_URL_STRING_SIZE 500 // Max string size for an url
#define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file
-#define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb)
+#define COW_MAX_IMAGE_SIZE (1000LL * 1000LL * 1000LL * 1000LL) // Maximum size an image can have(tb*gb*mb*kb)
// +++++ COW API Endpoints +++++
#define COW_API_CREATE "%s/api/file/create"
#define COW_API_UPDATE "%s/api/file/update?guid=%s&clusterindex=%lu"
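
Editor's note: the old COW_MAX_IMAGE_SIZE definition ended in a semicolon and had no outer parentheses, so it only worked in plain assignments. A small illustration of what the parenthesized form fixes (the surrounding code is hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    #define COW_MAX_IMAGE_SIZE (1000LL * 1000LL * 1000LL * 1000LL)

    int main( void )
    {
        uint64_t requested = 42;
        // With the old "... 1000LL;" definition, the expanded semicolon would end the
        // if-condition early and this would not even compile.
        if ( requested > (uint64_t)COW_MAX_IMAGE_SIZE ) {
            fprintf( stderr, "image too large\n" );
            return 1;
        }
        // The outer parentheses also keep precedence intact in larger expressions;
        // without them, "x % COW_MAX_IMAGE_SIZE" would bind to the first 1000LL only.
        printf( "ok\n" );
        return 0;
    }
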
diff --git a/src/cowtest/main.c b/src/cowtest/main.c
index 38d0f16..c7da4ac 100644
--- a/src/cowtest/main.c
+++ b/src/cowtest/main.c
@@ -43,7 +43,7 @@ atomic_bool randomTestLoop = true;
#define RND_MAX_WRITE_SIZE 4096 * 320
#define RND_TRUNCATE_PROBABILITY 5
-#define RND_UNALIGNED_WRITE_PROBABILITY 5
+#define RND_UNALIGNED_WRITE_PROBABILITY 80
#define RND_DEFAULT_MIN_SIZE_PERCENT 0.9f
#define RND_DEFAULT_MAX_SIZE_PERCENT 1.1f
#define BASE_DATA (char)42
@@ -97,10 +97,29 @@ bool generateTestFile( char *path, size_t size )
* @param str
* @param len
*/
-void printCharInHexadecimal( const char *str, int len )
+void printCharInHexadecimal( const char *str, const char *got, int len )
{
+ int pr = 0;
for ( int i = 0; i < len; ++i ) {
- printf( "0x%02x ", (int)str[i] );
+ if ( pr > 0 ) {
+ pr--;
+ if ( str[i] != got[i] ) {
+ printf( "[%02x/%02x] ", (int)str[i], (int)got[i] );
+ } else {
+ printf( "%02x ", (int)str[i] );
+ }
+ if ( pr == 0 ) {
+ printf( " .." );
+ }
+ } else {
+ if ( str[i] != got[i] ) {
+ pr = 4;
+ i = MAX( -1, i - 4 );
+ if ( i != -1 ) {
+ printf(".. " );
+ }
+ }
+ }
}
printf( "\n" );
}
@@ -118,12 +137,10 @@ void printCharInHexadecimal( const char *str, int len )
bool compare( char buff[], char expected[], size_t size, char errorMessage[] )
{
if ( memcmp( buff, expected, size ) != 0 ) {
- printf( "%s", errorMessage );
+ printf( "%s\n", errorMessage );
if ( printOnError ) {
- printf( "Expected: \n" );
- printCharInHexadecimal( expected, (int)size );
- printf( "Got: \n " );
- printCharInHexadecimal( buff, (int)size );
+ printf( "Diff [want/got]: \n" );
+ printCharInHexadecimal( expected, buff, (int)size );
}
return false;
}
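
Editor's note: the reworked dump no longer prints two full hex buffers but a windowed want/got diff. A tiny self-contained sketch of the same idea (simplified to a fixed three bytes of leading context; not the test tool's exact output):

    #include <stdio.h>
    #include <string.h>

    // Print only bytes near mismatches, marking differences as [want/got].
    static void hexDiff( const unsigned char *want, const unsigned char *got, int len )
    {
        for ( int i = 0; i < len; ++i ) {
            if ( want[i] == got[i] )
                continue;
            int from = i > 3 ? i - 3 : 0;
            printf( ".. " );
            for ( int j = from; j <= i; ++j ) {
                if ( want[j] != got[j] ) {
                    printf( "[%02x/%02x] ", want[j], got[j] );
                } else {
                    printf( "%02x ", want[j] );
                }
            }
            printf( ".." );
        }
        printf( "\n" );
    }

    int main( void )
    {
        unsigned char want[8], got[8];
        memset( want, 0x2a, sizeof(want) );
        memcpy( got, want, sizeof(want) );
        got[5] = 0x00;
        hexDiff( want, got, 8 ); // prints: .. 2a 2a 2a [2a/00] ..
        return 0;
    }
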
@@ -247,14 +264,14 @@ bool verifySingleBit()
expected[0] = 1;
if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE, 0, "SingleBit test Failed: first read to small" ) )
return false;
- if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: first write not as expected" ) )
+ if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: first read not as expected" ) )
return false;
expected[0] = BASE_DATA;
expected[DNBD3_BLOCK_SIZE / 2] = 1;
if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read to small" ) )
return false;
- if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second write not as expected" ) )
+ if ( !compare( buff, expected, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read not as expected" ) )
return false;
printf( "testSingleBit successful!\n" );
return true;
@@ -426,7 +443,7 @@ bool verifyLongNonAlignedPattern()
if ( !readSizeTested( fh, buffer, sizeToRead, offset, "writeLongNonAlignedPattern test Failed: read failed" ) ) {
return false;
}
- if ( !compare( buffer, expected, sizeToRead, "writeLongNonAlignedPattern test Failed: read failed" ) )
+ if ( !compare( buffer, expected, sizeToRead, "writeLongNonAlignedPattern test Failed: compare failed" ) )
return false;
offset += sizeToRead;
}
@@ -755,7 +772,7 @@ bool verifyFinalFile( char *path )
size_t fileSize = testFileSize + 2 * l2Capacity;
struct stat st;
- stat( path, &st );
+ fstat( fh, &st );
size_t size = st.st_size;
if ( size != fileSize ) {
printf( "verify Failed, wrong file size\n expectedSize: %zu\n got: %zu\n", fileSize, size );
@@ -1023,7 +1040,7 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS
return (void*) false;
}
// RANDOM WRITE LOOP
- printf( "Press any key to cancel\n" );
+ printf( "Press Ctrl-C to stop and compare\n" );
while ( randomTestLoop ) {
//select test
int r = rand() % 100;
@@ -1042,7 +1059,10 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS
} else {
// write test
off_t offset = rand() % maxOffset;
- size_t size = rand() % RND_MAX_WRITE_SIZE;
+ size_t size = ( rand() + offset ) % RND_MAX_WRITE_SIZE;
+ if ( size < RND_MAX_WRITE_SIZE / 2 ) {
+ size /= ( rand() % 8191 ) + 1; // +1 avoids division by zero
+ }
size = MAX( size, 1 );
if ( r > RND_TRUNCATE_PROBABILITY + RND_UNALIGNED_WRITE_PROBABILITY ) {
// align to block
@@ -1051,7 +1071,7 @@ bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minS
}
generateRandomData( fhr, buf, size );
- printf( "write offset: %zu size: %zu\n", offset, size );
+ printf( "write offset: %zu size: %zu r: %d\n", offset, size, r );
if ( !writeSizeTested( fhm, buf, size, offset, "failed to write on mounted image" ) )
return false;
if ( !writeSizeTested( fhn, buf, size, offset, "failed to write on normal image" ) )
diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c
index 8e816a2..a53b101 100644
--- a/src/fuse/cowfile.c
+++ b/src/fuse/cowfile.c
@@ -10,10 +10,13 @@
#include <curl/curl.h>
#define UUID_STRLEN 36
+// Maximum assumed page size, in case the cow data gets transferred between different architectures
+// 16k should be the largest minimum in existence (Itanium)
+#define MAX_PAGE_SIZE 16384
extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
-static const int CURRENT_COW_VERSION = 1;
+static const int CURRENT_COW_VERSION = 2;
static bool statStdout;
static bool statFile;
@@ -30,18 +33,51 @@ atomic_bool uploadLoopDone = false; // Upload loop has finished all work?
static struct cow
{
- pthread_mutex_t l2CreateLock;
- int fhm;
- int fhd;
- int fhs;
char *metadata_mmap;
l1 *l1;
- l2 *firstL2;
- size_t maxImageSize;
- size_t l1Size; //size of l1 array
-
+ l2 *l2;
+ int fdMeta;
+ int fdData;
+ int fdStats;
+ pthread_mutex_t l2CreateLock;
} cow;
+static int countOneBits( atomic_uchar *bf, int numBytes )
+{
+ int bitCount = 0;
+ for ( int i = 0; i < numBytes; ++i ) {
+ unsigned char value = bf[i];
+ while ( value > 0 ) {
+ if ( ( value & 1 ) == 1 ) {
+ bitCount++;
+ }
+ value >>= 1;
+ }
+ }
+ return bitCount;
+}
+
+#define IS_4K_ALIGNED(v) ( ( (uint64_t)(v) & DNBD3_BLOCK_MASK ) == 0 )
+
+static bool writeAll( int fd, const char *buf, size_t count, off_t offset )
+{
+ while ( count > 0 ) {
+ ssize_t ret = pwrite( fd, buf, count, offset );
+ if ( ret == (ssize_t)count )
+ return true;
+ if ( ret == -1 ) {
+ if ( errno == EINTR )
+ continue;
+ return false;
+ }
+ if ( ret == 0 )
+ return false;
+ count -= ret;
+ buf += ret;
+ }
+ return true;
+}
+
/**
* @brief Computes the l1 index for an absolute file offset
*
@@ -83,7 +119,7 @@ static int getBitfieldOffsetBit( size_t offset )
* @param to end bit
* @param value set bits to 1 or 0
*/
-static void setBits( atomic_char *byte, int from, int to, bool value )
+static void setBits( atomic_uchar *byte, int64_t from, int64_t to, bool value )
{
char mask = (char)( ( 255 >> ( 7 - ( to - from ) ) ) << from );
if ( value ) {
@@ -101,13 +137,13 @@ static void setBits( atomic_char *byte, int from, int to, bool value )
* @param to end bit
* @param value set bits to 1 or 0
*/
-static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool value )
+static void setBitsInBitfield( atomic_uchar *bitfield, int64_t from, int64_t to, bool value )
{
- assert( from >= 0 || to < COW_BITFIELD_SIZE * 8 );
- int start = from / 8;
- int end = to / 8;
+ assert( from >= 0 && to < COW_BITFIELD_SIZE * 8 );
+ int64_t start = from / 8;
+ int64_t end = to / 8;
- for ( int i = start; i <= end; i++ ) {
+ for ( int64_t i = start; i <= end; i++ ) {
setBits( ( bitfield + i ), from - i * 8, MIN( 7, to - i * 8 ), value );
from = ( i + 1 ) * 8;
}
@@ -119,9 +155,9 @@ static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool val
* @param bitfield of a cow_l2_entry
* @param n the bit which should be checked
*/
-static bool checkBit( atomic_char *bitfield, int n )
+static bool checkBit( atomic_uchar *bitfield, int64_t n )
{
- return ( atomic_load( ( bitfield + ( n / 8 ) ) ) >> ( n % 8 ) ) & 1;
+ return ( bitfield[n / 8] >> ( n % 8 ) ) & 1;
}
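
Editor's note: the bitfield maps each 4 KiB block of a cluster to one bit; bit n lives in byte n/8 at bit position n%8, which is what checkBit() and setBitsInBitfield() compute above. A minimal standalone sketch of that arithmetic (plain unsigned char instead of atomics, and the byte count here is only illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define BITFIELD_BYTES 40 // illustrative size; one bit per 4 KiB block of a cluster

    static bool getBit( const unsigned char *bf, int n )
    {
        return ( bf[n / 8] >> ( n % 8 ) ) & 1;
    }

    static void setBit( unsigned char *bf, int n )
    {
        bf[n / 8] |= (unsigned char)( 1u << ( n % 8 ) );
    }

    int main( void )
    {
        unsigned char bf[BITFIELD_BYTES] = { 0 };
        long inClusterOffset = 5 * 4096 + 123; // somewhere inside the cluster's block 5
        setBit( bf, (int)( inClusterOffset / 4096 ) );
        printf( "block 5 written: %d, block 6 written: %d\n", getBit( bf, 5 ), getBit( bf, 6 ) );
        return 0;
    }
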
@@ -225,32 +261,50 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void *
cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata;
size_t len = 0;
// Check if we're still in the bitfield
- if ( uploadBlock->position < (size_t)metadata->bitfieldSize ) {
- size_t lenCpy = MIN( metadata->bitfieldSize - uploadBlock->position, size * nmemb );
- memcpy( ptr, uploadBlock->block->bitfield + uploadBlock->position, lenCpy );
+ if ( uploadBlock->position < COW_BITFIELD_SIZE ) {
+ size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb );
+ memcpy( ptr + uploadBlock->position, uploadBlock->bitfield + uploadBlock->position,
+ lenCpy );
uploadBlock->position += lenCpy;
len += lenCpy;
}
// No elseif here, might just have crossed over...
- if ( uploadBlock->position >= (size_t)metadata->bitfieldSize ) {
- ssize_t wantRead = (ssize_t)MIN(
- COW_DATA_CLUSTER_SIZE - ( uploadBlock->position - ( metadata->bitfieldSize ) ),
- ( size * nmemb ) - len );
- off_t inClusterOffset = uploadBlock->position - metadata->bitfieldSize;
- ssize_t lengthRead = pread( cow.fhd, ( ptr + len ), wantRead, uploadBlock->block->offset + inClusterOffset );
- if ( lengthRead == -1 ) {
- logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno );
- return CURL_READFUNC_ABORT;
- }
-
- if ( wantRead > lengthRead ) {
- // fill up since last block may not be a full block
- memset( ptr + len + lengthRead, 0, wantRead - lengthRead );
- // TODO what about partial read? We should know how much data there actually is...
- lengthRead = wantRead;
+ if ( uploadBlock->position >= COW_BITFIELD_SIZE ) {
+ // Subtract the bitfield size from everything first
+ off_t inClusterOffset = uploadBlock->position - COW_BITFIELD_SIZE;
+ ssize_t spaceLeft = ( size * nmemb ) - len;
+ // Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since
+ // we do a lot of 4k/32k reads, but it's not that performance critical I guess...
+ while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) {
+ int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE );
+ size_t readSize;
+ // Small performance hack: All bits one in a byte, do a 32k instead of 4k read
+ if ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE * 8
+ && bitNumber % 8 == 0
+ && uploadBlock->bitfield[bitNumber / 8] == 0xff ) {
+ readSize = DNBD3_BLOCK_SIZE * 8;
+ } else {
+ readSize = DNBD3_BLOCK_SIZE;
+ }
+ // Check bits in our copy, as global bitfield could change
+ if ( checkBit( uploadBlock->bitfield, bitNumber ) ) {
+ ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize,
+ uploadBlock->block->offset + inClusterOffset );
+ if ( lengthRead == -1 ) {
+ logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno );
+ return CURL_READFUNC_ABORT;
+ }
+ if ( lengthRead != (ssize_t)readSize ) {
+ logadd( LOG_ERROR, "Upload: Reading from COW file failed with short read (%d/%d)",
+ (int)lengthRead, (int)readSize );
+ return CURL_READFUNC_ABORT;
+ }
+ len += lengthRead;
+ spaceLeft -= lengthRead;
+ }
+ inClusterOffset += readSize;
+ uploadBlock->position += readSize;
}
- uploadBlock->position += lengthRead;
- len += lengthRead;
}
return len;
}
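
Editor's note: curlReadCallbackUploadBlock now streams only the written 4 KiB blocks, guided by the copied bitfield. For context, a CURLOPT_READFUNCTION callback may return fewer than size*nmemb bytes and is simply called again; returning 0 signals the end of the body. A minimal sketch of that contract with an in-memory source (not the dnbd3 code):

    #include <string.h>

    struct memSrc {
        const char *data;
        size_t len;
        size_t pos;
    };

    // CURLOPT_READFUNCTION-style callback: deliver at most size*nmemb bytes per call,
    // keep track of the position across calls, return 0 once the body is complete.
    static size_t readCallback( char *ptr, size_t size, size_t nmemb, void *userdata )
    {
        struct memSrc *src = userdata;
        size_t space = size * nmemb;
        size_t chunk = src->len - src->pos;
        if ( chunk > space )
            chunk = space;
        memcpy( ptr, src->data + src->pos, chunk );
        src->pos += chunk;
        return chunk;
    }

With such a callback, CURLOPT_POSTFIELDSIZE_LARGE should match the exact number of bytes the callback will deliver, which is what the countOneBits()-based computation in addUpload() below is for.
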
@@ -280,7 +334,7 @@ bool mergeRequest()
part = curl_mime_addpart( mime );
curl_mime_name( part, "originalFileSize" );
char buf[21];
- snprintf( buf, sizeof buf, "%" PRIu64, metadata->originalImageSize );
+ snprintf( buf, sizeof buf, "%" PRIu64, metadata->validRemoteSize );
curl_mime_data( part, buf, CURL_ZERO_TERMINATED );
part = curl_mime_addpart( mime );
@@ -341,15 +395,15 @@ int progress_callback( void *clientp, __attribute__((unused)) curl_off_t dlTotal
__attribute__((unused)) curl_off_t dlNow, __attribute__((unused)) curl_off_t ulTotal, curl_off_t ulNow )
{
CURL *eh = (CURL *)clientp;
- cow_curl_read_upload_t *curlUploadBlock;
+ cow_curl_read_upload_t *uploadingCluster;
CURLcode res;
- res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &curlUploadBlock );
+ res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &uploadingCluster );
if ( res != CURLE_OK ) {
logadd( LOG_ERROR, "ERROR" );
return 0;
}
- bytesUploaded += ( ulNow - curlUploadBlock->ulLast );
- curlUploadBlock->ulLast = ulNow;
+ bytesUploaded += ( ulNow - uploadingCluster->ulLast );
+ uploadingCluster->ulLast = ulNow;
return 0;
}
@@ -381,7 +435,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha
"modifiedClusters=%" PRIu64 "\n"
"idleClusters=%" PRIu64 "\n"
"totalClustersUploaded=%" PRIu64 "\n"
- "activeUploads=:%i\n"
+ "activeUploads=%i\n"
"%s%s",
state, inQueue, modified, idle, totalBlocksUploaded, activeUploads,
COW_SHOW_UL_SPEED ? "ulspeed=" : "",
@@ -398,9 +452,10 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha
if ( statFile ) {
// Pad with a bunch of newlines so we don't change the file size all the time
- ssize_t extra = MIN( 20, sizeof(buffer) - len - 1 );
+ ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 );
memset( buffer + len, '\n', extra );
- if ( pwrite( cow.fhs, buffer, len + extra, 43 ) != len ) {
+ lseek( cow.fdStats, 43, SEEK_SET );
+ if ( write( cow.fdStats, buffer, len + extra ) != len ) {
logadd( LOG_WARNING, "Could not update cow status file" );
}
#ifdef COW_DUMP_BLOCK_UPLOADS
@@ -412,7 +467,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha
}
int cmpfunc( const void *a, const void *b )
{
- return (int)( ( (cow_block_upload_statistics_t *)b )->uploads - ( (cow_block_upload_statistics_t *)a )->uploads );
+ return (int)( ( (cow_cluster_statistics_t *)b )->uploads - ( (cow_cluster_statistics_t *)a )->uploads );
}
/**
* @brief Writes all block numbers sorted by the number of uploads into the statsfile.
@@ -422,26 +477,25 @@ void dumpBlockUploads()
{
long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE );
- cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE];
+ cow_cluster_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE];
uint64_t currentBlock = 0;
for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) {
if ( cow.l1[l1Index] == -1 ) {
continue;
}
for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
- cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index );
+ cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index );
blockUploads[currentBlock].uploads = block->uploads;
- blockUploads[currentBlock].blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
+ blockUploads[currentBlock].clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
currentBlock++;
}
}
- qsort( blockUploads, currentBlock, sizeof( cow_block_upload_statistics_t ), cmpfunc );
- lseek( cow.fhs, 0, SEEK_END );
+ qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc );
- dprintf( cow.fhs, "\n\nblocknumber: uploads\n==Block Upload Dump===\n" );
+ dprintf( cow.fdStats, "\n\nclusterNumber: uploads\n==Block Upload Dump===\n" );
for ( uint64_t i = 0; i < currentBlock; i++ ) {
- dprintf( cow.fhs, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].blocknumber, blockUploads[i].uploads );
+ dprintf( cow.fdStats, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].clusterNumber, blockUploads[i].uploads );
}
}
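
Editor's note on cmpfunc above: subtracting two uint64_t upload counters and casting to int is fine for the small counts expected here, but it can misorder entries once the difference exceeds INT_MAX. A defensive descending comparator would compare explicitly, e.g. (stand-in struct, illustrative names):

    #include <stdint.h>

    typedef struct {
        uint64_t clusterNumber;
        uint64_t uploads;
    } stats_t; // stand-in for cow_cluster_statistics_t

    // Descending by uploads, without relying on the difference fitting into an int.
    static int cmpUploadsDesc( const void *a, const void *b )
    {
        uint64_t ua = ( (const stats_t *)a )->uploads;
        uint64_t ub = ( (const stats_t *)b )->uploads;
        return ( ua < ub ) - ( ua > ub );
    }
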
@@ -449,29 +503,32 @@ void dumpBlockUploads()
* @brief Starts the upload of a given block.
*
* @param cm Curl_multi
- * @param curlUploadBlock containing the data for the block to upload.
+ * @param uploadingCluster containing the data for the block to upload.
*/
-bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_slist *headers )
+bool addUpload( CURLM *cm, cow_curl_read_upload_t *uploadingCluster, struct curl_slist *headers )
{
CURL *eh = curl_easy_init();
char url[COW_URL_STRING_SIZE];
- snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, curlUploadBlock->blocknumber );
+ snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, uploadingCluster->clusterNumber );
curl_easy_setopt( eh, CURLOPT_URL, url );
curl_easy_setopt( eh, CURLOPT_POST, 1L );
curl_easy_setopt( eh, CURLOPT_READFUNCTION, curlReadCallbackUploadBlock );
- curl_easy_setopt( eh, CURLOPT_READDATA, (void *)curlUploadBlock );
- curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)curlUploadBlock );
+ curl_easy_setopt( eh, CURLOPT_READDATA, (void *)uploadingCluster );
+ curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)uploadingCluster );
// min upload speed of 1kb/s over 10 sec otherwise the upload is canceled.
curl_easy_setopt( eh, CURLOPT_LOW_SPEED_TIME, 10L );
curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L );
- curl_easy_setopt(
- eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_DATA_CLUSTER_SIZE ) );
+ curl_easy_setopt( eh, CURLOPT_POSTFIELDSIZE_LARGE,
+ (long)( COW_BITFIELD_SIZE
+ + DNBD3_BLOCK_SIZE * countOneBits( uploadingCluster->bitfield, COW_BITFIELD_SIZE ) )
+ );
+
if ( COW_SHOW_UL_SPEED ) {
- curlUploadBlock->ulLast = 0;
+ uploadingCluster->ulLast = 0;
curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L );
curl_easy_setopt( eh, CURLOPT_XFERINFOFUNCTION, progress_callback );
curl_easy_setopt( eh, CURLOPT_XFERINFODATA, eh );
@@ -495,35 +552,35 @@ bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_
bool finishUpload( CURLM *cm, CURLMsg *msg, struct curl_slist *headers )
{
bool status = true;
- cow_curl_read_upload_t *curlUploadBlock;
+ cow_curl_read_upload_t *uploadingCluster;
CURLcode res;
CURLcode res2;
- res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &curlUploadBlock );
+ res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &uploadingCluster );
long http_code = 0;
res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code );
if ( res != CURLE_OK || res2 != CURLE_OK || http_code < 200 || http_code >= 300
|| msg->msg != CURLMSG_DONE ) {
- curlUploadBlock->fails++;
- logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", curlUploadBlock->fails,
+ uploadingCluster->fails++;
+ logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", uploadingCluster->fails,
curl_easy_strerror( msg->data.result ) );
- if ( curlUploadBlock->fails <= 5 ) {
- addUpload( cm, curlUploadBlock, headers );
+ if ( uploadingCluster->fails < 5 ) {
+ addUpload( cm, uploadingCluster, headers );
goto CLEANUP;
}
- free( curlUploadBlock );
+ free( uploadingCluster );
status = false;
goto CLEANUP;
}
// everything went ok, update timeChanged
- atomic_compare_exchange_strong( &curlUploadBlock->block->timeChanged, &curlUploadBlock->time, 0 );
+ atomic_compare_exchange_strong( &uploadingCluster->block->timeChanged, &uploadingCluster->time, 0 );
- curlUploadBlock->block->uploads++;
+ uploadingCluster->block->uploads++;
totalBlocksUploaded++;
- free( curlUploadBlock );
+ free( uploadingCluster );
CLEANUP:
curl_multi_remove_handle( cm, msg->easy_handle );
curl_easy_cleanup( msg->easy_handle );
@@ -593,7 +650,7 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm )
}
// Now all L2 blocks
for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
- cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index );
+ cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index );
if ( block->offset == -1 ) {
continue; // Not allocated
}
@@ -613,10 +670,15 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm )
&& activeUploads > 0 );
cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) );
b->block = block;
- b->blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
+ b->clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
b->fails = 0;
b->position = 0;
b->time = block->timeChanged;
+ // Copy, so it doesn't change during upload
+ // when we assemble the data in curlReadCallbackUploadBlock()
+ for ( int i = 0; i < COW_BITFIELD_SIZE; ++i ) {
+ b->bitfield[i] = block->bitfield[i];
+ }
addUpload( cm, b, headers );
if ( !ignoreMinUploadDelay && !uploadLoop ) {
goto DONE;
@@ -637,7 +699,7 @@ DONE:
*
*/
-void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something )
+void *cowfile_statUpdater( __attribute__((unused)) void *something )
{
uint64_t lastUpdateTime = time( NULL );
@@ -653,7 +715,7 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something )
continue;
}
for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
- cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index );
+ cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index );
if ( block->offset == -1 ) {
continue;
}
@@ -738,12 +800,12 @@ static bool createCowStatsFile( char *path )
logadd( LOG_INFO, "%s", buffer );
}
if ( statFile ) {
- if ( ( cow.fhs = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ if ( ( cow.fdStats = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) {
logadd( LOG_ERROR, "Could not create cow status file. Bye.\n" );
return false;
}
- if ( pwrite( cow.fhs, buffer, len, 0 ) != len ) {
+ if ( pwrite( cow.fdStats, buffer, len, 0 ) != len ) {
logadd( LOG_ERROR, "Could not write to cow status file. Bye.\n" );
return false;
}
@@ -770,67 +832,72 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion,
snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" );
snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" );
- if ( ( cow.fhm = open( pathMeta, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ if ( ( cow.fdMeta = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) {
logadd( LOG_ERROR, "Could not create cow meta file. Bye.\n %s \n", pathMeta );
return false;
}
- if ( ( cow.fhd = open( pathData, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ if ( ( cow.fdData = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) {
logadd( LOG_ERROR, "Could not create cow data file. Bye.\n" );
return false;
}
-
- int maxPageSize = 8192;
+ struct stat fs;
+ if ( fstat( cow.fdData, &fs ) == -1 || fs.st_size != 0 ) {
+ logadd( LOG_ERROR, "/data file already exists and is not empty" );
+ return false;
+ }
size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t );
- cow.maxImageSize = COW_MAX_IMAGE_SIZE;
- cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE );
+ // Calculate how many full l2 tables we need to address COW_MAX_IMAGE_SIZE
+ size_t l1NumEntries = ( ( COW_MAX_IMAGE_SIZE + COW_FULL_L2_TABLE_DATA_SIZE - 1 )
+ / COW_FULL_L2_TABLE_DATA_SIZE );
+ // Make sure l1 and l2 are aligned to struct size
+ size_t sizeL1 = sizeof(cow.l1[0]);
+ size_t sizeL2 = sizeof(cow.l2[0]);
+ size_t startL1 = ( ( metaDataSizeHeader + sizeL1 - 1 ) / sizeL1 ) * sizeL1;
+ size_t startL2 = ( ( startL1 + l1NumEntries * sizeL1 + sizeL2 - 1 ) / sizeL2 ) * sizeL2;
// size of l1 array + number of l2's * size of l2
- size_t metadata_size = cow.l1Size * sizeof( l1 ) + cow.l1Size * sizeof( l2 );
+ size_t ps = getpagesize();
+ size_t metaSize = ( ( startL2 + l1NumEntries * sizeof( l2 ) + ps - 1 ) / ps ) * ps;
- // compute next fitting multiple of getpagesize()
- size_t meta_data_start = ( ( metaDataSizeHeader + maxPageSize - 1 ) / maxPageSize ) * maxPageSize;
-
- size_t metadataFileSize = meta_data_start + metadata_size;
- if ( ftruncate( cow.fhm, metadataFileSize ) != 0 ) {
+ if ( ftruncate( cow.fdMeta, metaSize ) != 0 ) {
logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). Bye.\n", errno );
return false;
}
- cow.metadata_mmap = mmap( NULL, metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 );
+ cow.metadata_mmap = mmap( NULL, metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 );
if ( cow.metadata_mmap == MAP_FAILED ) {
- logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) );
+ logadd( LOG_ERROR, "Error while mmap()ing meta data, errno=%d", errno );
return false;
}
metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap );
metadata->magicValue = COW_FILE_META_MAGIC_VALUE;
+ metadata->imageSize = **imageSizePtr;
metadata->version = CURRENT_COW_VERSION;
- metadata->dataFileSize = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE );
- metadata->metadataFileSize = ATOMIC_VAR_INIT( metadataFileSize );
- metadata->blocksize = DNBD3_BLOCK_SIZE;
- metadata->originalImageSize = **imageSizePtr;
- metadata->imageSize = metadata->originalImageSize;
- metadata->creationTime = time( NULL );
- *imageSizePtr = &metadata->imageSize;
- metadata->metaDataStart = meta_data_start;
+ metadata->validRemoteSize = **imageSizePtr;
+ metadata->startL1 = (uint32_t)startL1;
+ metadata->startL2 = (uint32_t)startL2;
metadata->bitfieldSize = COW_BITFIELD_SIZE;
- metadata->maxImageSize = cow.maxImageSize;
- snprintf( metadata->imageName, 200, "%s", image_Name );
- cow.l1 = (l1 *)( cow.metadata_mmap + meta_data_start );
metadata->nextL2 = 0;
+ metadata->metaSize = ATOMIC_VAR_INIT( metaSize );
+ metadata->nextClusterOffset = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE );
+ metadata->maxImageSize = COW_MAX_IMAGE_SIZE;
+ metadata->creationTime = time( NULL );
+ snprintf( metadata->imageName, 200, "%s", image_Name );
- for ( size_t i = 0; i < cow.l1Size; i++ ) {
+ cow.l1 = (l1 *)( cow.metadata_mmap + startL1 );
+ cow.l2 = (l2 *)( cow.metadata_mmap + startL2 );
+ for ( size_t i = 0; i < l1NumEntries; i++ ) {
cow.l1[i] = -1;
}
- cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size );
// write header to data file
uint64_t header = COW_FILE_DATA_MAGIC_VALUE;
- if ( pwrite( cow.fhd, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
+ if ( pwrite( cow.fdData, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" );
return false;
}
@@ -848,6 +915,7 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion,
return false;
}
createCowStatsFile( path );
+ *imageSizePtr = &metadata->imageSize;
return true;
}
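
Editor's note: the init path now derives the on-disk metadata layout from the structure sizes instead of the hard-coded 8 KiB page guess: header, then the L1 array, then the L2 tables, each start rounded up to its element size, and the total rounded up to the page size. A small standalone sketch of the same rounding arithmetic (all sizes made up, only the formula matters):

    #include <stdio.h>
    #include <stddef.h>

    // Round x up to the next multiple of align (align > 0).
    static size_t roundUp( size_t x, size_t align )
    {
        return ( ( x + align - 1 ) / align ) * align;
    }

    int main( void )
    {
        size_t headerSize = 320;     // made-up sizeof(cowfile_metadata_header_t)
        size_t sizeL1 = 4;           // made-up sizeof(l1 entry)
        size_t sizeL2 = 64 * 1024;   // made-up sizeof(l2 table)
        size_t l1NumEntries = 977;   // tables needed to cover the max image size
        size_t pageSize = 4096;

        size_t startL1 = roundUp( headerSize, sizeL1 );
        size_t startL2 = roundUp( startL1 + l1NumEntries * sizeL1, sizeL2 );
        size_t metaSize = roundUp( startL2 + l1NumEntries * sizeL2, pageSize );
        printf( "L1 @ %zu, L2 @ %zu, file size %zu\n", startL1, startL2, metaSize );
        return 0;
    }
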
@@ -871,11 +939,11 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server
snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" );
- if ( ( cow.fhm = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ if ( ( cow.fdMeta = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" );
return false;
}
- if ( ( cow.fhd = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ if ( ( cow.fdData = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
logadd( LOG_ERROR, "Could not open cow data file. Bye.\n" );
return false;
}
@@ -885,7 +953,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server
size_t sizeToRead = sizeof( cowfile_metadata_header_t );
size_t readBytes = 0;
while ( readBytes < sizeToRead ) {
- ssize_t bytes = pread( cow.fhm, ( ( &header ) + readBytes ), sizeToRead, 0 );
+ ssize_t bytes = pread( cow.fdMeta, (char *)&header + readBytes, sizeToRead - readBytes, readBytes );
if ( bytes <= 0 ) {
logadd( LOG_ERROR, "Error while reading meta file header. Bye.\n" );
return false;
@@ -902,44 +970,55 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server
logadd( LOG_ERROR, "cow meta file of unkown format. Bye.\n" );
return false;
}
+
+ if ( header.bitfieldSize != COW_BITFIELD_SIZE ) {
+ logadd( LOG_ERROR, "cow meta file has unexpected bitfield size %d", (int)header.bitfieldSize );
+ return false;
+ }
+ if ( header.startL1 >= header.startL2 || header.startL2 >= header.metaSize ) {
+ logadd( LOG_ERROR, "l1/l2 offset messed up in metadata." );
+ return false;
+ }
+
struct stat st;
- fstat( cow.fhm, &st );
- if ( st.st_size < (off_t)( header.metaDataStart + header.nextL2 * sizeof( l2 ) ) ) {
- logadd( LOG_ERROR, "cow meta file to small. Bye.\n" );
+ fstat( cow.fdMeta, &st );
+ if ( st.st_size < (off_t)header.metaSize ) {
+ logadd( LOG_ERROR, "cow meta file too small. Bye." );
return false;
}
}
{
uint64_t magicValueDataFile;
- if ( pread( cow.fhd, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
- logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye.\n" );
+ if ( pread( cow.fdData, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
+ logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye." );
return false;
}
if ( magicValueDataFile != COW_FILE_DATA_MAGIC_VALUE ) {
if ( __builtin_bswap64( magicValueDataFile ) == COW_FILE_DATA_MAGIC_VALUE ) {
- logadd( LOG_ERROR, "cow data file of wrong endianess. Bye.\n" );
+ logadd( LOG_ERROR, "cow data file of wrong endianess. Bye." );
return false;
}
- logadd( LOG_ERROR, "cow data file of unkown format. Bye.\n" );
+ logadd( LOG_ERROR, "cow data file of unkown format. Bye." );
return false;
}
struct stat st;
- fstat( cow.fhd, &st );
- if ( (off_t)header.dataFileSize > st.st_size ) {
- logadd( LOG_ERROR, "cow data file to small. Bye.\n" );
+ fstat( cow.fdData, &st ); // add cluster size, since we don't preallocate
+ if ( header.nextClusterOffset > st.st_size + (int)COW_DATA_CLUSTER_SIZE ) {
+ logadd( LOG_ERROR, "cow data file too small. Expected=%jd, Is=%jd.",
+ (intmax_t)header.nextClusterOffset, (intmax_t)st.st_size );
return false;
}
}
- cow.metadata_mmap = mmap( NULL, header.metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 );
+ cow.metadata_mmap = mmap( NULL, header.metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 );
if ( cow.metadata_mmap == MAP_FAILED ) {
- logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) );
+ logadd( LOG_ERROR, "Error while mapping mmap, errno=%d.", errno );
return false;
}
if ( header.version != CURRENT_COW_VERSION ) {
- logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.\n",
+ logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.",
metadata->version, CURRENT_COW_VERSION );
return false;
}
@@ -948,11 +1027,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server
metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap );
*imageSizePtr = &metadata->imageSize;
- cow.l1 = (l1 *)( cow.metadata_mmap + metadata->metaDataStart );
- cow.maxImageSize = metadata->maxImageSize;
- cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE );
-
- cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size );
+ cow.l1 = (l1 *)( cow.metadata_mmap + metadata->startL1 );
+ cow.l2 = (l2 *)( cow.metadata_mmap + metadata->startL2 );
pthread_mutex_init( &cow.l2CreateLock, NULL );
createCowStatsFile( path );
return true;
@@ -961,8 +1037,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server
* @brief Starts the cow BackgroundThreads which are needed for stats and data upload
*
*/
-bool cowfile_startBackgroundThreads() {
-
+bool cowfile_startBackgroundThreads()
+{
if( pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL ) != 0 ) {
logadd( LOG_ERROR, "Could not create cow uploader thread");
return false;
@@ -977,55 +1053,15 @@ bool cowfile_startBackgroundThreads() {
}
/**
- * @brief writes the given data in the data file
- *
- * @param buffer containing the data
- * @param size of the buffer
- * @param netSize which actually contributes to the fuse write request (can be different from size if partial full blocks are written)
- * @param cowRequest <---- !???? TODO
- * @param block block being written to
- * @param inClusterOffset offset in this cluster to be written to
- */
-static void writeData( const char *buffer, ssize_t size, size_t netSize, atomic_int *errorCode,
- atomic_size_t *bytesWorkedOn, cow_l2_entry_t *block, off_t inClusterOffset )
-{
- // TODO: Assert that size + inClusterOffset <= COW_DATA_CLUSTER_SIZE?
- ssize_t totalBytesWritten = 0;
- while ( totalBytesWritten < size ) {
- ssize_t bytesWritten = pwrite( cow.fhd, ( buffer + totalBytesWritten ), size - totalBytesWritten,
- block->offset + inClusterOffset + totalBytesWritten );
- if ( bytesWritten == -1 ) {
- *errorCode = errno;
- logadd( LOG_ERROR,
- "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size,
- netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset );
- break;
- } else if ( bytesWritten == 0 ) {
- *errorCode = EIO;
- logadd( LOG_ERROR,
- "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size,
- netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset );
- break;
- }
- totalBytesWritten += bytesWritten;
- }
- atomic_fetch_add( bytesWorkedOn, netSize );
- setBitsInBitfield( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ),
- (int)( ( inClusterOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 );
-
- block->timeChanged = time( NULL );
-}
-
-/**
- * @brief Increases the metadata->dataFileSize by COW_DATA_CLUSTER_SIZE.
- * The space is not reserved on disk.
- *
- * @param block for which the space should be reserved.
+ * Check if block at given offset is local, i.e. has been modified.
+ * @param meta The cow_l2_entry for the according cluster MUST be provided
+ * @param offset offset of data, can be absolute image offset as it will be transformed into cluster offset
*/
-static bool allocateMetaBlockData( cow_l2_entry_t *block )
+static bool isBlockLocal( cow_l2_entry_t *meta, off_t offset )
{
- block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_DATA_CLUSTER_SIZE );
- return true;
+ if ( meta == NULL )
+ return false;
+ return checkBit( meta->bitfield, ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE );
}
/**
@@ -1036,34 +1072,38 @@ static bool allocateMetaBlockData( cow_l2_entry_t *block )
* @param l2Index
* @return cow_l2_entry_t*
*/
-static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index )
+static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index, bool create )
{
- cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index );
+ if ( cow.l1[l1Index] == -1 )
+ return NULL;
+ cow_l2_entry_t *block = cow.l2[cow.l1[l1Index]] + l2Index;
if ( block->offset == -1 ) {
- allocateMetaBlockData( block );
+ if ( !create )
+ return NULL;
+ block->offset = atomic_fetch_add( &metadata->nextClusterOffset, COW_DATA_CLUSTER_SIZE );
}
return block;
}
/**
- * @brief creates an new L2 Block and initializes the containing cow_l2_entry_t blocks
+ * @brief creates an new L2 table and initializes the containing cow_l2_entry_t
*
* @param l1Index
*/
-static bool createL2Block( int l1Index )
+static bool createL2Table( int l1Index )
{
pthread_mutex_lock( &cow.l2CreateLock );
if ( cow.l1[l1Index] == -1 ) {
+ int idx = metadata->nextL2++;
for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) {
- cow.firstL2[metadata->nextL2][i].offset = -1;
- cow.firstL2[metadata->nextL2][i].timeChanged = ATOMIC_VAR_INIT( 0 );
- cow.firstL2[metadata->nextL2][i].uploads = ATOMIC_VAR_INIT( 0 );
+ cow.l2[idx][i].offset = -1;
+ cow.l2[idx][i].timeChanged = ATOMIC_VAR_INIT( 0 );
+ cow.l2[idx][i].uploads = ATOMIC_VAR_INIT( 0 );
for ( int j = 0; j < COW_BITFIELD_SIZE; j++ ) {
- cow.firstL2[metadata->nextL2][i].bitfield[j] = ATOMIC_VAR_INIT( 0 );
+ cow.l2[idx][i].bitfield[j] = ATOMIC_VAR_INIT( 0 );
}
}
- cow.l1[l1Index] = metadata->nextL2;
- metadata->nextL2 += 1;
+ cow.l1[l1Index] = idx;
}
pthread_mutex_unlock( &cow.l2CreateLock );
return true;
@@ -1080,13 +1120,19 @@ static bool createL2Block( int l1Index )
static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest )
{
+ if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) != 1 )
+ return; // More sub-requests are pending, bail out
if ( cowRequest->errorCode != 0 ) {
fuse_reply_err( req, cowRequest->errorCode );
-
} else {
- uint64_t oldSize = metadata->imageSize;
- uint64_t ns = MAX( oldSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset );
- atomic_compare_exchange_strong( &metadata->imageSize, &oldSize, ns );
+ uint64_t newSize = cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset;
+ if ( newSize > metadata->imageSize ) {
+ uint64_t oldSize;
+ do {
+ oldSize = metadata->imageSize;
+ newSize = MAX( oldSize, newSize );
+ } while ( !atomic_compare_exchange_weak( &metadata->imageSize, &oldSize, newSize ) );
+ }
fuse_reply_write( req, cowRequest->bytesWorkedOn );
}
free( cowRequest );
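
Editor's note: finishWriteRequest now grows metadata->imageSize with a compare-exchange loop, i.e. an atomic maximum: retry until either our value is in place or another thread has already stored a larger one. A minimal standalone sketch of that pattern (C11 atomics, generic names):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t imageSize = 100;

    // Atomically set imageSize to max(imageSize, newSize).
    static void growTo( uint64_t newSize )
    {
        uint64_t oldSize = atomic_load( &imageSize );
        while ( newSize > oldSize ) {
            // On failure, oldSize is reloaded with the current value and we re-check.
            if ( atomic_compare_exchange_weak( &imageSize, &oldSize, newSize ) )
                break;
        }
    }

    int main( void )
    {
        growTo( 4096 );
        growTo( 2048 ); // no effect, already larger
        printf( "%llu\n", (unsigned long long)atomic_load( &imageSize ) );
        return 0;
    }
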
@@ -1100,67 +1146,104 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest )
*/
static void writePaddedBlock( cow_sub_request_t *sRequest )
{
- //copy write Data
- // TODO Assert that we have enough space in writeBuffer at that offset
- memcpy( ( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc,
- sRequest->size );
- writeData( sRequest->writeBuffer, DNBD3_BLOCK_SIZE, (ssize_t)sRequest->size, &sRequest->cowRequest->errorCode,
- &sRequest->cowRequest->bytesWorkedOn, sRequest->block,
- ( sRequest->inClusterOffset - ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ) );
-
-
- if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) {
- finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest );
+ assert( ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) + sRequest->size <= DNBD3_BLOCK_SIZE );
+ // Here, we again check if the block is written locally - there might have been a second write
+ // that wrote the full block, hence didn't have to wait for remote data and finished faster.
+ // In that case, don't pad from remote as we'd overwrite newer data.
+ if ( isBlockLocal( sRequest->block, sRequest->inClusterOffset ) ) {
+ logadd( LOG_INFO, "It happened!" );
+ } else {
+ // copy write Data
+ // writeBuffer is the received data, patch data from fuse write into it
+ memcpy( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ), sRequest->writeSrc,
+ sRequest->size );
+ if ( !writeAll( cow.fdData, sRequest->writeBuffer, DNBD3_BLOCK_SIZE,
+ sRequest->block->offset + ( sRequest->inClusterOffset & ~DNBD3_BLOCK_MASK ) ) ) {
+ sRequest->cowRequest->errorCode = errno;
+ } else {
+ sRequest->cowRequest->bytesWorkedOn += sRequest->size;
+ int64_t bit = sRequest->inClusterOffset / DNBD3_BLOCK_SIZE;
+ setBitsInBitfield( sRequest->block->bitfield, bit, bit, true );
+ sRequest->block->timeChanged = time( NULL );
+ }
}
+
+ finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest );
free( sRequest );
}
/**
* @brief If a block does not start or finish on an multiple of DNBD3_BLOCK_SIZE, the blocks need to be
* padded. If this block is inside the original image size, the padding data will be read from the server.
- * Otherwise it will be padded with 0 since the it must be the block at the end of the image.
- * TODO: Properly document the arguments and what value range they can be, i.e. see below for the 4k case
- *
+ * Otherwise it will be padded with 0 since the it must be a block after the end of the image.
+ * @param req fuse_req_t
+ * @param cowRequest cow_request_t
+ * @param startOffset Absolute offset where the real data starts
+ * @param endOffset Absolute offset where the real data ends
+ * @param srcBuffer pointer to the data that needs to be padded, ie. data from user space.
*/
-static void padBlockFromRemote( fuse_req_t req, off_t offset, cow_request_t *cowRequest, const char *buffer,
- size_t size, cow_l2_entry_t *block, off_t inClusterOffset )
+static bool padBlockForWrite( fuse_req_t req, cow_request_t *cowRequest,
+ off_t startOffset, off_t endOffset, const char *srcBuffer )
{
- // TODO: Is this *guaranteed* to be the case on the caller site? Add comment to ^
- assert( ( offset % DNBD3_BLOCK_SIZE ) + size <= DNBD3_BLOCK_SIZE );
- if ( offset >= (off_t)metadata->originalImageSize ) {
- // Writing past the end of the image
- inClusterOffset -= inClusterOffset % DNBD3_BLOCK_SIZE;
- char buf[DNBD3_BLOCK_SIZE] = { 0 };
- memcpy( buf + ( offset % DNBD3_BLOCK_SIZE ), buffer, size );
- // At this point we should have a 4k block with user-space data to write, and possibly
- // zero-padding at start and/or end
-
- writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn,
- block, inClusterOffset );
- return;
- }
- // Need to fetch padding from upstream
- cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, 1 );
- sRequest->callback = writePaddedBlock;
- sRequest->inClusterOffset = inClusterOffset;
- sRequest->block = block;
- sRequest->size = size;
- sRequest->writeSrc = buffer;
- sRequest->cowRequest = cowRequest;
-
- sRequest->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, metadata->originalImageSize - offset );
- sRequest->dRequest.offset = offset - ( offset % DNBD3_BLOCK_SIZE );
- sRequest->dRequest.fuse_req = req;
-
- atomic_fetch_add( &cowRequest->workCounter, 1 );
- if ( !connection_read( &sRequest->dRequest ) ) {
- cowRequest->errorCode = EIO;
- if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) {
- finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest );
+ // Make sure we pad exactly one block
+ endOffset = MIN( (uint64_t)endOffset, ( startOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK );
+ assert( startOffset < endOffset );
+ size_t size = (size_t)( endOffset - startOffset );
+ int l1Index = offsetToL1Index( startOffset );
+ int l2Index = offsetToL2Index( startOffset );
+ off_t inClusterOffset = startOffset % COW_DATA_CLUSTER_SIZE;
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true );
+ if ( isBlockLocal( cluster, startOffset ) ) {
+ // No padding at all, keep existing data
+ bool ret = writeAll( cow.fdData, srcBuffer, size, cluster->offset + inClusterOffset );
+ if ( ret ) {
+ cowRequest->bytesWorkedOn += size;
+ cluster->timeChanged = time( NULL );
}
- free( sRequest );
- return;
+ return ret;
+ }
+ // Not local, need some form of padding
+ createL2Table( l1Index );
+ if ( cluster == NULL ) {
+ cluster = getL2Entry( l1Index, l2Index, true );
+ }
+ uint64_t validImageSize = metadata->validRemoteSize; // As we don't lock
+ if ( startOffset >= (off_t)validImageSize ) {
+ // After end of remote valid data, pad with zeros entirely
+ char buf[DNBD3_BLOCK_SIZE] = {0};
+ off_t start = startOffset % DNBD3_BLOCK_SIZE;
+ assert( start + size <= DNBD3_BLOCK_SIZE );
+ memcpy( buf + start, srcBuffer, size );
+ bool ret = writeAll( cow.fdData, buf, DNBD3_BLOCK_SIZE,
+ cluster->offset + ( inClusterOffset & ~DNBD3_BLOCK_MASK ) );
+ if ( ret ) {
+ int64_t bit = inClusterOffset / DNBD3_BLOCK_SIZE;
+ setBitsInBitfield( cluster->bitfield, bit, bit, true );
+ cowRequest->bytesWorkedOn += size;
+ cluster->timeChanged = time( NULL );
+ }
+ return ret;
+ }
+ // Need to fetch padding from upstream, allocate struct plus one block
+ cow_sub_request_t *sub = calloc( sizeof( *sub ) + DNBD3_BLOCK_SIZE, 1 );
+ sub->callback = writePaddedBlock;
+ sub->inClusterOffset = inClusterOffset;
+ sub->block = cluster;
+ sub->size = size;
+ sub->writeSrc = srcBuffer;
+ sub->cowRequest = cowRequest;
+
+ sub->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, validImageSize - startOffset );
+ sub->dRequest.offset = startOffset & ~DNBD3_BLOCK_MASK;
+ sub->dRequest.fuse_req = req;
+
+ if ( !connection_read( &sub->dRequest ) ) {
+ free( sub );
+ errno = ENOTSOCK;
+ return false;
}
+ atomic_fetch_add( &cowRequest->workCounter, 1 );
+ return true;
}
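
Editor's note: padBlockForWrite distinguishes three cases: the block is already local (write through), it lies past validRemoteSize (zero-pad), or it lies inside the remote image (fetch first, then patch in writePaddedBlock). A standalone sketch of just the zero-padding case, with hypothetical parameter names:

    #include <stdbool.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    // Zero-pad a write that covers only part of one 4 KiB block (the "past the end of
    // the remote image" case; inside the image, the block is fetched first instead).
    static bool writeZeroPaddedBlock( int fd, const char *src, size_t size,
            off_t absOffset, off_t blockStartInFile )
    {
        char buf[4096] = { 0 };
        size_t start = (size_t)( absOffset % 4096 );
        if ( start + size > 4096 )
            return false; // caller must not cross a block border
        memcpy( buf + start, src, size );
        return pwrite( fd, buf, sizeof( buf ), blockStartInFile ) == (ssize_t)sizeof( buf );
    }
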
/**
@@ -1187,15 +1270,19 @@ void readRemoteData( cow_sub_request_t *sRequest )
atomic_fetch_add( &sRequest->cowRequest->bytesWorkedOn, sRequest->dRequest.length );
if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) {
- if ( sRequest->cowRequest->bytesWorkedOn < sRequest->cowRequest->fuseRequestSize ) {
- // TODO: Is this a logic bug somewhere, reagarding accounting?
+ if ( sRequest->cowRequest->bytesWorkedOn != sRequest->cowRequest->fuseRequestSize ) {
// Because connection_read() will always return exactly as many bytes as requested,
// or simply never finish.
- // Otherwise, we should return EIO...
- logadd( LOG_ERROR, "pad read to small\n" );
+ logadd( LOG_ERROR, "BUG? Pad read has invalid size. worked on: %"PRIu64", request size: %"
+ PRIu64", offset: %"PRIu64,
+ (uint64_t)sRequest->cowRequest->bytesWorkedOn,
+ (uint64_t)sRequest->cowRequest->fuseRequestSize,
+ (uint64_t)sRequest->cowRequest->fuseRequestOffset );
+ fuse_reply_err( sRequest->dRequest.fuse_req, EIO );
+ } else {
+ fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer,
+ sRequest->cowRequest->bytesWorkedOn );
}
- fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer,
- sRequest->cowRequest->bytesWorkedOn );
free( sRequest->cowRequest->readBuffer );
free( sRequest->cowRequest );
}
@@ -1213,69 +1300,61 @@ void readRemoteData( cow_sub_request_t *sRequest )
void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi )
{
- // decrease
if ( size < metadata->imageSize ) {
- if ( size < metadata->originalImageSize ) {
- metadata->originalImageSize = size;
+ // truncate file
+ if ( size < metadata->validRemoteSize ) {
+ metadata->validRemoteSize = size;
}
- // TODO.... so....
- // originalImageSize = smallest we have seen
- // imageSize = current
- // ?
-
- // increase
} else if ( size > metadata->imageSize ) {
+ // grow file, pad with zeroes
off_t offset = metadata->imageSize;
int l1Index = offsetToL1Index( offset );
int l2Index = offsetToL2Index( offset );
int l1EndIndex = offsetToL1Index( size );
int l2EndIndex = offsetToL2Index( size );
- // special case first block TODO: What is the special case? What is happening here?
- if ( cow.l1[l1Index] != -1 ) {
- cow_l2_entry_t *block = getL2Entry( l1Index, l2Index );
- if ( metadata->imageSize % DNBD3_BLOCK_SIZE != 0 ) {
- off_t inClusterOffset = metadata->imageSize % COW_DATA_CLUSTER_SIZE;
+ // Special case, first cluster through which the size change passes
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false );
+ if ( cluster != NULL ) {
+ off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE;
+ // if the new size is inside a DNBD3_BLOCK it might still contain old data before a truncate
+ if ( !IS_4K_ALIGNED( metadata->imageSize ) ) {
size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE );
- if ( checkBit( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) {
- char buf[sizeToWrite];
- memset( buf, 0, sizeToWrite );
-
- ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inClusterOffset );
+ if ( checkBit( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE ) ) {
+ char buf[DNBD3_BLOCK_SIZE] = {0};
+ ssize_t bytesWritten = pwrite( cow.fdData, buf, sizeToWrite, cluster->offset + inClusterOffset );
if ( bytesWritten < (ssize_t)sizeToWrite ) {
fuse_reply_err( req, bytesWritten == -1 ? errno : EIO );
return;
}
- block->timeChanged = time( NULL );
+ cluster->timeChanged = time( NULL );
offset += sizeToWrite;
}
}
- // rest of block set bits 0
- l1Index = offsetToL1Index( offset );
- l2Index = offsetToL2Index( offset );
- block = getL2Entry( l1Index, l2Index );
- off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE;
- setBitsInBitfield(
- block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 );
- block->timeChanged = time( NULL );
+ // all remaining bits in cluster will get set to 0
+ inClusterOffset = offset % COW_DATA_CLUSTER_SIZE;
+ setBitsInBitfield( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE,
+ ( COW_BITFIELD_SIZE * 8 ) - 1, false );
+ cluster->timeChanged = time( NULL );
l2Index++;
if ( l2Index >= COW_L2_TABLE_SIZE ) {
l2Index = 0;
l1Index++;
}
}
- // null all bitfields
- while ( !( l1Index > l1EndIndex || ( l1Index == l1EndIndex && l2EndIndex < l2Index ) ) ) {
+ // normal case, if clusters exist, null bitfields
+ while ( l1Index < l1EndIndex || ( l1Index == l1EndIndex && l2Index <= l2EndIndex ) ) {
if ( cow.l1[l1Index] == -1 ) {
l1Index++;
l2Index = 0;
continue;
}
-
- cow_l2_entry_t *block = getL2Entry( l1Index, l2Index );
- setBitsInBitfield( block->bitfield, 0, ( COW_BITFIELD_SIZE * 8 ) - 1, 0 );
- block->timeChanged = time( NULL );
+ cluster = getL2Entry( l1Index, l2Index, false );
+ if ( cluster != NULL ) {
+ memset( cluster->bitfield, 0, COW_BITFIELD_SIZE );
+ cluster->timeChanged = time( NULL );
+ }
l2Index++;
if ( l2Index >= COW_L2_TABLE_SIZE ) {
l2Index = 0;
@@ -1308,98 +1387,82 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz
off_t currentOffset = offset;
off_t endOffset = offset + size;
+ if ( !IS_4K_ALIGNED( currentOffset ) ) {
+ // Handle case where start is not 4k aligned
+ if ( !padBlockForWrite( req, cowRequest, currentOffset, endOffset, cowRequest->writeBuffer ) ) {
+ goto fail;
+ }
+ // Move forward to next block border
+ currentOffset = ( currentOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK;
+ }
+ if ( currentOffset < endOffset && !IS_4K_ALIGNED( endOffset ) ) {
+ // Handle case where end is not 4k aligned
+ off_t lastBlockStart = endOffset & ~DNBD3_BLOCK_MASK;
+ if ( !padBlockForWrite( req, cowRequest, lastBlockStart, endOffset,
+ cowRequest->writeBuffer + ( lastBlockStart - offset ) ) ) {
+ goto fail;
+ }
+ endOffset = lastBlockStart;
+ }
+
+ // From here on start and end are block-aligned
int l1Index = offsetToL1Index( currentOffset );
int l2Index = offsetToL2Index( currentOffset );
while ( currentOffset < endOffset ) {
if ( cow.l1[l1Index] == -1 ) {
- createL2Block( l1Index );
+ createL2Table( l1Index );
}
//loop over L2 array (metadata)
while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) {
- cow_l2_entry_t *metaBlock = getL2Entry( l1Index, l2Index );
-
- // Calc absolute offset in image corresponding to current cluster
- size_t clusterAbsoluteStartOffset = l1Index * COW_FULL_L2_TABLE_DATA_SIZE + l2Index * COW_DATA_CLUSTER_SIZE;
-
- size_t inClusterOffset = currentOffset - clusterAbsoluteStartOffset;
- // How many bytes we can write to this cluster before crossing a boundary, or before the write request is completed
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true );
+ size_t inClusterOffset = currentOffset % COW_DATA_CLUSTER_SIZE;
+ // How many bytes we can write to this cluster before crossing a boundary,
+ // or before the write request is complete
size_t bytesToWriteToCluster =
MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset );
- /////////////////////////
- // lock for the half block probably needed
- if ( currentOffset % DNBD3_BLOCK_SIZE != 0
- && !checkBit( metaBlock->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) {
- // Block has not been written locally before, and write does not start on block boundary.
- // Need to fetch the first couple bytes of the block from remote before writing the block to disk.
- size_t writeSize = MIN( bytesToWriteToCluster, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) );
- const char *sbuf = cowRequest->writeBuffer + ( ( currentOffset - offset ) );
- padBlockFromRemote( req, currentOffset, cowRequest, sbuf, writeSize, metaBlock, (off_t)inClusterOffset );
- currentOffset += writeSize;
- continue;
+ if ( !writeAll( cow.fdData, cowRequest->writeBuffer + ( currentOffset - offset ),
+ bytesToWriteToCluster, cluster->offset + inClusterOffset ) ) {
+ goto fail;
}
-
- size_t endPaddedSize = 0; // In case we need to skip over a pending pad request to remote
- if ( ( currentOffset + bytesToWriteToCluster ) % DNBD3_BLOCK_SIZE != 0
- && metadata->originalImageSize > currentOffset + bytesToWriteToCluster ) {
- // Write request does not end on block boundary, and ends before end of image
- // End offset of this write
- off_t clusterEndOffset = currentOffset + bytesToWriteToCluster;
- // Start of last block of write, i.e. start of the last, incomplete block
- off_t lastBlockStartOffset = clusterEndOffset - ( clusterEndOffset % DNBD3_BLOCK_SIZE );
- // Where that last block starts relative to its cluster
- off_t inClusterBlockOffset = lastBlockStartOffset - clusterAbsoluteStartOffset;
- if ( !checkBit( metaBlock->bitfield, (int)( inClusterBlockOffset / DNBD3_BLOCK_SIZE ) ) ) {
- // Block indeed not modified before, need to fetch
- const char *sbuf = cowRequest->writeBuffer + ( ( lastBlockStartOffset - offset ) );
- padBlockFromRemote( req, lastBlockStartOffset, cowRequest, sbuf, clusterEndOffset - lastBlockStartOffset, metaBlock,
- inClusterBlockOffset );
-
-
- bytesToWriteToCluster -= clusterEndOffset - lastBlockStartOffset;
- endPaddedSize = clusterEndOffset - lastBlockStartOffset;
- }
- }
- writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)bytesToWriteToCluster,
- bytesToWriteToCluster, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inClusterOffset );
-
+ int64_t f = inClusterOffset / DNBD3_BLOCK_SIZE;
+ int64_t t = ( inClusterOffset + bytesToWriteToCluster - 1 ) / DNBD3_BLOCK_SIZE;
+ setBitsInBitfield( cluster->bitfield, f, t, true );
+ cowRequest->bytesWorkedOn += bytesToWriteToCluster;
currentOffset += bytesToWriteToCluster;
- // Account for skipped-over bytes
- currentOffset += endPaddedSize;
-
-
+ cluster->timeChanged = time( NULL );
l2Index++;
}
l1Index++;
l2Index = 0;
}
- if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) {
- finishWriteRequest( req, cowRequest );
+ goto success;
+
+fail:
+ if ( cowRequest->errorCode == 0 ) {
+ cowRequest->errorCode = errno != 0 ? errno : EIO;
}
+success:
+ finishWriteRequest( req, cowRequest );
}
/**
* @brief Request data, that is not available locally, via the network.
*
- * @param req fuse_req_t
+ * @param req fuse_req_t
* @param offset from the start of the file
* @param size of data to request
* @param buffer into which the data is to be written
- * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. TODO There is no such param, but cowRequest..
+ * @param cowRequest cow_request_t
*/
static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest )
{
- // edgecase: Image size got reduced before on a non block border
- if ( offset + size > (long int) metadata->originalImageSize ) { // TODO How does this check if it's a non block border?
- size_t padZeroSize = ( offset + size ) - metadata->originalImageSize;
- off_t padZeroOffset = metadata->originalImageSize - offset;
- assert( offset > 0 ); // TODO Should this be padZeroOffset?
- // ... But isn't it possible that offset > originalImageSize, in which case it would be negative?
- memset( ( buffer + padZeroOffset ), 0, padZeroSize );
-
- atomic_fetch_add( &cowRequest->bytesWorkedOn, padZeroSize );
- }
+ assert( offset < (off_t)metadata->validRemoteSize );
+ assert( offset + size <= (off_t)metadata->validRemoteSize );
+ if ( size == 0 )
+ return;
+ assert( size > 0 );
cow_sub_request_t *sRequest = malloc( sizeof( cow_sub_request_t ) );
sRequest->callback = readRemoteData;
sRequest->dRequest.length = (uint32_t)size;
@@ -1410,35 +1473,33 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer
atomic_fetch_add( &cowRequest->workCounter, 1 );
if ( !connection_read( &sRequest->dRequest ) ) {
- cowRequest->errorCode = EIO; // TODO We set an error...
+ cowRequest->errorCode = EIO;
free( sRequest );
if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) {
- // .... but would still report success if this happens to be the last pending sub-request!?
- fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn );
+ fuse_reply_err( req, EIO );
+ free( cowRequest->readBuffer );
+ free( cowRequest );
}
- free( cowRequest->readBuffer );
- free( cowRequest );
- return;
}
}
/**
* @brief Get the Block Data Source object
*
- * @param block
- * @param bitfieldOffset
- * @param offset
- * @return enum dataSource
+ * @param block
+ * @param bitfieldOffset
+ * @param offset
+ * @return enum dataSource
*/
enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset )
{
- if ( block != NULL && checkBit( block->bitfield, (int)bitfieldOffset ) ) {
- return local;
+ if ( block != NULL && checkBit( block->bitfield, bitfieldOffset ) ) {
+ return ds_local;
}
- if ( offset >= (off_t)metadata->originalImageSize ) {
- return zero;
+ if ( offset >= (off_t)metadata->validRemoteSize ) {
+ return ds_zero;
}
- return remote;
+ return ds_remote;
}
/**
@@ -1450,124 +1511,109 @@ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset,
* @param offset offset where the read starts.
* @return uint64_t Number of bytes read.
*/
-void cowfile_read( fuse_req_t req, size_t size, off_t offset )
+void cowfile_read( fuse_req_t req, size_t size, off_t startOffset )
{
cow_request_t *cowRequest = malloc( sizeof( cow_request_t ) );
cowRequest->fuseRequestSize = size;
cowRequest->bytesWorkedOn = ATOMIC_VAR_INIT( 0 );
cowRequest->workCounter = ATOMIC_VAR_INIT( 1 );
cowRequest->errorCode = ATOMIC_VAR_INIT( 0 );
- cowRequest->readBuffer = malloc( size );
- cowRequest->fuseRequestOffset = offset;
- off_t lastReadOffset = offset;
- off_t endOffset = offset + size;
- off_t searchOffset = offset;
- int l1Index = offsetToL1Index( offset );
- int l2Index = offsetToL2Index( offset );
- int bitfieldOffset = getBitfieldOffsetBit( offset );
- enum dataSource dataState;
- cow_l2_entry_t *cluster = NULL;
-
- if ( cow.l1[l1Index] != -1 ) {
- cluster = getL2Entry( l1Index, l2Index );
- }
+ cowRequest->readBuffer = calloc( size, 1 );
+ cowRequest->fuseRequestOffset = startOffset;
+ off_t lastReadOffset = -1;
+ off_t endOffset = startOffset + size;
+ off_t searchOffset = startOffset;
+ int l1Index = offsetToL1Index( startOffset );
+ int l2Index = offsetToL2Index( startOffset );
+ int bitfieldOffset = getBitfieldOffsetBit( startOffset );
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false );
+ enum dataSource dataState = ds_invalid;
+ bool flushCurrentSpan = false; // Set if we need to read the current span and start the next one
+ bool newSourceType = true; // Set if we're starting a new span, and the source type needs to be determined
- bool doRead = false;
- bool firstLoop = true;
- bool updateBlock = false;
while ( searchOffset < endOffset ) {
- if ( firstLoop ) {
- firstLoop = false;
+ if ( newSourceType ) {
+ newSourceType = false;
lastReadOffset = searchOffset;
- // TODO: Why is this only set on first iteration and not for every block/cluster?
dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset );
} else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) {
- // TODO So data source changed, but we don't update the dataState var... How can this possibly work?
- doRead = true;
+ // Source type changed, obviously need to flush current span
+ flushCurrentSpan = true;
} else {
bitfieldOffset++;
- }
-
- if ( bitfieldOffset >= COW_BITFIELD_SIZE * 8 ) {
- // Advance to next cluster in current l2 table
- bitfieldOffset = 0;
- l2Index++;
- if ( l2Index >= COW_L2_TABLE_SIZE ) {
- // Advance to next l1 entry, reset l2 index
- l2Index = 0;
- l1Index++;
- }
- // Also set flag that we need to update the 'cluster' struct at the end of this iteration
- // TODO: Why do we update all the values above, but not the cluster struct? We access those
- // variables in the code below, so we have updated offset and index, but operate on the
- // old cluster struct. How does that make sense?
- updateBlock = true;
- if ( dataState == local ) {
- doRead = true;
+ // If reading from local cow file, crossing a cluster border means we need to flush
+ // since the next cluster might be somewhere else in the data file
+ if ( dataState == ds_local && bitfieldOffset == COW_BITFIELD_SIZE * 8 ) {
+ flushCurrentSpan = true;
}
}
- // compute the original file offset from bitfieldOffset, l2Index and l1Index
- // TODO ??? As stated above, this is using the updated values, so isn't this the next
- // offset tather than original offset?
- searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Index * COW_DATA_CLUSTER_SIZE
+
+ // compute the absolute image offset from bitfieldOffset, l2Index and l1Index
+ // bitfieldOffset might be out of bounds here, but that doesn't matter for the calculation
+ searchOffset = DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE
+ l1Index * COW_FULL_L2_TABLE_DATA_SIZE;
- if ( doRead || searchOffset >= endOffset ) {
- ssize_t sizeToRead = MIN( searchOffset, endOffset );
- if ( dataState == remote ) {
- if ( sizeToRead > (ssize_t) metadata->originalImageSize ) {
- //pad rest with 0
- memset( cowRequest->readBuffer
- + ( ( lastReadOffset - offset ) + ( metadata->originalImageSize - offset ) ),
- 0, sizeToRead - metadata->originalImageSize );
- atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead - metadata->originalImageSize );
- sizeToRead = metadata->originalImageSize;
+ if ( flushCurrentSpan || searchOffset >= endOffset ) {
+ ssize_t spanEndOffset = MIN( searchOffset, endOffset );
+ if ( dataState == ds_remote ) {
+ if ( spanEndOffset > (ssize_t)metadata->validRemoteSize ) {
+ // Account for bytes we leave zero, because they're beyond the (truncated) original image size
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, spanEndOffset - metadata->validRemoteSize );
+ spanEndOffset = metadata->validRemoteSize;
}
- sizeToRead -= lastReadOffset;
- readRemote(
- req, lastReadOffset, sizeToRead, cowRequest->readBuffer + ( lastReadOffset - offset ), cowRequest );
- } else if ( dataState == zero ) {
- sizeToRead -= lastReadOffset;
- memset( cowRequest->readBuffer + ( lastReadOffset - offset ), 0, sizeToRead );
- atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead );
- } else {
- sizeToRead -= lastReadOffset;
- // Compute the offset in the data file where the read starts
- off_t localRead =
- cluster->offset + ( ( lastReadOffset % COW_FULL_L2_TABLE_DATA_SIZE ) % COW_DATA_CLUSTER_SIZE );
+ readRemote( req, lastReadOffset, spanEndOffset - lastReadOffset,
+ cowRequest->readBuffer + ( lastReadOffset - startOffset ), cowRequest );
+ } else if ( dataState == ds_zero ) {
+				// Past end of image, account for the bytes we leave zero
+ ssize_t numBytes = spanEndOffset - lastReadOffset;
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes );
+ } else if ( dataState == ds_local ) {
+ ssize_t numBytes = spanEndOffset - lastReadOffset;
+				// Compute the offset in the data file where the read starts
+ off_t localRead = cluster->offset + ( lastReadOffset % COW_DATA_CLUSTER_SIZE );
ssize_t totalBytesRead = 0;
- while ( totalBytesRead < sizeToRead ) {
- ssize_t bytesRead =
- pread( cow.fhd, cowRequest->readBuffer + ( lastReadOffset - offset ), sizeToRead, localRead );
+ while ( totalBytesRead < numBytes ) {
+ ssize_t bytesRead = pread( cow.fdData, cowRequest->readBuffer + ( lastReadOffset - startOffset ),
+ numBytes - totalBytesRead, localRead + totalBytesRead );
if ( bytesRead == -1 ) {
cowRequest->errorCode = errno;
goto fail;
- } else if ( bytesRead <= 0 ) {
+ } else if ( bytesRead == 0 ) {
+ logadd( LOG_ERROR, "EOF for read at localRead=%"PRIu64", totalBR=%"PRIu64,
+ (uint64_t)localRead, (uint64_t)totalBytesRead );
+ logadd( LOG_ERROR, "searchOffset=%"PRIu64", endOffset=%"PRIu64", imageSize=%"PRIu64,
+						(uint64_t)searchOffset, (uint64_t)endOffset, metadata->imageSize );
cowRequest->errorCode = EIO;
goto fail;
}
totalBytesRead += bytesRead;
}
- atomic_fetch_add( &cowRequest->bytesWorkedOn, totalBytesRead );
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes );
+ } else {
+				assert( 4 == 6 ); // Unreachable: dataState is always local/remote/zero once a span is flushed
}
lastReadOffset = searchOffset;
- doRead = false;
- firstLoop = true;
+ flushCurrentSpan = false;
+			// Span handled; determine the source type anew for the next span
+ newSourceType = true;
}
-
- if ( updateBlock ) {
- if ( cow.l1[l1Index] != -1 ) {
- cluster = getL2Entry( l1Index, l2Index );
- } else {
- cluster = NULL;
+ if ( bitfieldOffset == COW_BITFIELD_SIZE * 8 ) {
+ // Advance to next cluster in current l2 table
+ bitfieldOffset = 0;
+ l2Index++;
+ if ( l2Index >= COW_L2_TABLE_SIZE ) {
+ // Advance to next l1 entry, reset l2 index
+ l2Index = 0;
+ l1Index++;
}
- updateBlock = false;
+ cluster = getL2Entry( l1Index, l2Index, false );
}
}
fail:;
if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) {
- if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn < size ) {
- logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d)", cowRequest->errorCode );
+ if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn != size ) {
+ logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d, workedOn: %"PRIu64", size: %"PRIu64")",
+ cowRequest->errorCode, (uint64_t)cowRequest->bytesWorkedOn, (uint64_t)size );
fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO );
} else {
fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn );
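
(Aside, not part of the patch: cowfile_read walks the request in dnbd3-block steps and coalesces consecutive blocks with the same data source into one span. The searchOffset recomputation above relies on the three-level addressing; a sketch of the assumed index helpers, whose real counterparts offsetToL1Index, offsetToL2Index and getBitfieldOffsetBit are defined earlier in cowfile.c:)

#include <sys/types.h>

// Illustration only; assumes COW_DATA_CLUSTER_SIZE == COW_BITFIELD_SIZE * 8 * DNBD3_BLOCK_SIZE
// and COW_FULL_L2_TABLE_DATA_SIZE == COW_L2_TABLE_SIZE * COW_DATA_CLUSTER_SIZE, with the
// constants taken from the project's headers.
static int sketchOffsetToL1Index( off_t offset )
{
	return (int)( offset / COW_FULL_L2_TABLE_DATA_SIZE );
}

static int sketchOffsetToL2Index( off_t offset )
{
	return (int)( ( offset % COW_FULL_L2_TABLE_DATA_SIZE ) / COW_DATA_CLUSTER_SIZE );
}

static int sketchGetBitfieldOffsetBit( off_t offset )
{
	return (int)( ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE );
}

Plugging these back into the formula, DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE reconstructs the absolute image offset of the next block to examine, which doubles as the exclusive end of the current span when it gets flushed.
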
diff --git a/src/fuse/cowfile.h b/src/fuse/cowfile.h
index a07469d..0f395de 100644
--- a/src/fuse/cowfile.h
+++ b/src/fuse/cowfile.h
@@ -24,9 +24,10 @@ _Static_assert( sizeof( atomic_int_least64_t ) == 8, "atomic_int_least64_t not 8
enum dataSource
{
- local,
- remote,
- zero
+ ds_invalid,
+ ds_local,
+ ds_remote,
+ ds_zero
};
#define COW_METADATA_HEADER_SIZE 320
@@ -36,19 +37,20 @@ typedef struct cowfile_metadata_header
atomic_uint_least64_t imageSize; // 8byte
int32_t version; // 4byte
int32_t blocksize; // 4byte
- uint64_t originalImageSize; // 8byte - the name implies this is the size of the image on the server, but apparently it changes if we truncate the image etc. better name?
- uint64_t metaDataStart; // 8byte
+ uint64_t validRemoteSize; // 8byte
+ uint32_t startL1; // 4byte
+ uint32_t startL2; // 4byte
int32_t bitfieldSize; // 4byte
int32_t nextL2; // 4byte
- atomic_uint_least64_t metadataFileSize; // 8byte
- atomic_uint_least64_t dataFileSize; // 8byte
+ atomic_int_least64_t metaSize; // 8byte
+ atomic_int_least64_t nextClusterOffset; // 8byte
uint64_t maxImageSize; // 8byte
uint64_t creationTime; // 8byte
char uuid[40]; // 40byte
char imageName[200]; // 200byte
} cowfile_metadata_header_t;
-_Static_assert(
- sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE, "cowfile_metadata_header is messed up" );
+_Static_assert( sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE,
+ "cowfile_metadata_header is messed up" );
#define COW_L2_ENTRY_SIZE 64
typedef struct cow_l2_entry
@@ -56,25 +58,25 @@ typedef struct cow_l2_entry
atomic_int_least64_t offset;
atomic_uint_least64_t timeChanged;
atomic_uint_least64_t uploads;
- atomic_char bitfield[COW_BITFIELD_SIZE];
+ atomic_uchar bitfield[COW_BITFIELD_SIZE];
} cow_l2_entry_t;
_Static_assert( sizeof( cow_l2_entry_t ) == COW_L2_ENTRY_SIZE, "cow_l2_entry_t is messed up" );
/**
* Open request for reading/writing the virtual image we expose.
- * TODO Please verify field comments
*/
typedef struct cow_request
{
size_t fuseRequestSize; // Number of bytes to be read/written
off_t fuseRequestOffset; // Absolute offset into the image, as seen by user space
- char *readBuffer; // Used only in read case?
- const char *writeBuffer; // Used only in write case?
- atomic_size_t bytesWorkedOn; // Used for ???
+ char *readBuffer; // Used only in read case
+ const char *writeBuffer; // Used only in write case
+	atomic_size_t bytesWorkedOn; // Used for tracking how many bytes we have touched (excluding padding etc.)
atomic_int workCounter; // How many pending sub requests (see below)
atomic_int errorCode; // For reporting back to fuse
- fuse_ino_t ino; // Inode of file, used for ???
- struct fuse_file_info *fi; // Used for ???
+	fuse_ino_t ino; // Inode of file (for reporting back to fuse; possibly not needed)
+	struct fuse_file_info *fi; // File info (for reporting back to fuse; possibly not needed)
+ //fuse_req_t req; // Fuse request
} cow_request_t;
typedef struct cow_sub_request cow_sub_request_t;
@@ -88,14 +90,14 @@ typedef void ( *cow_callback )( cow_sub_request_t *sRequest );
typedef struct cow_sub_request
{
size_t size; // size of this sub-request
- off_t inClusterOffset; // offset relative to!? cow-block? DNBD3 block? cluster?
- const char *writeSrc; // ???
- char *buffer; // ???
+ off_t inClusterOffset; // offset relative to the beginning of the cluster
+ const char *writeSrc; // pointer to the data of a write request which needs padding
+	char *buffer; // Points into the original read buffer, at the position where the data of this sub-request is to be copied
cow_l2_entry_t *block; // the cluster inClusterOffset refers to
cow_callback callback; // Callback when we're done handling this
cow_request_t *cowRequest; // parent request
dnbd3_async_t dRequest; // Probably request to dnbd3-server for non-aligned writes (wrt 4k dnbd3 block)
- char writeBuffer[]; // ???
+	char writeBuffer[]; // Buffer for a padding write request; filled from a remote read, then the writeSrc data is copied into it
} cow_sub_request_t;
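
(Aside, not part of the patch: writeSrc/writeBuffer implement padding of unaligned writes. A dnbd3 block that is only partially covered by a write must first be completed with remote data before it can be stored in the data file. A hypothetical sketch of that completion step, assuming the cow_sub_request_t definition above; the real callback in cowfile.c may be structured differently:)

#include <string.h>

// Hypothetical completion step once the remote read has filled writeBuffer with
// the original content of the padded block. Names and the exact offset handling
// are assumptions for illustration.
static void sketchPadCompleted( cow_sub_request_t *sub )
{
	// Overlay the caller's unaligned data on the remote content. Where exactly
	// writeSrc lands inside writeBuffer depends on how the padding request was
	// set up (assumed here: at the very start of the padded range).
	memcpy( sub->writeBuffer, sub->writeSrc, sub->size );
	// Afterwards the fully populated block can be written to the data file at
	// sub->block->offset plus the block-aligned in-cluster offset, and its bit
	// in sub->block->bitfield is set.
}
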
typedef struct cow_curl_read_upload
@@ -103,18 +105,18 @@ typedef struct cow_curl_read_upload
atomic_uint_least64_t time;
cow_l2_entry_t *block;
size_t position;
- long unsigned int blocknumber;
+ long unsigned int clusterNumber;
int fails;
int64_t ulLast;
+	atomic_uchar bitfield[COW_BITFIELD_SIZE]; // Snapshot of the cluster's bitfield for this upload (presumably to detect writes that happen while uploading)
} cow_curl_read_upload_t;
-typedef struct cow_block_upload_statistics
+typedef struct cow_cluster_statistics
{
- uint64_t blocknumber;
+ uint64_t clusterNumber;
uint64_t uploads;
-} cow_block_upload_statistics_t;
-
+} cow_cluster_statistics_t;
typedef int32_t l1;
typedef cow_l2_entry_t l2[COW_L2_TABLE_SIZE];
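
(Aside, not part of the patch: the l1/l2 typedefs back the two-level lookup used throughout cowfile.c. A sketch of the assumed logic behind getL2Entry, based on the removed cow.l1[l1Index] != -1 checks in cowfile.c; cow.l2 as a flat array of L2 tables is an assumption derived from the typedefs, and the real implementation may allocate on demand when its third parameter is true:)

// Illustration only: -1 in the L1 table means "no L2 table allocated yet".
static cow_l2_entry_t *sketchGetL2Entry( int l1Index, int l2Index )
{
	if ( cow.l1[l1Index] == -1 )
		return NULL; // Cluster never written; data comes from remote, or is zero past the image end
	return &cow.l2[cow.l1[l1Index]][l2Index];
}
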
diff --git a/src/fuse/main.c b/src/fuse/main.c
index 6e7977c..96d8f5c 100644
--- a/src/fuse/main.c
+++ b/src/fuse/main.c
@@ -329,7 +329,7 @@ static void printUsage( char *argv0, int exitCode )
printf( "\n" );
printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
- printf( "For cow: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] -c <path> -C <cowServerAddress> -m [--cowStatStdout] [--cowStatFile] <mountPoint>\n", argv0 );
+ printf( "For cow: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] -c <path> -C <cowServerAddress> -m [--cow-stats-stdout] [--cow-stats-file] <mountPoint>\n", argv0 );
printf( " -d --debug Don't fork, write stats file, and print debug output (fuse -> stderr, dnbd3 -> stdout)\n" );
printf( " -f Don't fork (dnbd3 -> stdout)\n" );
printf( " -h --host List of space separated hosts to use\n" );
@@ -342,12 +342,13 @@ static void printUsage( char *argv0, int exitCode )
printf( " -c Enables cow, creates the cow files at given location\n" );
printf( " -L Loads the cow files from the given location\n" );
printf( " -C Host address of the cow server\n" );
- printf( " --cowStatStdout prints the cow status in stdout\n" );
- printf( " --cowStatFile creates and updates the cow status file\n" );
+ printf( "--cow-stats-stdout prints the cow status in stdout\n" );
+ printf( "--cow-stats-file creates and updates the cow status file\n" );
+ printf( " -m --merge tell server to merge and create new revision on exit\n" );
exit( exitCode );
}
-static const char *optString = "dfHh:i:l:o:r:SsVvc:L:C:mxy";
+static const char *optString = "dfHh:i:l:o:r:SsVvc:L:C:m";
static const struct option longOpts[] = {
{ "debug", no_argument, NULL, 'd' },
{ "help", no_argument, NULL, 'H' },
@@ -362,8 +363,8 @@ static const struct option longOpts[] = {
{ "loadcow", required_argument, NULL, 'L' },
{ "cowServer", required_argument, NULL, 'C' },
{ "merge", no_argument, NULL, 'm' },
- { "cowStatStdout", no_argument, NULL, 'x' },
- { "cowStatFile", no_argument, NULL, 'y' },
+ { "cow-stats-stdout", no_argument, NULL, 'sout' },
+ { "cow-stats-file", no_argument, NULL, 'sfil' },
{ 0, 0, 0, 0 }
};
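
(Aside, not part of the patch: 'sout' and 'sfil' are multi-character character constants. They work as long-only option values because getopt_long simply returns the val field, but their numeric value is implementation-defined. A more conventional sketch uses explicit constants above the single-character range:)

// Hypothetical alternative: explicit values that cannot collide with short options.
enum {
	OPT_COW_STATS_STDOUT = 1000,
	OPT_COW_STATS_FILE,
};
// { "cow-stats-stdout", no_argument, NULL, OPT_COW_STATS_STDOUT },
// { "cow-stats-file",   no_argument, NULL, OPT_COW_STATS_FILE },
// ...with matching case OPT_COW_STATS_STDOUT: / case OPT_COW_STATS_FILE: labels in main().
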
@@ -467,10 +468,10 @@ int main( int argc, char *argv[] )
useCow = true;
loadCow = true;
break;
- case 'x':
+ case 'sout':
sStdout = true;
break;
- case 'y':
+ case 'sfil':
sFile = true;
break;
default: