Diffstat (limited to 'src/fuse/cowfile.c')
-rw-r--r-- | src/fuse/cowfile.c | 928
1 file changed, 487 insertions, 441 deletions
diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c index 8e816a2..a53b101 100644 --- a/src/fuse/cowfile.c +++ b/src/fuse/cowfile.c @@ -10,10 +10,13 @@ #include <curl/curl.h> #define UUID_STRLEN 36 +// Maximum assumed page size, in case the cow data gets transferred between different architectures +// 16k should be the largest minimum in existence (Itanium) +#define MAX_PAGE_SIZE 16384 extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ); -static const int CURRENT_COW_VERSION = 1; +static const int CURRENT_COW_VERSION = 2; static bool statStdout; static bool statFile; @@ -30,18 +33,51 @@ atomic_bool uploadLoopDone = false; // Upload loop has finished all work? static struct cow { - pthread_mutex_t l2CreateLock; - int fhm; - int fhd; - int fhs; char *metadata_mmap; l1 *l1; - l2 *firstL2; - size_t maxImageSize; - size_t l1Size; //size of l1 array - + l2 *l2; + int fdMeta; + int fdData; + int fdStats; + pthread_mutex_t l2CreateLock; } cow; +static int countOneBits( atomic_uchar *bf, int numBytes ) +{ + int bitCount = 0; + for ( int i = 0; i < numBytes; ++i ) { + unsigned char value = bf[i]; + while ( value > 0 ) { + if ( ( value & 1 ) == 1 ) { + bitCount++; + } + value >>= 1; + } + } + return bitCount; +} + +#define IS_4K_ALIGNED(v) ( ( (uint64_t)(v) & DNBD3_BLOCK_MASK ) == 0 ) + +static bool writeAll( int fd, const char *buf, size_t count, off_t offset ) +{ + while ( count > 0 ) { + ssize_t ret = pwrite( fd, buf, count, offset ); + if ( ret == (ssize_t)count ) + return true; + if ( ret == -1 ) { + if ( errno == EINTR ) + continue; + return false; + } + if ( ret == 0 ) + return false; + count -= ret; + buf += ret; + } + return true; +} + /** * @brief Computes the l1 index for an absolute file offset * @@ -83,7 +119,7 @@ static int getBitfieldOffsetBit( size_t offset ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBits( atomic_char *byte, int from, int to, bool value ) +static void setBits( atomic_uchar *byte, int64_t from, int64_t to, bool value ) { char mask = (char)( ( 255 >> ( 7 - ( to - from ) ) ) << from ); if ( value ) { @@ -101,13 +137,13 @@ static void setBits( atomic_char *byte, int from, int to, bool value ) * @param to end bit * @param value set bits to 1 or 0 */ -static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool value ) +static void setBitsInBitfield( atomic_uchar *bitfield, int64_t from, int64_t to, bool value ) { - assert( from >= 0 || to < COW_BITFIELD_SIZE * 8 ); - int start = from / 8; - int end = to / 8; + assert( from >= 0 && to < COW_BITFIELD_SIZE * 8 ); + int64_t start = from / 8; + int64_t end = to / 8; - for ( int i = start; i <= end; i++ ) { + for ( int64_t i = start; i <= end; i++ ) { setBits( ( bitfield + i ), from - i * 8, MIN( 7, to - i * 8 ), value ); from = ( i + 1 ) * 8; } @@ -119,9 +155,9 @@ static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool val * @param bitfield of a cow_l2_entry * @param n the bit which should be checked */ -static bool checkBit( atomic_char *bitfield, int n ) +static bool checkBit( atomic_uchar *bitfield, int64_t n ) { - return ( atomic_load( ( bitfield + ( n / 8 ) ) ) >> ( n % 8 ) ) & 1; + return ( bitfield[n / 8] >> ( n % 8 ) ) & 1; } @@ -225,32 +261,50 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void * cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata; size_t len = 0; // Check if we're still in the bitfield - if ( uploadBlock->position < (size_t)metadata->bitfieldSize 
) { - size_t lenCpy = MIN( metadata->bitfieldSize - uploadBlock->position, size * nmemb ); - memcpy( ptr, uploadBlock->block->bitfield + uploadBlock->position, lenCpy ); + if ( uploadBlock->position < COW_BITFIELD_SIZE ) { + size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb ); + memcpy( ptr + uploadBlock->position, uploadBlock->bitfield + uploadBlock->position, + lenCpy ); uploadBlock->position += lenCpy; len += lenCpy; } // No elseif here, might just have crossed over... - if ( uploadBlock->position >= (size_t)metadata->bitfieldSize ) { - ssize_t wantRead = (ssize_t)MIN( - COW_DATA_CLUSTER_SIZE - ( uploadBlock->position - ( metadata->bitfieldSize ) ), - ( size * nmemb ) - len ); - off_t inClusterOffset = uploadBlock->position - metadata->bitfieldSize; - ssize_t lengthRead = pread( cow.fhd, ( ptr + len ), wantRead, uploadBlock->block->offset + inClusterOffset ); - if ( lengthRead == -1 ) { - logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); - return CURL_READFUNC_ABORT; - } - - if ( wantRead > lengthRead ) { - // fill up since last block may not be a full block - memset( ptr + len + lengthRead, 0, wantRead - lengthRead ); - // TODO what about partial read? We should know how much data there actually is... - lengthRead = wantRead; + if ( uploadBlock->position >= COW_BITFIELD_SIZE ) { + // Subtract the bitfield size from everything first + off_t inClusterOffset = uploadBlock->position - COW_BITFIELD_SIZE; + ssize_t spaceLeft = ( size * nmemb ) - len; + // Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since + // we do a lot of 4k/32k reads, but it's not that performance critical I guess... + while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) { + int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE ); + size_t readSize; + // Small performance hack: All bits one in a byte, do a 32k instead of 4k read + if ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE * 8 + && bitNumber % 8 == 0 + && uploadBlock->bitfield[bitNumber / 8] == 0xff ) { + readSize = DNBD3_BLOCK_SIZE * 8; + } else { + readSize = DNBD3_BLOCK_SIZE; + } + // Check bits in our copy, as global bitfield could change + if ( checkBit( uploadBlock->bitfield, bitNumber ) ) { + ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize, + uploadBlock->block->offset + inClusterOffset ); + if ( lengthRead == -1 ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); + return CURL_READFUNC_ABORT; + } + if ( lengthRead != (ssize_t)readSize ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with short read (%d/%d)", + (int)lengthRead, (int)readSize ); + return CURL_READFUNC_ABORT; + } + len += lengthRead; + spaceLeft -= lengthRead; + } + inClusterOffset += readSize; + uploadBlock->position += readSize; } - uploadBlock->position += lengthRead; - len += lengthRead; } return len; } @@ -280,7 +334,7 @@ bool mergeRequest() part = curl_mime_addpart( mime ); curl_mime_name( part, "originalFileSize" ); char buf[21]; - snprintf( buf, sizeof buf, "%" PRIu64, metadata->originalImageSize ); + snprintf( buf, sizeof buf, "%" PRIu64, metadata->validRemoteSize ); curl_mime_data( part, buf, CURL_ZERO_TERMINATED ); part = curl_mime_addpart( mime ); @@ -341,15 +395,15 @@ int progress_callback( void *clientp, __attribute__((unused)) curl_off_t dlTotal __attribute__((unused)) curl_off_t dlNow, __attribute__((unused)) curl_off_t ulTotal, curl_off_t ulNow ) { CURL *eh = (CURL 
*)clientp; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; - res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &curlUploadBlock ); + res = curl_easy_getinfo( eh, CURLINFO_PRIVATE, &uploadingCluster ); if ( res != CURLE_OK ) { logadd( LOG_ERROR, "ERROR" ); return 0; } - bytesUploaded += ( ulNow - curlUploadBlock->ulLast ); - curlUploadBlock->ulLast = ulNow; + bytesUploaded += ( ulNow - uploadingCluster->ulLast ); + uploadingCluster->ulLast = ulNow; return 0; } @@ -381,7 +435,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha "modifiedClusters=%" PRIu64 "\n" "idleClusters=%" PRIu64 "\n" "totalClustersUploaded=%" PRIu64 "\n" - "activeUploads=:%i\n" + "activeUploads=%i\n" "%s%s", state, inQueue, modified, idle, totalBlocksUploaded, activeUploads, COW_SHOW_UL_SPEED ? "ulspeed=" : "", @@ -398,9 +452,10 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha if ( statFile ) { // Pad with a bunch of newlines so we don't change the file size all the time - ssize_t extra = MIN( 20, sizeof(buffer) - len - 1 ); + ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 ); memset( buffer + len, '\n', extra ); - if ( pwrite( cow.fhs, buffer, len + extra, 43 ) != len ) { + lseek( cow.fdStats, 43, SEEK_SET ); + if ( write( cow.fdStats, buffer, len + extra ) != len ) { logadd( LOG_WARNING, "Could not update cow status file" ); } #ifdef COW_DUMP_BLOCK_UPLOADS @@ -412,7 +467,7 @@ void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, cha } int cmpfunc( const void *a, const void *b ) { - return (int)( ( (cow_block_upload_statistics_t *)b )->uploads - ( (cow_block_upload_statistics_t *)a )->uploads ); + return (int)( ( (cow_cluster_statistics_t *)b )->uploads - ( (cow_cluster_statistics_t *)a )->uploads ); } /** * @brief Writes all block numbers sorted by the number of uploads into the statsfile. 
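The helpers added at the top of this patch (countOneBits(), checkBit(), setBitsInBitfield(), writeAll()) and the reworked curlReadCallbackUploadBlock() make cluster uploads sparse: only the 4 KiB blocks whose bit is set in the cluster's bitfield are read from the data file and transmitted, and addUpload() further down sizes the POST body as COW_BITFIELD_SIZE plus DNBD3_BLOCK_SIZE times the number of set bits. The standalone sketch below illustrates that accounting; it is not part of the patch, and BLOCK_SIZE/BITFIELD_BYTES are illustrative stand-ins for the real header definitions.

/* Sketch only: how the per-cluster bitfield sizes a sparse upload.
 * BLOCK_SIZE and BITFIELD_BYTES are illustrative stand-ins for
 * DNBD3_BLOCK_SIZE and COW_BITFIELD_SIZE from the cow headers. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE     4096u                                 /* one dnbd3 block        */
#define BITFIELD_BYTES 1024u                                 /* one bit per block      */
#define CLUSTER_SIZE   ( BLOCK_SIZE * BITFIELD_BYTES * 8u )  /* data bytes per cluster */

/* Same idea as countOneBits() in the patch: number of locally written blocks. */
static unsigned countOneBits( const unsigned char *bf, size_t numBytes )
{
	unsigned bits = 0;
	for ( size_t i = 0; i < numBytes; ++i ) {
		unsigned char v = bf[i];
		while ( v ) {
			bits += v & 1;
			v >>= 1;
		}
	}
	return bits;
}

/* Same idea as checkBit(): was block n of this cluster written locally? */
static bool checkBit( const unsigned char *bf, size_t n )
{
	return ( bf[n / 8] >> ( n % 8 ) ) & 1;
}

int main( void )
{
	unsigned char bitfield[BITFIELD_BYTES] = { 0 };
	/* Pretend blocks 0..7 and block 100 were written to the COW data file. */
	bitfield[0] = 0xff;
	bitfield[100 / 8] |= (unsigned char)( 1u << ( 100 % 8 ) );

	/* Mirrors what addUpload() passes as CURLOPT_POSTFIELDSIZE_LARGE:
	 * the bitfield itself plus only the blocks whose bit is set. */
	uint64_t postSize = BITFIELD_BYTES + (uint64_t)BLOCK_SIZE * countOneBits( bitfield, BITFIELD_BYTES );
	printf( "upload body: %llu bytes instead of %llu\n",
			(unsigned long long)postSize, (unsigned long long)( BITFIELD_BYTES + CLUSTER_SIZE ) );

	/* The read callback walks the cluster block by block, skipping unset bits;
	 * eight consecutive set bits allow one 32 KiB read instead of eight 4 KiB reads. */
	for ( size_t bit = 0; bit < (size_t)BITFIELD_BYTES * 8; ++bit ) {
		if ( !checkBit( bitfield, bit ) )
			continue; /* never read from disk, never transmitted */
		/* a pread() from cluster->offset + bit * BLOCK_SIZE would happen here */
	}
	return 0;
}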
@@ -422,26 +477,25 @@ void dumpBlockUploads() { long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); - cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; + cow_cluster_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; uint64_t currentBlock = 0; for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { if ( cow.l1[l1Index] == -1 ) { continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); blockUploads[currentBlock].uploads = block->uploads; - blockUploads[currentBlock].blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + blockUploads[currentBlock].clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); currentBlock++; } } - qsort( blockUploads, currentBlock, sizeof( cow_block_upload_statistics_t ), cmpfunc ); - lseek( cow.fhs, 0, SEEK_END ); + qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc ); - dprintf( cow.fhs, "\n\nblocknumber: uploads\n==Block Upload Dump===\n" ); + dprintf( cow.fdStats, "\n\nclusterNumber: uploads\n==Block Upload Dump===\n" ); for ( uint64_t i = 0; i < currentBlock; i++ ) { - dprintf( cow.fhs, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].blocknumber, blockUploads[i].uploads ); + dprintf( cow.fdStats, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].clusterNumber, blockUploads[i].uploads ); } } @@ -449,29 +503,32 @@ void dumpBlockUploads() * @brief Starts the upload of a given block. * * @param cm Curl_multi - * @param curlUploadBlock containing the data for the block to upload. + * @param uploadingCluster containing the data for the block to upload. */ -bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_slist *headers ) +bool addUpload( CURLM *cm, cow_curl_read_upload_t *uploadingCluster, struct curl_slist *headers ) { CURL *eh = curl_easy_init(); char url[COW_URL_STRING_SIZE]; - snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, curlUploadBlock->blocknumber ); + snprintf( url, COW_URL_STRING_SIZE, COW_API_UPDATE, cowServerAddress, metadata->uuid, uploadingCluster->clusterNumber ); curl_easy_setopt( eh, CURLOPT_URL, url ); curl_easy_setopt( eh, CURLOPT_POST, 1L ); curl_easy_setopt( eh, CURLOPT_READFUNCTION, curlReadCallbackUploadBlock ); - curl_easy_setopt( eh, CURLOPT_READDATA, (void *)curlUploadBlock ); - curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)curlUploadBlock ); + curl_easy_setopt( eh, CURLOPT_READDATA, (void *)uploadingCluster ); + curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)uploadingCluster ); // min upload speed of 1kb/s over 10 sec otherwise the upload is canceled. 
curl_easy_setopt( eh, CURLOPT_LOW_SPEED_TIME, 10L ); curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L ); - curl_easy_setopt( - eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_DATA_CLUSTER_SIZE ) ); + curl_easy_setopt( eh, CURLOPT_POSTFIELDSIZE_LARGE, + (long)( COW_BITFIELD_SIZE + + DNBD3_BLOCK_SIZE * countOneBits( uploadingCluster->bitfield, COW_BITFIELD_SIZE ) ) + ); + if ( COW_SHOW_UL_SPEED ) { - curlUploadBlock->ulLast = 0; + uploadingCluster->ulLast = 0; curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L ); curl_easy_setopt( eh, CURLOPT_XFERINFOFUNCTION, progress_callback ); curl_easy_setopt( eh, CURLOPT_XFERINFODATA, eh ); @@ -495,35 +552,35 @@ bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_ bool finishUpload( CURLM *cm, CURLMsg *msg, struct curl_slist *headers ) { bool status = true; - cow_curl_read_upload_t *curlUploadBlock; + cow_curl_read_upload_t *uploadingCluster; CURLcode res; CURLcode res2; - res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &curlUploadBlock ); + res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &uploadingCluster ); long http_code = 0; res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code ); if ( res != CURLE_OK || res2 != CURLE_OK || http_code < 200 || http_code >= 300 || msg->msg != CURLMSG_DONE ) { - curlUploadBlock->fails++; - logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", curlUploadBlock->fails, + uploadingCluster->fails++; + logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", uploadingCluster->fails, curl_easy_strerror( msg->data.result ) ); - if ( curlUploadBlock->fails <= 5 ) { - addUpload( cm, curlUploadBlock, headers ); + if ( uploadingCluster->fails < 5 ) { + addUpload( cm, uploadingCluster, headers ); goto CLEANUP; } - free( curlUploadBlock ); + free( uploadingCluster ); status = false; goto CLEANUP; } // everything went ok, update timeChanged - atomic_compare_exchange_strong( &curlUploadBlock->block->timeChanged, &curlUploadBlock->time, 0 ); + atomic_compare_exchange_strong( &uploadingCluster->block->timeChanged, &uploadingCluster->time, 0 ); - curlUploadBlock->block->uploads++; + uploadingCluster->block->uploads++; totalBlocksUploaded++; - free( curlUploadBlock ); + free( uploadingCluster ); CLEANUP: curl_multi_remove_handle( cm, msg->easy_handle ); curl_easy_cleanup( msg->easy_handle ); @@ -593,7 +650,7 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) } // Now all L2 blocks for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; // Not allocated } @@ -613,10 +670,15 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) && activeUploads > 0 ); cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) ); b->block = block; - b->blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + b->clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); b->fails = 0; b->position = 0; b->time = block->timeChanged; + // Copy, so it doesn't change during upload + // when we assemble the data in curlReadCallbackUploadBlock() + for ( int i = 0; i < COW_BITFIELD_SIZE; ++i ) { + b->bitfield[i] = block->bitfield[i]; + } addUpload( cm, b, headers ); if ( !ignoreMinUploadDelay && !uploadLoop ) { goto DONE; @@ -637,7 +699,7 @@ DONE: * */ -void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) +void *cowfile_statUpdater( 
__attribute__((unused)) void *something ) { uint64_t lastUpdateTime = time( NULL ); @@ -653,7 +715,7 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) continue; } for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; } @@ -738,12 +800,12 @@ static bool createCowStatsFile( char *path ) logadd( LOG_INFO, "%s", buffer ); } if ( statFile ) { - if ( ( cow.fhs = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdStats = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow status file. Bye.\n" ); return false; } - if ( pwrite( cow.fhs, buffer, len, 0 ) != len ) { + if ( pwrite( cow.fdStats, buffer, len, 0 ) != len ) { logadd( LOG_ERROR, "Could not write to cow status file. Bye.\n" ); return false; } @@ -770,67 +832,72 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" ); snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow meta file. Bye.\n %s \n", pathMeta ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not create cow data file. Bye.\n" ); return false; } - - int maxPageSize = 8192; + struct stat fs; + if ( fstat( cow.fdData, &fs ) == -1 || fs.st_size != 0 ) { + logadd( LOG_ERROR, "/data file already exists and is not empty" ); + return false; + } size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t ); - cow.maxImageSize = COW_MAX_IMAGE_SIZE; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); + // Calculate how many full l2 tables we need to address COW_MAX_IMAGE_SIZE + size_t l1NumEntries = ( ( COW_MAX_IMAGE_SIZE + COW_FULL_L2_TABLE_DATA_SIZE - 1 ) + / COW_FULL_L2_TABLE_DATA_SIZE ); + // Make sure l1 and l2 are aligned to struct size + size_t sizeL1 = sizeof(cow.l1[0]); + size_t sizeL2 = sizeof(cow.l2[0]); + size_t startL1 = ( ( metaDataSizeHeader + sizeL1 - 1 ) / sizeL1 ) * sizeL1; + size_t startL2 = ( ( startL1 + l1NumEntries * sizeL1 + sizeL2 - 1 ) / sizeL2 ) * sizeL2; // size of l1 array + number of l2's * size of l2 - size_t metadata_size = cow.l1Size * sizeof( l1 ) + cow.l1Size * sizeof( l2 ); + size_t ps = getpagesize(); + size_t metaSize = ( ( startL2 + l1NumEntries * sizeof( l2 ) + ps - 1 ) / ps ) * ps; - // compute next fitting multiple of getpagesize() - size_t meta_data_start = ( ( metaDataSizeHeader + maxPageSize - 1 ) / maxPageSize ) * maxPageSize; - - size_t metadataFileSize = meta_data_start + metadata_size; - if ( ftruncate( cow.fhm, metadataFileSize ) != 0 ) { + if ( ftruncate( cow.fdMeta, metaSize ) != 0 ) { logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). 
Bye.\n", errno ); return false; } - cow.metadata_mmap = mmap( NULL, metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mmap()ing meta data, errno=%d", errno ); return false; } metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); metadata->magicValue = COW_FILE_META_MAGIC_VALUE; + metadata->imageSize = **imageSizePtr; metadata->version = CURRENT_COW_VERSION; - metadata->dataFileSize = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); - metadata->metadataFileSize = ATOMIC_VAR_INIT( metadataFileSize ); - metadata->blocksize = DNBD3_BLOCK_SIZE; - metadata->originalImageSize = **imageSizePtr; - metadata->imageSize = metadata->originalImageSize; - metadata->creationTime = time( NULL ); - *imageSizePtr = &metadata->imageSize; - metadata->metaDataStart = meta_data_start; + metadata->validRemoteSize = **imageSizePtr; + metadata->startL1 = (uint32_t)startL1; + metadata->startL2 = (uint32_t)startL2; metadata->bitfieldSize = COW_BITFIELD_SIZE; - metadata->maxImageSize = cow.maxImageSize; - snprintf( metadata->imageName, 200, "%s", image_Name ); - cow.l1 = (l1 *)( cow.metadata_mmap + meta_data_start ); metadata->nextL2 = 0; + metadata->metaSize = ATOMIC_VAR_INIT( metaSize ); + metadata->nextClusterOffset = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); + metadata->maxImageSize = COW_MAX_IMAGE_SIZE; + metadata->creationTime = time( NULL ); + snprintf( metadata->imageName, 200, "%s", image_Name ); - for ( size_t i = 0; i < cow.l1Size; i++ ) { + cow.l1 = (l1 *)( cow.metadata_mmap + startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + startL2 ); + for ( size_t i = 0; i < l1NumEntries; i++ ) { cow.l1[i] = -1; } - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); // write header to data file uint64_t header = COW_FILE_DATA_MAGIC_VALUE; - if ( pwrite( cow.fhd, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + if ( pwrite( cow.fdData, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" ); return false; } @@ -848,6 +915,7 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, return false; } createCowStatsFile( path ); + *imageSizePtr = &metadata->imageSize; return true; } @@ -871,11 +939,11 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdMeta = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fdData = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow data file. 
Bye.\n" ); return false; } @@ -885,7 +953,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server size_t sizeToRead = sizeof( cowfile_metadata_header_t ); size_t readBytes = 0; while ( readBytes < sizeToRead ) { - ssize_t bytes = pread( cow.fhm, ( ( &header ) + readBytes ), sizeToRead, 0 ); + ssize_t bytes = pread( cow.fdMeta, ( ( &header ) + readBytes ), sizeToRead - readBytes, 0 ); if ( bytes <= 0 ) { logadd( LOG_ERROR, "Error while reading meta file header. Bye.\n" ); return false; @@ -902,44 +970,55 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server logadd( LOG_ERROR, "cow meta file of unkown format. Bye.\n" ); return false; } + + if ( header.bitfieldSize != COW_BITFIELD_SIZE ) { + logadd( LOG_ERROR, "cow meta file has unexpected bitfield size %d", (int)header.bitfieldSize ); + return false; + } + if ( header.startL1 >= header.startL2 || header.startL2 >= header.metaSize ) { + logadd( LOG_ERROR, "l1/l2 offset messed up in metadata." ); + return false; + } + struct stat st; - fstat( cow.fhm, &st ); - if ( st.st_size < (off_t)( header.metaDataStart + header.nextL2 * sizeof( l2 ) ) ) { - logadd( LOG_ERROR, "cow meta file to small. Bye.\n" ); + fstat( cow.fdMeta, &st ); + if ( st.st_size < (off_t)header.metaSize ) { + logadd( LOG_ERROR, "cow meta file too small. Bye." ); return false; } } { uint64_t magicValueDataFile; - if ( pread( cow.fhd, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { - logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye.\n" ); + if ( pread( cow.fdData, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) { + logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye." ); return false; } if ( magicValueDataFile != COW_FILE_DATA_MAGIC_VALUE ) { if ( __builtin_bswap64( magicValueDataFile ) == COW_FILE_DATA_MAGIC_VALUE ) { - logadd( LOG_ERROR, "cow data file of wrong endianess. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of wrong endianess. Bye." ); return false; } - logadd( LOG_ERROR, "cow data file of unkown format. Bye.\n" ); + logadd( LOG_ERROR, "cow data file of unkown format. Bye." ); return false; } struct stat st; - fstat( cow.fhd, &st ); - if ( (off_t)header.dataFileSize > st.st_size ) { - logadd( LOG_ERROR, "cow data file to small. Bye.\n" ); + fstat( cow.fdData, &st ); // add cluster size, since we don't preallocate + if ( header.nextClusterOffset > st.st_size + (int)COW_DATA_CLUSTER_SIZE ) { + logadd( LOG_ERROR, "cow data file too small. Expected=%jd, Is=%jd.", + (intmax_t)header.nextClusterOffset, (intmax_t)st.st_size ); return false; } } - cow.metadata_mmap = mmap( NULL, header.metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); + cow.metadata_mmap = mmap( NULL, header.metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 ); if ( cow.metadata_mmap == MAP_FAILED ) { - logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); + logadd( LOG_ERROR, "Error while mapping mmap, errno=%d.", errno ); return false; } if ( header.version != CURRENT_COW_VERSION ) { - logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.\n", + logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. 
Bye.", metadata->version, CURRENT_COW_VERSION ); return false; } @@ -948,11 +1027,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); *imageSizePtr = &metadata->imageSize; - cow.l1 = (l1 *)( cow.metadata_mmap + metadata->metaDataStart ); - cow.maxImageSize = metadata->maxImageSize; - cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); - - cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); + cow.l1 = (l1 *)( cow.metadata_mmap + metadata->startL1 ); + cow.l2 = (l2 *)( cow.metadata_mmap + metadata->startL2 ); pthread_mutex_init( &cow.l2CreateLock, NULL ); createCowStatsFile( path ); return true; @@ -961,8 +1037,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server * @brief Starts the cow BackgroundThreads which are needed for stats and data upload * */ -bool cowfile_startBackgroundThreads() { - +bool cowfile_startBackgroundThreads() +{ if( pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL ) != 0 ) { logadd( LOG_ERROR, "Could not create cow uploader thread"); return false; @@ -977,55 +1053,15 @@ bool cowfile_startBackgroundThreads() { } /** - * @brief writes the given data in the data file - * - * @param buffer containing the data - * @param size of the buffer - * @param netSize which actually contributes to the fuse write request (can be different from size if partial full blocks are written) - * @param cowRequest <---- !???? TODO - * @param block block being written to - * @param inClusterOffset offset in this cluster to be written to - */ -static void writeData( const char *buffer, ssize_t size, size_t netSize, atomic_int *errorCode, - atomic_size_t *bytesWorkedOn, cow_l2_entry_t *block, off_t inClusterOffset ) -{ - // TODO: Assert that size + inClusterOffset <= COW_DATA_CLUSTER_SIZE? - ssize_t totalBytesWritten = 0; - while ( totalBytesWritten < size ) { - ssize_t bytesWritten = pwrite( cow.fhd, ( buffer + totalBytesWritten ), size - totalBytesWritten, - block->offset + inClusterOffset + totalBytesWritten ); - if ( bytesWritten == -1 ) { - *errorCode = errno; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } else if ( bytesWritten == 0 ) { - *errorCode = EIO; - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); - break; - } - totalBytesWritten += bytesWritten; - } - atomic_fetch_add( bytesWorkedOn, netSize ); - setBitsInBitfield( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), - (int)( ( inClusterOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 ); - - block->timeChanged = time( NULL ); -} - -/** - * @brief Increases the metadata->dataFileSize by COW_DATA_CLUSTER_SIZE. - * The space is not reserved on disk. - * - * @param block for which the space should be reserved. + * Check if block at given offset is local, i.e. has been modified. 
+ * @param meta The cow_l2_entry for the according cluster MUST be provided + * @param offset offset of data, can be absolute image offset as it will be transformed into cluster offset */ -static bool allocateMetaBlockData( cow_l2_entry_t *block ) +static bool isBlockLocal( cow_l2_entry_t *meta, off_t offset ) { - block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_DATA_CLUSTER_SIZE ); - return true; + if ( meta == NULL ) + return false; + return checkBit( meta->bitfield, ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE ); } /** @@ -1036,34 +1072,38 @@ static bool allocateMetaBlockData( cow_l2_entry_t *block ) * @param l2Index * @return cow_l2_entry_t* */ -static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index ) +static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index, bool create ) { - cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); + if ( cow.l1[l1Index] == -1 ) + return NULL; + cow_l2_entry_t *block = cow.l2[cow.l1[l1Index]] + l2Index; if ( block->offset == -1 ) { - allocateMetaBlockData( block ); + if ( !create ) + return NULL; + block->offset = atomic_fetch_add( &metadata->nextClusterOffset, COW_DATA_CLUSTER_SIZE ); } return block; } /** - * @brief creates an new L2 Block and initializes the containing cow_l2_entry_t blocks + * @brief creates an new L2 table and initializes the containing cow_l2_entry_t * * @param l1Index */ -static bool createL2Block( int l1Index ) +static bool createL2Table( int l1Index ) { pthread_mutex_lock( &cow.l2CreateLock ); if ( cow.l1[l1Index] == -1 ) { + int idx = metadata->nextL2++; for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) { - cow.firstL2[metadata->nextL2][i].offset = -1; - cow.firstL2[metadata->nextL2][i].timeChanged = ATOMIC_VAR_INIT( 0 ); - cow.firstL2[metadata->nextL2][i].uploads = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].offset = -1; + cow.l2[idx][i].timeChanged = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].uploads = ATOMIC_VAR_INIT( 0 ); for ( int j = 0; j < COW_BITFIELD_SIZE; j++ ) { - cow.firstL2[metadata->nextL2][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); + cow.l2[idx][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); } } - cow.l1[l1Index] = metadata->nextL2; - metadata->nextL2 += 1; + cow.l1[l1Index] = idx; } pthread_mutex_unlock( &cow.l2CreateLock ); return true; @@ -1080,13 +1120,19 @@ static bool createL2Block( int l1Index ) static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) { + if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) != 1 ) + return; // More sub-requests are pending, bail out if ( cowRequest->errorCode != 0 ) { fuse_reply_err( req, cowRequest->errorCode ); - } else { - uint64_t oldSize = metadata->imageSize; - uint64_t ns = MAX( oldSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset ); - atomic_compare_exchange_strong( &metadata->imageSize, &oldSize, ns ); + uint64_t newSize = cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset; + if ( newSize > metadata->imageSize ) { + uint64_t oldSize; + do { + oldSize = metadata->imageSize; + newSize = MAX( oldSize, newSize ); + } while ( !atomic_compare_exchange_weak( &metadata->imageSize, &oldSize, newSize ) ); + } fuse_reply_write( req, cowRequest->bytesWorkedOn ); } free( cowRequest ); @@ -1100,67 +1146,104 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) */ static void writePaddedBlock( cow_sub_request_t *sRequest ) { - //copy write Data - // TODO Assert that we have enough space in writeBuffer at that offset - memcpy( ( sRequest->writeBuffer + ( sRequest->inClusterOffset % 
DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc, - sRequest->size ); - writeData( sRequest->writeBuffer, DNBD3_BLOCK_SIZE, (ssize_t)sRequest->size, &sRequest->cowRequest->errorCode, - &sRequest->cowRequest->bytesWorkedOn, sRequest->block, - ( sRequest->inClusterOffset - ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ) ); - - - if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); + assert( ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) + sRequest->size <= DNBD3_BLOCK_SIZE ); + // Here, we again check if the block is written locally - there might have been a second write + // that wrote the full block, hence didn't have to wait for remote data and finished faster. + // In that case, don't pad from remote as we'd overwrite newer data. + if ( isBlockLocal( sRequest->block, sRequest->inClusterOffset ) ) { + logadd( LOG_INFO, "It happened!" ); + } else { + // copy write Data + // writeBuffer is the received data, patch data from fuse write into it + memcpy( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ), sRequest->writeSrc, + sRequest->size ); + if ( !writeAll( cow.fdData, sRequest->writeBuffer, DNBD3_BLOCK_SIZE, + sRequest->block->offset + ( sRequest->inClusterOffset & ~DNBD3_BLOCK_MASK ) ) ) { + sRequest->cowRequest->errorCode = errno; + } else { + sRequest->cowRequest->bytesWorkedOn += sRequest->size; + int64_t bit = sRequest->inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( sRequest->block->bitfield, bit, bit, true ); + sRequest->block->timeChanged = time( NULL ); + } } + + finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); free( sRequest ); } /** * @brief If a block does not start or finish on an multiple of DNBD3_BLOCK_SIZE, the blocks need to be * padded. If this block is inside the original image size, the padding data will be read from the server. - * Otherwise it will be padded with 0 since the it must be the block at the end of the image. - * TODO: Properly document the arguments and what value range they can be, i.e. see below for the 4k case - * + * Otherwise it will be padded with 0 since the it must be a block after the end of the image. + * @param req fuse_req_t + * @param cowRequest cow_request_t + * @param startOffset Absolute offset where the real data starts + * @param endOffset Absolute offset where the real data ends + * @param srcBuffer pointer to the data that needs to be padded, ie. data from user space. */ -static void padBlockFromRemote( fuse_req_t req, off_t offset, cow_request_t *cowRequest, const char *buffer, - size_t size, cow_l2_entry_t *block, off_t inClusterOffset ) +static bool padBlockForWrite( fuse_req_t req, cow_request_t *cowRequest, + off_t startOffset, off_t endOffset, const char *srcBuffer ) { - // TODO: Is this *guaranteed* to be the case on the caller site? 
Add comment to ^ - assert( ( offset % DNBD3_BLOCK_SIZE ) + size <= DNBD3_BLOCK_SIZE ); - if ( offset >= (off_t)metadata->originalImageSize ) { - // Writing past the end of the image - inClusterOffset -= inClusterOffset % DNBD3_BLOCK_SIZE; - char buf[DNBD3_BLOCK_SIZE] = { 0 }; - memcpy( buf + ( offset % DNBD3_BLOCK_SIZE ), buffer, size ); - // At this point we should have a 4k block with user-space data to write, and possibly - // zero-padding at start and/or end - - writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, - block, inClusterOffset ); - return; - } - // Need to fetch padding from upstream - cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, 1 ); - sRequest->callback = writePaddedBlock; - sRequest->inClusterOffset = inClusterOffset; - sRequest->block = block; - sRequest->size = size; - sRequest->writeSrc = buffer; - sRequest->cowRequest = cowRequest; - - sRequest->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, metadata->originalImageSize - offset ); - sRequest->dRequest.offset = offset - ( offset % DNBD3_BLOCK_SIZE ); - sRequest->dRequest.fuse_req = req; - - atomic_fetch_add( &cowRequest->workCounter, 1 ); - if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; - if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); + // Make sure we pad exactly one block + endOffset = MIN( (uint64_t)endOffset, ( startOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK ); + assert( startOffset < endOffset ); + size_t size = (size_t)( endOffset - startOffset ); + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + off_t inClusterOffset = startOffset % COW_DATA_CLUSTER_SIZE; + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + if ( isBlockLocal( cluster, startOffset ) ) { + // No padding at all, keep existing data + bool ret = writeAll( cow.fdData, srcBuffer, size, cluster->offset + inClusterOffset ); + if ( ret ) { + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); } - free( sRequest ); - return; + return ret; + } + // Not local, need some form of padding + createL2Table( l1Index ); + if ( cluster == NULL ) { + cluster = getL2Entry( l1Index, l2Index, true ); + } + uint64_t validImageSize = metadata->validRemoteSize; // As we don't lock + if ( startOffset >= (off_t)validImageSize ) { + // After end of remote valid data, pad with zeros entirely + char buf[DNBD3_BLOCK_SIZE] = {0}; + off_t start = startOffset % DNBD3_BLOCK_SIZE; + assert( start + size <= DNBD3_BLOCK_SIZE ); + memcpy( buf + start, srcBuffer, size ); + bool ret = writeAll( cow.fdData, buf, DNBD3_BLOCK_SIZE, + cluster->offset + ( inClusterOffset & ~DNBD3_BLOCK_MASK ) ); + if ( ret ) { + int64_t bit = inClusterOffset / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, bit, bit, true ); + cowRequest->bytesWorkedOn += size; + cluster->timeChanged = time( NULL ); + } + return ret; + } + // Need to fetch padding from upstream, allocate struct plus one block + cow_sub_request_t *sub = calloc( sizeof( *sub ) + DNBD3_BLOCK_SIZE, 1 ); + sub->callback = writePaddedBlock; + sub->inClusterOffset = inClusterOffset; + sub->block = cluster; + sub->size = size; + sub->writeSrc = srcBuffer; + sub->cowRequest = cowRequest; + + sub->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, validImageSize - startOffset ); + sub->dRequest.offset = startOffset & 
~DNBD3_BLOCK_MASK; + sub->dRequest.fuse_req = req; + + if ( !connection_read( &sub->dRequest ) ) { + free( sub ); + errno = ENOTSOCK; + return false; } + atomic_fetch_add( &cowRequest->workCounter, 1 ); + return true; } /** @@ -1187,15 +1270,19 @@ void readRemoteData( cow_sub_request_t *sRequest ) atomic_fetch_add( &sRequest->cowRequest->bytesWorkedOn, sRequest->dRequest.length ); if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { - if ( sRequest->cowRequest->bytesWorkedOn < sRequest->cowRequest->fuseRequestSize ) { - // TODO: Is this a logic bug somewhere, reagarding accounting? + if ( sRequest->cowRequest->bytesWorkedOn != sRequest->cowRequest->fuseRequestSize ) { // Because connection_read() will always return exactly as many bytes as requested, // or simply never finish. - // Otherwise, we should return EIO... - logadd( LOG_ERROR, "pad read to small\n" ); + logadd( LOG_ERROR, "BUG? Pad read has invalid size. worked on: %"PRIu64", request size: %" + PRIu64", offset: %"PRIu64, + (uint64_t)sRequest->cowRequest->bytesWorkedOn, + (uint64_t)sRequest->cowRequest->fuseRequestSize, + (uint64_t)sRequest->cowRequest->fuseRequestOffset ); + fuse_reply_err( sRequest->dRequest.fuse_req, EIO ); + } else { + fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, + sRequest->cowRequest->bytesWorkedOn ); } - fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, - sRequest->cowRequest->bytesWorkedOn ); free( sRequest->cowRequest->readBuffer ); free( sRequest->cowRequest ); } @@ -1213,69 +1300,61 @@ void readRemoteData( cow_sub_request_t *sRequest ) void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi ) { - // decrease if ( size < metadata->imageSize ) { - if ( size < metadata->originalImageSize ) { - metadata->originalImageSize = size; + // truncate file + if ( size < metadata->validRemoteSize ) { + metadata->validRemoteSize = size; } - // TODO.... so.... - // originalImageSize = smallest we have seen - // imageSize = current - // ? - - // increase } else if ( size > metadata->imageSize ) { + // grow file, pad with zeroes off_t offset = metadata->imageSize; int l1Index = offsetToL1Index( offset ); int l2Index = offsetToL2Index( offset ); int l1EndIndex = offsetToL1Index( size ); int l2EndIndex = offsetToL2Index( size ); - // special case first block TODO: What is the special case? What is happening here? 
- if ( cow.l1[l1Index] != -1 ) { - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - if ( metadata->imageSize % DNBD3_BLOCK_SIZE != 0 ) { - off_t inClusterOffset = metadata->imageSize % COW_DATA_CLUSTER_SIZE; + // Special case, first cluster through which the size change passes + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + // if the new size is inside a DNBD3_BLOCK it might still contain old data before a truncate + if ( !IS_4K_ALIGNED( metadata->imageSize ) ) { size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE ); - if ( checkBit( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - char buf[sizeToWrite]; - memset( buf, 0, sizeToWrite ); - - ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inClusterOffset ); + if ( checkBit( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE ) ) { + char buf[DNBD3_BLOCK_SIZE] = {0}; + ssize_t bytesWritten = pwrite( cow.fdData, buf, sizeToWrite, cluster->offset + inClusterOffset ); if ( bytesWritten < (ssize_t)sizeToWrite ) { fuse_reply_err( req, bytesWritten == -1 ? errno : EIO ); return; } - block->timeChanged = time( NULL ); + cluster->timeChanged = time( NULL ); offset += sizeToWrite; } } - // rest of block set bits 0 - l1Index = offsetToL1Index( offset ); - l2Index = offsetToL2Index( offset ); - block = getL2Entry( l1Index, l2Index ); - off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; - setBitsInBitfield( - block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + // all remaining bits in cluster will get set to 0 + inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; + setBitsInBitfield( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE, + ( COW_BITFIELD_SIZE * 8 ) - 1, false ); + cluster->timeChanged = time( NULL ); l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; l1Index++; } } - // null all bitfields - while ( !( l1Index > l1EndIndex || ( l1Index == l1EndIndex && l2EndIndex < l2Index ) ) ) { + // normal case, if clusters exist, null bitfields + while ( l1Index < l1EndIndex || ( l1Index == l1EndIndex && l2Index <= l2EndIndex ) ) { if ( cow.l1[l1Index] == -1 ) { l1Index++; l2Index = 0; continue; } - - cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); - setBitsInBitfield( block->bitfield, 0, ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); - block->timeChanged = time( NULL ); + cluster = getL2Entry( l1Index, l2Index, false ); + if ( cluster != NULL ) { + memset( cluster->bitfield, 0, COW_BITFIELD_SIZE ); + cluster->timeChanged = time( NULL ); + } l2Index++; if ( l2Index >= COW_L2_TABLE_SIZE ) { l2Index = 0; @@ -1308,98 +1387,82 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz off_t currentOffset = offset; off_t endOffset = offset + size; + if ( !IS_4K_ALIGNED( currentOffset ) ) { + // Handle case where start is not 4k aligned + if ( !padBlockForWrite( req, cowRequest, currentOffset, endOffset, cowRequest->writeBuffer ) ) { + goto fail; + } + // Move forward to next block border + currentOffset = ( currentOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK; + } + if ( currentOffset < endOffset && !IS_4K_ALIGNED( endOffset ) ) { + // Handle case where end is not 4k aligned + off_t lastBlockStart = endOffset & ~DNBD3_BLOCK_MASK; + if ( !padBlockForWrite( req, cowRequest, lastBlockStart, endOffset, + cowRequest->writeBuffer + ( lastBlockStart - 
offset ) ) ) { + goto fail; + } + endOffset = lastBlockStart; + } + + // From here on start and end are block-aligned int l1Index = offsetToL1Index( currentOffset ); int l2Index = offsetToL2Index( currentOffset ); while ( currentOffset < endOffset ) { if ( cow.l1[l1Index] == -1 ) { - createL2Block( l1Index ); + createL2Table( l1Index ); } //loop over L2 array (metadata) while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) { - cow_l2_entry_t *metaBlock = getL2Entry( l1Index, l2Index ); - - // Calc absolute offset in image corresponding to current cluster - size_t clusterAbsoluteStartOffset = l1Index * COW_FULL_L2_TABLE_DATA_SIZE + l2Index * COW_DATA_CLUSTER_SIZE; - - size_t inClusterOffset = currentOffset - clusterAbsoluteStartOffset; - // How many bytes we can write to this cluster before crossing a boundary, or before the write request is completed + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true ); + size_t inClusterOffset = currentOffset % COW_DATA_CLUSTER_SIZE; + // How many bytes we can write to this cluster before crossing a boundary, + // or before the write request is complete size_t bytesToWriteToCluster = MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset ); - ///////////////////////// - // lock for the half block probably needed - if ( currentOffset % DNBD3_BLOCK_SIZE != 0 - && !checkBit( metaBlock->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block has not been written locally before, and write does not start on block boundary. - // Need to fetch the first couple bytes of the block from remote before writing the block to disk. - size_t writeSize = MIN( bytesToWriteToCluster, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) ); - const char *sbuf = cowRequest->writeBuffer + ( ( currentOffset - offset ) ); - padBlockFromRemote( req, currentOffset, cowRequest, sbuf, writeSize, metaBlock, (off_t)inClusterOffset ); - currentOffset += writeSize; - continue; + if ( !writeAll( cow.fdData, cowRequest->writeBuffer + ( currentOffset - offset ), + bytesToWriteToCluster, cluster->offset + inClusterOffset ) ) { + goto fail; } - - size_t endPaddedSize = 0; // In case we need to skip over a pending pad request to remote - if ( ( currentOffset + bytesToWriteToCluster ) % DNBD3_BLOCK_SIZE != 0 - && metadata->originalImageSize > currentOffset + bytesToWriteToCluster ) { - // Write request does not end on block boundary, and ends before end of image - // End offset of this write - off_t clusterEndOffset = currentOffset + bytesToWriteToCluster; - // Start of last block of write, i.e. 
start of the last, incomplete block - off_t lastBlockStartOffset = clusterEndOffset - ( clusterEndOffset % DNBD3_BLOCK_SIZE ); - // Where that last block starts relative to its cluster - off_t inClusterBlockOffset = lastBlockStartOffset - clusterAbsoluteStartOffset; - if ( !checkBit( metaBlock->bitfield, (int)( inClusterBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { - // Block indeed not modified before, need to fetch - const char *sbuf = cowRequest->writeBuffer + ( ( lastBlockStartOffset - offset ) ); - padBlockFromRemote( req, lastBlockStartOffset, cowRequest, sbuf, clusterEndOffset - lastBlockStartOffset, metaBlock, - inClusterBlockOffset ); - - - bytesToWriteToCluster -= clusterEndOffset - lastBlockStartOffset; - endPaddedSize = clusterEndOffset - lastBlockStartOffset; - } - } - writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)bytesToWriteToCluster, - bytesToWriteToCluster, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inClusterOffset ); - + int64_t f = inClusterOffset / DNBD3_BLOCK_SIZE; + int64_t t = ( inClusterOffset + bytesToWriteToCluster - 1 ) / DNBD3_BLOCK_SIZE; + setBitsInBitfield( cluster->bitfield, f, t, true ); + cowRequest->bytesWorkedOn += bytesToWriteToCluster; currentOffset += bytesToWriteToCluster; - // Account for skipped-over bytes - currentOffset += endPaddedSize; - - + cluster->timeChanged = time( NULL ); l2Index++; } l1Index++; l2Index = 0; } - if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - finishWriteRequest( req, cowRequest ); + goto success; + +fail: + if ( cowRequest->errorCode == 0 ) { + cowRequest->errorCode = errno != 0 ? errno : EIO; } +success: + finishWriteRequest( req, cowRequest ); } /** * @brief Request data, that is not available locally, via the network. * - * @param req fuse_req_t + * @param req fuse_req_t * @param offset from the start of the file * @param size of data to request * @param buffer into which the data is to be written - * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. TODO There is no such param, but cowRequest.. + * @param cowRequest cow_request_t */ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest ) { - // edgecase: Image size got reduced before on a non block border - if ( offset + size > (long int) metadata->originalImageSize ) { // TODO How does this check if it's a non block border? - size_t padZeroSize = ( offset + size ) - metadata->originalImageSize; - off_t padZeroOffset = metadata->originalImageSize - offset; - assert( offset > 0 ); // TODO Should this be padZeroOffset? - // ... But isn't it possible that offset > originalImageSize, in which case it would be negative? - memset( ( buffer + padZeroOffset ), 0, padZeroSize ); - - atomic_fetch_add( &cowRequest->bytesWorkedOn, padZeroSize ); - } + assert( offset < (off_t)metadata->validRemoteSize ); + assert( offset + size <= (off_t)metadata->validRemoteSize ); + if ( size == 0 ) + return; + assert( size > 0 ); cow_sub_request_t *sRequest = malloc( sizeof( cow_sub_request_t ) ); sRequest->callback = readRemoteData; sRequest->dRequest.length = (uint32_t)size; @@ -1410,35 +1473,33 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer atomic_fetch_add( &cowRequest->workCounter, 1 ); if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; // TODO We set an error... 
+ cowRequest->errorCode = EIO; free( sRequest ); if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - // .... but would still report success if this happens to be the last pending sub-request!? - fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); + fuse_reply_err( req, EIO ); + free( cowRequest->readBuffer ); + free( cowRequest ); } - free( cowRequest->readBuffer ); - free( cowRequest ); - return; } } /** * @brief Get the Block Data Source object * - * @param block - * @param bitfieldOffset - * @param offset - * @return enum dataSource + * @param block + * @param bitfieldOffset + * @param offset + * @return enum dataSource */ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset ) { - if ( block != NULL && checkBit( block->bitfield, (int)bitfieldOffset ) ) { - return local; + if ( block != NULL && checkBit( block->bitfield, bitfieldOffset ) ) { + return ds_local; } - if ( offset >= (off_t)metadata->originalImageSize ) { - return zero; + if ( offset >= (off_t)metadata->validRemoteSize ) { + return ds_zero; } - return remote; + return ds_remote; } /** @@ -1450,124 +1511,109 @@ enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, * @param offset offset where the read starts. * @return uint64_t Number of bytes read. */ -void cowfile_read( fuse_req_t req, size_t size, off_t offset ) +void cowfile_read( fuse_req_t req, size_t size, off_t startOffset ) { cow_request_t *cowRequest = malloc( sizeof( cow_request_t ) ); cowRequest->fuseRequestSize = size; cowRequest->bytesWorkedOn = ATOMIC_VAR_INIT( 0 ); cowRequest->workCounter = ATOMIC_VAR_INIT( 1 ); cowRequest->errorCode = ATOMIC_VAR_INIT( 0 ); - cowRequest->readBuffer = malloc( size ); - cowRequest->fuseRequestOffset = offset; - off_t lastReadOffset = offset; - off_t endOffset = offset + size; - off_t searchOffset = offset; - int l1Index = offsetToL1Index( offset ); - int l2Index = offsetToL2Index( offset ); - int bitfieldOffset = getBitfieldOffsetBit( offset ); - enum dataSource dataState; - cow_l2_entry_t *cluster = NULL; - - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } + cowRequest->readBuffer = calloc( size, 1 ); + cowRequest->fuseRequestOffset = startOffset; + off_t lastReadOffset = -1; + off_t endOffset = startOffset + size; + off_t searchOffset = startOffset; + int l1Index = offsetToL1Index( startOffset ); + int l2Index = offsetToL2Index( startOffset ); + int bitfieldOffset = getBitfieldOffsetBit( startOffset ); + cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false ); + enum dataSource dataState = ds_invalid; + bool flushCurrentSpan = false; // Set if we need to read the current span and start the next one + bool newSourceType = true; // Set if we're starting a new span, and the source type needs to be determined - bool doRead = false; - bool firstLoop = true; - bool updateBlock = false; while ( searchOffset < endOffset ) { - if ( firstLoop ) { - firstLoop = false; + if ( newSourceType ) { + newSourceType = false; lastReadOffset = searchOffset; - // TODO: Why is this only set on first iteration and not for every block/cluster? dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset ); } else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) { - // TODO So data source changed, but we don't update the dataState var... How can this possibly work? 
- doRead = true; + // Source type changed, obviously need to flush current span + flushCurrentSpan = true; } else { bitfieldOffset++; - } - - if ( bitfieldOffset >= COW_BITFIELD_SIZE * 8 ) { - // Advance to next cluster in current l2 table - bitfieldOffset = 0; - l2Index++; - if ( l2Index >= COW_L2_TABLE_SIZE ) { - // Advance to next l1 entry, reset l2 index - l2Index = 0; - l1Index++; - } - // Also set flag that we need to update the 'cluster' struct at the end of this iteration - // TODO: Why do we update all the values above, but not the cluster struct? We access those - // variables in the code below, so we have updated offset and index, but operate on the - // old cluster struct. How does that make sense? - updateBlock = true; - if ( dataState == local ) { - doRead = true; + // If reading from local cow file, crossing a cluster border means we need to flush + // since the next cluster might be somewhere else in the data file + if ( dataState == ds_local && bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + flushCurrentSpan = true; } } - // compute the original file offset from bitfieldOffset, l2Index and l1Index - // TODO ??? As stated above, this is using the updated values, so isn't this the next - // offset tather than original offset? - searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Index * COW_DATA_CLUSTER_SIZE + + // compute the absolute image offset from bitfieldOffset, l2Index and l1Index + // bitfieldOffset might be out of bounds here, but that doesn't matter for the calculation + searchOffset = DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE; - if ( doRead || searchOffset >= endOffset ) { - ssize_t sizeToRead = MIN( searchOffset, endOffset ); - if ( dataState == remote ) { - if ( sizeToRead > (ssize_t) metadata->originalImageSize ) { - //pad rest with 0 - memset( cowRequest->readBuffer - + ( ( lastReadOffset - offset ) + ( metadata->originalImageSize - offset ) ), - 0, sizeToRead - metadata->originalImageSize ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead - metadata->originalImageSize ); - sizeToRead = metadata->originalImageSize; + if ( flushCurrentSpan || searchOffset >= endOffset ) { + ssize_t spanEndOffset = MIN( searchOffset, endOffset ); + if ( dataState == ds_remote ) { + if ( spanEndOffset > (ssize_t)metadata->validRemoteSize ) { + // Account for bytes we leave zero, because they're beyond the (truncated) original image size + atomic_fetch_add( &cowRequest->bytesWorkedOn, spanEndOffset - metadata->validRemoteSize ); + spanEndOffset = metadata->validRemoteSize; } - sizeToRead -= lastReadOffset; - readRemote( - req, lastReadOffset, sizeToRead, cowRequest->readBuffer + ( lastReadOffset - offset ), cowRequest ); - } else if ( dataState == zero ) { - sizeToRead -= lastReadOffset; - memset( cowRequest->readBuffer + ( lastReadOffset - offset ), 0, sizeToRead ); - atomic_fetch_add( &cowRequest->bytesWorkedOn, sizeToRead ); - } else { - sizeToRead -= lastReadOffset; - // Compute the offset in the data file where the read starts - off_t localRead = - cluster->offset + ( ( lastReadOffset % COW_FULL_L2_TABLE_DATA_SIZE ) % COW_DATA_CLUSTER_SIZE ); + readRemote( req, lastReadOffset, spanEndOffset - lastReadOffset, + cowRequest->readBuffer + ( lastReadOffset - startOffset ), cowRequest ); + } else if ( dataState == ds_zero ) { + // Past end of image, account for leaving them zero + ssize_t numBytes = spanEndOffset - lastReadOffset; + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else if 
( dataState == ds_local ) { + ssize_t numBytes = spanEndOffset - lastReadOffset; + // Compute the startOffset in the data file where the read starts + off_t localRead = cluster->offset + ( lastReadOffset % COW_DATA_CLUSTER_SIZE ); ssize_t totalBytesRead = 0; - while ( totalBytesRead < sizeToRead ) { - ssize_t bytesRead = - pread( cow.fhd, cowRequest->readBuffer + ( lastReadOffset - offset ), sizeToRead, localRead ); + while ( totalBytesRead < numBytes ) { + ssize_t bytesRead = pread( cow.fdData, cowRequest->readBuffer + ( lastReadOffset - startOffset ), + numBytes - totalBytesRead, localRead + totalBytesRead ); if ( bytesRead == -1 ) { cowRequest->errorCode = errno; goto fail; - } else if ( bytesRead <= 0 ) { + } else if ( bytesRead == 0 ) { + logadd( LOG_ERROR, "EOF for read at localRead=%"PRIu64", totalBR=%"PRIu64, + (uint64_t)localRead, (uint64_t)totalBytesRead ); + logadd( LOG_ERROR, "searchOffset=%"PRIu64", endOffset=%"PRIu64", imageSize=%"PRIu64, + searchOffset, endOffset, metadata->imageSize ); cowRequest->errorCode = EIO; goto fail; } totalBytesRead += bytesRead; } - atomic_fetch_add( &cowRequest->bytesWorkedOn, totalBytesRead ); + atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes ); + } else { + assert( 4 == 6 ); } lastReadOffset = searchOffset; - doRead = false; - firstLoop = true; + flushCurrentSpan = false; + // Since the source type changed, reset + newSourceType = true; } - - if ( updateBlock ) { - if ( cow.l1[l1Index] != -1 ) { - cluster = getL2Entry( l1Index, l2Index ); - } else { - cluster = NULL; + if ( bitfieldOffset == COW_BITFIELD_SIZE * 8 ) { + // Advance to next cluster in current l2 table + bitfieldOffset = 0; + l2Index++; + if ( l2Index >= COW_L2_TABLE_SIZE ) { + // Advance to next l1 entry, reset l2 index + l2Index = 0; + l1Index++; } - updateBlock = false; + cluster = getL2Entry( l1Index, l2Index, false ); } } fail:; if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn < size ) { - logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d)", cowRequest->errorCode ); + if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn != size ) { + logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d, workedOn: %"PRIu64", size: %"PRIu64")", + cowRequest->errorCode, (uint64_t)cowRequest->bytesWorkedOn, (uint64_t)size ); fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO ); } else { fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); |
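Both the write and the read path rely on translating an absolute image offset into an L1 index, an L2 index and a bit in the cluster bitfield (offsetToL1Index(), offsetToL2Index() and getBitfieldOffsetBit() are used throughout but not shown in this diff). The read loop reconstructs the absolute offset as DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE + l1Index * COW_FULL_L2_TABLE_DATA_SIZE, so the forward mapping is plain integer division. A self-contained sketch follows, with illustrative constants standing in for the real header definitions:

/* Sketch: absolute image offset -> (l1 index, l2 index, bitfield bit).
 * All constant values are illustrative; the real ones are defined in the cow headers. */
#include <stdint.h>
#include <stdio.h>

#define DNBD3_BLOCK_SIZE            4096ull
#define COW_L2_TABLE_SIZE           1024ull                  /* clusters per L2 table          */
#define COW_BITFIELD_SIZE           1024ull                  /* bytes, one bit per block       */
#define COW_DATA_CLUSTER_SIZE       ( DNBD3_BLOCK_SIZE * COW_BITFIELD_SIZE * 8 )
#define COW_FULL_L2_TABLE_DATA_SIZE ( COW_DATA_CLUSTER_SIZE * COW_L2_TABLE_SIZE )

static int offsetToL1Index( uint64_t offset )
{
	return (int)( offset / COW_FULL_L2_TABLE_DATA_SIZE );
}

static int offsetToL2Index( uint64_t offset )
{
	return (int)( ( offset % COW_FULL_L2_TABLE_DATA_SIZE ) / COW_DATA_CLUSTER_SIZE );
}

static int getBitfieldOffsetBit( uint64_t offset )
{
	return (int)( ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE );
}

int main( void )
{
	uint64_t offset = 5ull * 1024 * 1024 * 1024 + 123456; /* some absolute image offset */
	int l1 = offsetToL1Index( offset );
	int l2 = offsetToL2Index( offset );
	int bit = getBitfieldOffsetBit( offset );
	/* Invariant used by the span loop in cowfile_read():
	 * offset == l1 * FULL_L2 + l2 * CLUSTER + bit * BLOCK + (offset % BLOCK) */
	uint64_t back = (uint64_t)l1 * COW_FULL_L2_TABLE_DATA_SIZE
			+ (uint64_t)l2 * COW_DATA_CLUSTER_SIZE
			+ (uint64_t)bit * DNBD3_BLOCK_SIZE
			+ offset % DNBD3_BLOCK_SIZE;
	printf( "l1=%d l2=%d bit=%d roundtrip=%s\n", l1, l2, bit, back == offset ? "ok" : "BUG" );
	return 0;
}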
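finishWriteRequest() now runs its reply logic only once the last sub-request finishes (workCounter reaching zero) and grows metadata->imageSize with a retrying compare-and-exchange loop, so racing writers can never shrink the size. A minimal sketch of that grow-only update pattern, using generic names rather than the project's:

/* Sketch of the lock-free, grow-only size update: the stored size may only
 * ever increase, even when several writers race. Names are generic. */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t imageSize = 1000;

static void growImageSize( uint64_t newSize )
{
	uint64_t oldSize = atomic_load( &imageSize );
	while ( oldSize < newSize ) {
		/* On failure, oldSize is reloaded with the current value and the
		 * condition is re-checked; on success we are done. */
		if ( atomic_compare_exchange_weak( &imageSize, &oldSize, newSize ) )
			break;
	}
}

int main( void )
{
	growImageSize( 5000 ); /* grows */
	growImageSize( 2000 ); /* ignored, smaller than the current size */
	printf( "imageSize=%" PRIu64 "\n", atomic_load( &imageSize ) );
	return 0;
}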