diff options
author | Simon Rettberg | 2023-03-29 12:03:54 +0200 |
---|---|---|
committer | Simon Rettberg | 2023-03-29 12:03:54 +0200 |
commit | 9836f97fa0cbcac9394bd7d1a91671d6521812a9 (patch) | |
tree | 890354eacb15823814907fe72b5e4caf4d479a87 | |
parent | Fixed cow daemon issue (diff) | |
download | dnbd3-9836f97fa0cbcac9394bd7d1a91671d6521812a9.tar.gz dnbd3-9836f97fa0cbcac9394bd7d1a91671d6521812a9.tar.xz dnbd3-9836f97fa0cbcac9394bd7d1a91671d6521812a9.zip |
[FUSE] cow: Cleanup, comments, fixes, minor refactoring
- Use the term "cluster" for a group of dnbd3-blocks instead of also
calling them blocks.
- Use term "table" instead of "array" for the L1 and L2 tables.
- Use term "index" instead of "offset" when addressing those tables
- Fix a few logic bugs, use-after-free
- Add TODOs for parts that need better comments
-rw-r--r-- | inc/dnbd3/config.h | 16 | ||||
-rw-r--r-- | inc/dnbd3/config/cow.h | 20 | ||||
-rw-r--r-- | inc/dnbd3/types.h | 1 | ||||
-rw-r--r-- | src/cowtest/main.c | 3 | ||||
-rw-r--r-- | src/fuse/connection.c | 20 | ||||
-rw-r--r-- | src/fuse/cowDoc/readme.md | 155 | ||||
-rw-r--r-- | src/fuse/cowfile.c | 599 | ||||
-rw-r--r-- | src/fuse/cowfile.h | 93 |
8 files changed, 495 insertions, 412 deletions
diff --git a/inc/dnbd3/config.h b/inc/dnbd3/config.h index df9d0e0..eb4b8b1 100644 --- a/inc/dnbd3/config.h +++ b/inc/dnbd3/config.h @@ -40,20 +40,4 @@ // +++++ Block Device +++++ #define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END! -// +++++ COW +++++ -#define COW_BITFIELD_SIZE 40 // NEVER CHANGE THIS OR THE WORLD WILL ALSO END! -#define COW_FILE_META_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825E) // Magic Value to recognize a Cow meta file -#define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file -#define COW_MIN_UPLOAD_DELAY 60 // in seconds -#define COW_STATS_UPDATE_TIME 5 // time in seconds the cow status files gets updated (while uploading blocks) -#define COW_MAX_PARALLEL_UPLOADS 10 // maximum number of parallel uploads -#define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted -#define COW_URL_STRING_SIZE 500 // Max string size for an url -#define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file -#define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb) -// +++++ COW API Endpoints +++++ -#define COW_API_CREATE "%s/api/File/Create" -#define COW_API_UPDATE "%s/api/File/Update?guid=%s&BlockNumber=%lu" -#define COW_API_START_MERGE "%s/api/File/StartMerge" - #endif /* CONFIG_H_ */ diff --git a/inc/dnbd3/config/cow.h b/inc/dnbd3/config/cow.h new file mode 100644 index 0000000..9ed59a0 --- /dev/null +++ b/inc/dnbd3/config/cow.h @@ -0,0 +1,20 @@ +#ifndef _COW_CONFIG_H_ +#define _COW_CONFIG_H_ + +// +++++ COW +++++ +#define COW_BITFIELD_SIZE 40 // NEVER CHANGE THIS OR THE WORLD WILL ALSO END! +#define COW_FILE_META_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825E) // Magic Value to recognize a Cow meta file +#define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file +#define COW_MIN_UPLOAD_DELAY 60 // in seconds +#define COW_STATS_UPDATE_TIME 5 // time in seconds the cow status files gets updated (while uploading blocks) +#define COW_MAX_PARALLEL_UPLOADS 10 // maximum number of parallel uploads +#define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted +#define COW_URL_STRING_SIZE 500 // Max string size for an url +#define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file +#define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb) +// +++++ COW API Endpoints +++++ +#define COW_API_CREATE "%s/api/file/create" +#define COW_API_UPDATE "%s/api/file/update?guid=%s&clusterindex=%lu" +#define COW_API_START_MERGE "%s/api/file/merge" + +#endif diff --git a/inc/dnbd3/types.h b/inc/dnbd3/types.h index 699fa68..bd15f4e 100644 --- a/inc/dnbd3/types.h +++ b/inc/dnbd3/types.h @@ -30,6 +30,7 @@ #include <stdbool.h> #include <stddef.h> #include <string.h> +#define container_of( ptr, type, member ) ( (type *)( (char *)( ptr ) - (char *)&( ( (type *)NULL )->member ) ) ) #endif #ifndef MIN diff --git a/src/cowtest/main.c b/src/cowtest/main.c index c2e3161..38d0f16 100644 --- a/src/cowtest/main.c +++ b/src/cowtest/main.c @@ -1,3 +1,5 @@ +#include <dnbd3/config/cow.h> +#include <dnbd3/types.h> #include <fcntl.h> #include <errno.h> #include <stdio.h> @@ -9,7 +11,6 @@ #include <string.h> #include <sys/stat.h> #include <stdint.h> -#include <dnbd3/types.h> #include <stdatomic.h> #include <time.h> #include <pthread.h> diff --git a/src/fuse/connection.c b/src/fuse/connection.c index 986393a..38102bc 100644 --- a/src/fuse/connection.c +++ b/src/fuse/connection.c @@ -399,7 +399,7 @@ static void* connection_receiveThreadMain( void *sockPtr ) } } else { // Found a match - const ssize_t ret = receiveRequest( sockFd, request); + const ssize_t ret = receiveRequest( sockFd, request ); if ( ret != (ssize_t)request->length ) { logadd( LOG_DEBUG1, "receiving payload for a block reply failed" ); connection_read( request ); @@ -420,6 +420,7 @@ static void* connection_receiveThreadMain( void *sockPtr ) } unlock_rw( &altLock ); } + // TODO: See comment in receiveRequest() if( useCow ) { cowfile_handleCallback( request ); } @@ -477,8 +478,8 @@ static void* connection_backgroundThread( void *something UNUSED ) while ( keepRunning ) { ticks now; timing_get( &now ); - uint32_t wt1 = timing_diffMs( &now, &nextKeepalive ); - uint32_t wt2 = timing_diffMs( &now, &nextRttCheck ); + uint32_t wt1 = (uint32_t)timing_diffMs( &now, &nextKeepalive ); + uint32_t wt2 = (uint32_t)timing_diffMs( &now, &nextRttCheck ); if ( wt1 > 0 && wt2 > 0 ) { int waitRes = signal_wait( connection.panicSignal, (int)MIN( wt1, wt2 ) + 1 ); if ( !keepRunning ) @@ -841,15 +842,20 @@ fail: } } -static size_t receiveRequest(const int sock, dnbd3_async_t* request ) { - if(useCow){ +static size_t receiveRequest(const int sock, dnbd3_async_t* request ) +{ + if( useCow ) { cow_sub_request_t * cow_request = container_of( request, cow_sub_request_t, dRequest ); - if( cow_request->callback == readRemoteData){ + // TODO This is ugly, we have a callback so we don't need to special-case receiving the + // reply, yet here we check what the callback function is for some reason :-( + // Ideally there should be no cow-related code in this file at all. + // This requires moving the callback to dnbd3_async_t from cow_sub_request_t though... + if( cow_request->callback == readRemoteData ) { return sock_recv( sock, cow_request->buffer, request->length ); } else{ return sock_recv( sock, &cow_request->writeBuffer, request->length ); } - } else { + } else { return sock_recv( sock, container_of( request, dnbd3_async_parent_t, request )->buffer, request->length ); } } diff --git a/src/fuse/cowDoc/readme.md b/src/fuse/cowDoc/readme.md index e7abadf..acb62c1 100644 --- a/src/fuse/cowDoc/readme.md +++ b/src/fuse/cowDoc/readme.md @@ -38,82 +38,88 @@ Example parameters for creating a new cow session: ## Data structure -The data structure is divided into two main parts. The actual data of the writing on the image and the corresponding metadata. It is also important to distinguish between a dnbd3 block, which is 4096 bytes in size, and a cow block, which combines 320 dnbd3 blocks. A cow block has a `cow_block_metadata_t` structure that contains the corresponding metadata. The metadata is used to determine if a dnbd3 block has been written to, where that block is stored in the `data` file, when it was last modified and when it was uploaded. But more on this later. +The data structure is divided into two main parts. The actual data of the writing on the image and the corresponding metadata. It is also important to distinguish between a dnbd3 block, which is 4096 bytes in size, and a cow cluster, which combines 320 dnbd3 blocks. A cow cluster has a `cow_l2_entry_t` structure in its according l2 table that contains the corresponding metadata. The metadata is used to determine if a dnbd3 block has been written to, where that block is stored in the `data` file, when it was last modified and when it was uploaded. But more on this later. -### Blockmetadata +### Metadata ![Datastructure](img/datastructure.jpg) -TThe data structure for storing "cow_block_metadata_t" contains a layer 1 (L1) and a layer 2 (L2). L1 contains pointers to the L2's. -The entire L1 array is initialised at the beginning and cannot be resized, therefore the size of the L1 array limits the total size of the image. -The L2's are created dynamically as they are needed. So at the beginning all L1 pointers are zero. The L2's are arrays containing 1024 -`cow_block_metadata_t` structs. +The metadata file, which ultimately stores the `cow_l2_entry_t` structs, contains a layer 1 (L1) and a layer 2 (L2) table for looking up the struct, which ultimately points to the actual data in the data file. +The entire L1 table is initialised at the beginning and cannot be resized, therefore the size of the L1 table limits the maximum size of the image. +The L2 tables are created dynamically as needed. So at the beginning, all L1 pointers are invalid (-1). + +The L2 tables contain 1024 `cow_l2_entry_t` structs each. An L2 table is created as soon as any data is written to any offset in the image that corresponds to the range it covers, which is a span of 1024 * 320 * 4096 bytes. ```C -typedef struct cow_block_metadata +typedef struct cow_l2_entry { atomic_int_least64_t offset; atomic_uint_least64_t timeChanged; atomic_uint_least64_t uploads; atomic_char bitfield[40]; -} cow_block_metadata_t; +} cow_l2_entry_t; ``` -Each `cow_block_metadata_t` contains a 40 byte, 320 bit bit field. The bit field indicates whether the corresponding dnbd3 block contains data or not. If, for example, the bit field begins with 01... the first 4096 contain no data and the next 4096 contain data. -So each `cow_block_metadata_t` stores the metadata of up to 320*4096 bytes if all bits are set to 1. The offset field is the offset where in the data file the corresponding data is stored. The property timeChanged contains the Unix time when the block was last changed. It is 0 if it has never been changed or if the last changes have already been uploaded. - - -The L2 arrays and `cow_block_metadata_t` are sorted according to the original offsets of the image. Thus the first L1 pointer, i.e. the first L2 array, addresses the first 1024 * 320 * 4096 bytes (L2Size * bitfieldsize * DNBD3Blocksize) of data and so on. +Each `cow_l2_entry_t` contains a 40 byte, 320 bit bit-field. The bit-field indicates whether the corresponding dnbd3 block has been written locally. If, for example, the bit field begins with 01... the first 4096 bytes contain no data and the next 4096 do. +So each `cow_l2_entry_t` stores the metadata for up to 320\*4096 bytes. The offset field is the offset into the data file where the corresponding data is stored. `timeChanged` contains the unix timestamp when the cluster was last written. It is 0 if it has never been changed or if the latest changes have already been uploaded. -For example, to get the "cow_block_metadata_t" for offset 4033085440, one would take L1[3], since +For example, to get the `cow_l2_entry_t` for offset 4033085440, one would take L1[3], since ``` -4033085440 / ( COW_L2_STORAGE_CAPACITY ) ≈ 3.005 +4033085440 / ( COW_FULL_L2_TABLE_DATA_SIZE ) ≈ 3.005 ``` - Then one would take the fifth `cow_block_metadata_t` in the L2 array because of +Then one would take the fifth `cow_l2_entry_t` in the L2 array because of ``` -(4033085440 mod COW_L2_STORAGE_CAPACITY) / COW_METADATA_STORAGE_CAPACITY = 5 +(4033085440 mod COW_FULL_L2_TABLE_DATA_SIZE) / COW_DATA_CLUSTER_SIZE = 5 ``` Where: ``` -COW_L2_STORAGE_CAPACITY = 1024 * 320 * 4096 -COW_METADATA_STORAGE_CAPACITY = 320 * 4096 +COW_FULL_L2_TABLE_DATA_SIZE = 1024 * 320 * 4096 +COW_DATA_CLUSTER_SIZE = 320 * 4096 ``` - - -### Read Request +Since the result is an integer, the offset refers to the first dnbd3 block in that cluster. Otherwise, the block number within the cluster be calculated via +``` +(4033085440 % (320 * 4096)) / 4096 +``` +which is the index in the bit-field that tells whether the block has been written to the data file. +### Read Request -When a read request is made, it is checked for each 4096-byte block whether the block already exists locally on the computer (i.e. has already been written once). If so, it is read from the hard disk, otherwise it is requested from the dnbd3 server. To increase performance, several subsequent blocks that are also local/non-local are combined into a larger read from the hard disk or request from the server. +When a read request is made, for each dnbd3 block block it is checked whether it already exists in the data file (i.e. has already been written to once). If so, it is read from the data file, otherwise it needs to be requested from the dnbd3 server. To increase performance, several subsequent blocks that are also local/non-local are combined into a larger reads from disk or requests from the server. ![readrequest](img/readrequest.svg) -The diagram above is somewhat simplified for clarity. The server's read operations are asynchronous. This means that it does not wait for a response from the server, but continues with the next blocks. As soon as the server's response is complete, the data is written to the fuse buffer. -Each request to the dnbd3 server increases the variable `workCounter` by one, and each time a request is completed, it is decreased by one. As soon as `workCounter` is 0 again, fuse_request is returned. This is done to ensure that all asynchronous requests are completed before the request is returned. - +The diagram above is somewhat simplified for clarity. The server's read operations are asynchronous. This means that while iterating over the 4k blocks from the read request, it does not wait for a response from the server for blocks that are missing locally, but fires off a request to the dnbd3 server asynchronously, continuing to check the remaining blocks. As soon as all pending requests to the server are completed, the combined data is handed over to fuse, completing the request. +To keep track of pending requests, each request to the dnbd3 server increments the field `workCounter` in the according `cow_request_t` by one, and each time a request is completed, it is decreased by one. As soon as `workCounter` reaches `0`, all data is known to be fetched properly and assembled in a buffer that can be handed over to fuse. +### Write Request -For local blocks, the loop must be interrupted as soon as the end of a `cow_block_metadata_t` is reached, as the next data offset of the next `cow_block_metadata_t` is most likely not directly after it in the data file. +If, in a write request, the beginning or end does not match a multiple of 4096, the beginning and/or end block must be padded if the accoding dnbd3 block hasn't been written to before. +This is because the granularity of the cow bit-field represents a full dnbd3 block of 4096 bytes, so we cannot write partial data to those blocks, as there is no mechanism to annotate which parts of the block have been written to, and which are still missing. +To work around this limitation, we need to fill the partial block's missing data with data from the dnbd3 server if it is still within the range of the original image size. If it is outside the original image size (because the image has grown), the missing bytes can simply be set to 0 and no request needs to be made. +The write request calculates the corresponding `cow_l2_entry_t` from the offset. If the corresponding `cow_l2_entry_t` does not yet exist, it is created. The data will be written to the data file, and the offset stored in `cow_l2_entry_t.offset`. +Then the corresponding bit in the bit-field is set and `timeChanged` is updated. If there is more data to write to the current cluster, the next `cow_l2_entry_t` is calculated and the above steps are repeated. +The variable `workCounter` is also used here to ensure that the padding of the data occurs before the fuse request returns. -### Write Request -If, in a write request, the beginning or end does not match a multiple of 4096, the beginning and/or end block must be padded. -This is because each 4096-byte block requires complete data, because if the bit in the bit field for that block is set, all data is read locally. -To fill the block, if it is still within the range of the original image size, the missing bytes are requested from the dnbd3 server. If it is outside the original image size (because the image has become larger), the missing bytes are filled with 0. -The write request calculates the corresponding `cow_block_metadata_t` from the offset. If the corresponding `cow_block_metadata_t` does not yet exist, it is created. The data will be written to the data file, at the offset stored in the `cow_block_metadata_t`. -Then the corresponding bit in the bit fields is set and the `timeChanged` is updated. If there is more data to write, the next `cow_block_metadata_t` is calculated and the above steps are repeated. -The variable `workCounter` is also used here to ensure that the padding of the data occurs before the Fuse request returns. +### Background Cluster Upload -### Block Upload -For uploading blocks, there is a background thread that periodically loops over all Cow blocks and checks whether `timeChanged` is not 0 and the time difference between now and `timeChanged` is greater than `COW_MIN_UPLOAD_DELAY`. If so, the block is uploaded. The `timeChanged` before the upload is buffered. After the upload, `timeChanged` is set to 0 if it still has the same time as the temporarily stored one (if not, there was a change during the upload and it has to be uploaded again). Once the image is unmounted, `COW_MIN_UPLOAD_DELAY` is ignored and all blocks that have a time other than 0 are uploaded. The upload is done via a [rest request](#/api/file/update). There are two different limits for the number of parallel uploads in the [config.h](#config-variables). +For uploading clusters, there is a background thread that periodically loops over all cow clusters and checks whether `timeChanged` is not 0 and the time difference between now and `timeChanged` is greater than `COW_MIN_UPLOAD_DELAY`. +If so, the entire cluster is uploaded. The `timeChanged` before the upload is remembered. +After the upload, `timeChanged` is set to 0 if it still has the same time as the temporarily stored one (if not, there was a change during the upload and it has to be uploaded again). +Once the image is unmounted, `COW_MIN_UPLOAD_DELAY` is ignored and all clusters that still have a `timeChanged` other than 0 are uploaded. +The upload is done via a [rest request](#/api/file/update). +There are two different limits for the number of parallel uploads in the [config/cow.h](#config-variables). ## Files + When a new CoW session is started, a new `meta`, `data` and, if so set in the command line arguments, a `status.txt` file is created. ### status + The file `status.txt` can be activated with the command line parameter `--cowStatFile`. The file will contain the following: @@ -123,28 +129,29 @@ uuid=<uuid> state=backgroundUpload inQueue=0 modifiedBlocks=0 -idleBlocks=0 -totalBlocksUploaded=0 -activeUploads:0 +idleClusters=0 +totalClustersUploaded=0 +activeUploads=0 ulspeed=0.00 ``` - The `uuid` is the session uuid used by the Cow server to identify the session. -- The `status` is `backgroundUpload` when the image is still mounted and cow blocks are uploaded in the background. -It is `uploading` when the image has been unmounted and all blocks that have not yet been uploaded are uploaded. -It is `done` when the image has been unmounted and all blocks have been uploaded. -- `Queue` are the cow blocks that are currently being uploaded or are waiting for a free slot. -- `ModifiedBlocks` are cow blocks that have changes that have not yet been uploaded to the server because the changes are too recent. -- `totalBlocksUploaded` the total amount of cow blocks uploaded since the image was mounted. -- `activeUploads` is the number of blocks currently being uploaded. +- The `status` is `backgroundUpload` when the image is still mounted and cow clusters are uploaded in the background. +It is `uploading` when the image has been unmounted and all clusters that have not yet been uploaded are uploaded. +It is `done` when the image has been unmounted and all clusters have been uploaded. +- `Queue` are the cow clusters that are currently being uploaded or are waiting for a free slot. +- `ModifiedClusters` are cow clusters that have changes that have not yet been uploaded to the server because the changes are too recent. +- `totalClustersUploaded` the total amount of cow clusters uploaded since the image was mounted. +- `activeUploads` is the number of clusters currently being uploaded. - `ulspeed` the current upload speed in kb/s. -Once all blocks have been uploaded, the status is set to `done`. -If you define `COW_DUMP_BLOCK_UPLOADS`, a list of all blocks, sorted by the number of uploads, is copied to the status.txt file after the block upload is completed. +Once all clusters have been uploaded, the status is set to `done`. +If you define `COW_DUMP_BLOCK_UPLOADS`, a list of all clusters, sorted by the number of uploads, is copied to the status.txt file after the cluster upload is completed. With the command line parameter `--cowStatStdout` the same output of the stats file will be printed in stdout. ### meta + The `meta` file contains the following header: ```C // cowfile.h @@ -167,12 +174,13 @@ typedef struct cowfile_metadata_header } cowfile_metadata_header_t; ``` After this header, the above-mentioned l1 and then the l2 data structure begins at byte 8192. -### data -The `data` files contain the magicValue and at the 40 * 8 * 4096 offset (capacity of a cowfile_metadata_header_t) the first data block starts. +### data +The `data` file starts with `COW_FILE_DATA_MAGIC_VALUE` and at the `COW_DATA_CLUSTER_SIZE` (40 * 8 * 4096) offset the first cluster starts. ### magic values in the file headers + The magic values in both files are used to ensure that an appropriate file is read and that the machine has the correct endianness. ```C //config.h @@ -180,25 +188,24 @@ The magic values in both files are used to ensure that an appropriate file is re #define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file ``` - ### Threads -This extension uses two new threads: + +This extension uses two new threads: ``` tidCowUploader tidStatUpdater ``` -```tidCowUploader``` is the thread that uploads blocks to the cow server. +`tidCowUploader` is the thread that uploads blocks to the cow server. -```tidStatUpdater``` updates the stats in stdout or the stats files (depending on parameters). +`tidStatUpdater` updates the stats in stdout or the stats files (depending on parameters). ### Locks -This extension uses a new lock ```cow.l2CreateLock```. It is used when a new L2 array is allocated. - - +This extension uses a new lock `cow.l2CreateLock`. It is used when a new L2 table is allocated. ### Config Variables -The following configuration variables have been added to ```config.h```. + +The following configuration variables have been added to `config/cow.h`. ```c //config.h // +++++ COW +++++ @@ -206,22 +213,26 @@ The following configuration variables have been added to ```config.h```. #define COW_FILE_META_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825E) // Magic Value to recognize a Cow meta file #define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file #define COW_MIN_UPLOAD_DELAY 60 // in seconds -#define COW_STATS_UPDATE_TIME 5 // time in seconds the cow status files gets updated (while uploading blocks) +#define COW_STATS_UPDATE_TIME 5 // time in seconds the cow status files gets updated (while uploading clusters) #define COW_MAX_PARALLEL_UPLOADS 10 // maximum number of parallel uploads #define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted #define COW_URL_STRING_SIZE 500 // Max string size for an url #define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file #define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb) // +++++ COW API Endpoints +++++ -#define COW_API_CREATE "%s/api/File/Create" -#define COW_API_UPDATE "%s/api/File/Update?guid=%s&BlockNumber=%lu" -#define COW_API_START_MERGE "%s/api/File/StartMerge" +#define COW_API_CREATE "%s/api/file/create" +#define COW_API_UPDATE "%s/api/file/update?guid=%s&clusterindex=%lu" +#define COW_API_START_MERGE "%s/api/file/merge" ``` -- ```COW_MIN_UPLOAD_DELAY``` sets the minimum time in seconds that must have elapsed since the last change to a cow block before it is uploaded. This value can be fine-tuned. A larger value usually reduces the double uploading of blocks. A smaller value reduces the time for the final upload after the image has been unmounted. If you set `COW_DUMP_BLOCK_UPLOADS` and set the command line parameter `--cowStatFile`, then a list of all blocks, sorted by the number of uploads, will be written to the status.txt file after the block upload is complete. This can help in fine-tuning `COW_MIN_UPLOAD_DELAY`. -- ```COW_STATS_UPDATE_TIME``` defines the update frequency of the stdout output/statistics file in seconds. Setting it too low could affect performance as a loop runs over all blocks. -- ```COW_MAX_PARALLEL_BACKGROUND_UPLOADS``` defines the maximum number of parallel block uploads. This number is used when the image is still mounted and the user is still using it. -- ```COW_MAX_PARALLEL_UPLOADS``` defines the maximum number of parallel block uploads. This number is used once the image has been unmounted to upload the remaining modified blocks. +- `COW_MIN_UPLOAD_DELAY` sets the minimum time in seconds that must have elapsed since the last change to a cow cluster before it is uploaded. +This value can be fine-tuned. A larger value usually reduces redundant uploading of clusters. +A smaller value reduces the time for the final upload after the image has been unmounted. +If you set `COW_DUMP_BLOCK_UPLOADS` and set the command line parameter `--cowStatFile`, then a list of all clusters, sorted by the number of uploads, will be written to the status.txt file after the cluster upload is complete. +This can help in fine-tuning `COW_MIN_UPLOAD_DELAY`. +- `COW_STATS_UPDATE_TIME` defines the update frequency of the stdout output/statistics file in seconds. Setting it too low could affect performance as a loop runs over all clusters. +- `COW_MAX_PARALLEL_BACKGROUND_UPLOADS` defines the maximum number of parallel cluster uploads. This number is used when the image is still mounted and the user is still using it. +- `COW_MAX_PARALLEL_UPLOADS` defines the maximum number of parallel cluster uploads. This number is used once the image has been unmounted to upload the remaining modified clusters. @@ -248,7 +259,7 @@ This request is used as soon as a new cow session is created. The returned guid | Name | Located in | Description | Required | Schema | | ---- | ---------- | ----------- | -------- | ---- | | guid | query | | Yes | string (uuid) | -| blockNumber | query | | Yes | integer | +| clusterNumber | query | | Yes | integer | ##### Responses @@ -256,7 +267,7 @@ This request is used as soon as a new cow session is created. The returned guid | ---- | ----------- | | 200 | Success | -Used to upload a data block. The block number is the absolute block number. The body contains an "application/octet-stream", where the first bytes are the bit field, directly followed by the actual block data. +Used to upload a cluster. The cluster number is the absolute cluster number. The body contains an "application/octet-stream", where the first bytes are the bit field, directly followed by the actual cluster data. ### /api/File/StartMerge @@ -292,7 +303,7 @@ Used to start the merge on the server. | ---- | ----------- | | 200 | Success | -This request returns a list containing the block IDs and the number of uploads of this block, sorted by the number of uploads. This is useful to adjust the `COW_MIN_UPLOAD_DELAY`. +This request returns a list containing the cluster IDs and the number of uploads, sorted by the number of uploads. This is useful if you want to fine-tune `COW_MIN_UPLOAD_DELAY`. ### /api/File/Status @@ -317,7 +328,7 @@ Returns the SessionStatus model that provides information about the session. | Name | Type | Description | Required | | ---- | ---- | ----------- | -------- | -| blockNumber | integer | | Yes | +| clusterNumber | integer | | Yes | | modifications | integer | | Yes | #### SessionState @@ -334,5 +345,5 @@ Returns the SessionStatus model that provides information about the session. | imageName | string | | Yes | | originalImageVersion | integer | | Yes | | newImageVersion | integer | | Yes | -| mergedBlocks | integer | | Yes | -| totalBlocks | integer | | Yes | +| mergedClusters | integer | | Yes | +| totalClusters | integer | | Yes | diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c index b2df74f..8e816a2 100644 --- a/src/fuse/cowfile.c +++ b/src/fuse/cowfile.c @@ -1,8 +1,20 @@ #include "cowfile.h" -#include "math.h" +#include "main.h" +#include "connection.h" + +#include <dnbd3/shared/log.h> +#include <sys/mman.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> +#include <curl/curl.h> + +#define UUID_STRLEN 36 + extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ); -static int cowFileVersion = 1; +static const int CURRENT_COW_VERSION = 1; + static bool statStdout; static bool statFile; static pthread_t tidCowUploader; @@ -13,8 +25,8 @@ static cowfile_metadata_header_t *metadata = NULL; static atomic_uint_fast64_t bytesUploaded; static uint64_t totalBlocksUploaded = 0; static int activeUploads = 0; -atomic_bool uploadLoop = true; -atomic_bool uploadLoopDone = false; +atomic_bool uploadLoop = true; // Keep upload loop running? +atomic_bool uploadLoopDone = false; // Upload loop has finished all work? static struct cow { @@ -31,25 +43,25 @@ static struct cow } cow; /** - * @brief Computes the l1 offset from the absolute file offset + * @brief Computes the l1 index for an absolute file offset * * @param offset absolute file offset - * @return int l2 offset + * @return int l1 index */ -static int getL1Offset( size_t offset ) +static int offsetToL1Index( size_t offset ) { - return (int)( offset / COW_L2_STORAGE_CAPACITY ); + return (int)( offset / COW_FULL_L2_TABLE_DATA_SIZE ); } /** - * @brief Computes the l2 offset from the absolute file offset + * @brief Computes the l2 index for an absolute file offset * * @param offset absolute file offset - * @return int l2 offset + * @return int l2 index */ -static int getL2Offset( size_t offset ) +static int offsetToL2Index( size_t offset ) { - return (int)( ( offset % COW_L2_STORAGE_CAPACITY ) / COW_METADATA_STORAGE_CAPACITY ); + return (int)( ( offset % COW_FULL_L2_TABLE_DATA_SIZE ) / COW_DATA_CLUSTER_SIZE ); } /** @@ -58,7 +70,7 @@ static int getL2Offset( size_t offset ) * @param offset absolute file offset * @return int bit(0-319) in the bitfield */ -static int getBitfieldOffset( size_t offset ) +static int getBitfieldOffsetBit( size_t offset ) { return (int)( offset / DNBD3_BLOCK_SIZE ) % ( COW_BITFIELD_SIZE * 8 ); } @@ -84,7 +96,7 @@ static void setBits( atomic_char *byte, int from, int to, bool value ) /** * @brief Sets the specified bits in the specified range threadsafe to 1. * - * @param bitfield of a cow_block_metadata + * @param bitfield of a cow_l2_entry * @param from start bit * @param to end bit * @param value set bits to 1 or 0 @@ -104,7 +116,7 @@ static void setBitsInBitfield( atomic_char *bitfield, int from, int to, bool val /** * @brief Checks if the n bit of a bit field is 0 or 1. * - * @param bitfield of a cow_block_metadata + * @param bitfield of a cow_l2_entry * @param n the bit which should be checked */ static bool checkBit( atomic_char *bitfield, int n ) @@ -126,13 +138,14 @@ static bool checkBit( atomic_char *bitfield, int n ) */ size_t curlCallbackCreateSession( char *buffer, size_t itemSize, size_t nitems, void *response ) { - size_t bytes = itemSize * nitems; - if ( strlen( response ) + bytes != 36 ) { - logadd( LOG_INFO, "strlen(response): %lu bytes: %lu \n", strlen( response ), bytes ); + uint64_t done = strlen( response ); + uint64_t bytes = itemSize * nitems; + if ( done + bytes > UUID_STRLEN ) { + logadd( LOG_INFO, "strlen(response): %"PRIu64" bytes: %"PRIu64"\n", done, bytes ); return bytes; } - strncat( response, buffer, 36 ); + strncat( response, buffer, UUID_STRLEN - done + 1 ); return bytes; } @@ -185,12 +198,12 @@ bool createSession( const char *imageName, uint16_t version ) long http_code = 0; curl_easy_getinfo( curl, CURLINFO_RESPONSE_CODE, &http_code ); - if ( http_code != 200 ) { + if ( http_code < 200 || http_code >= 300 ) { logadd( LOG_ERROR, "COW_API_CREATE failed http: %ld\n", http_code ); return false; } curl_easy_reset( curl ); - metadata->uuid[36] = '\0'; + metadata->uuid[UUID_STRLEN] = '\0'; logadd( LOG_DEBUG1, "Cow session started, guid: %s\n", metadata->uuid ); return true; } @@ -211,21 +224,30 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void * { cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata; size_t len = 0; + // Check if we're still in the bitfield if ( uploadBlock->position < (size_t)metadata->bitfieldSize ) { size_t lenCpy = MIN( metadata->bitfieldSize - uploadBlock->position, size * nmemb ); memcpy( ptr, uploadBlock->block->bitfield + uploadBlock->position, lenCpy ); uploadBlock->position += lenCpy; len += lenCpy; } + // No elseif here, might just have crossed over... if ( uploadBlock->position >= (size_t)metadata->bitfieldSize ) { - size_t lenRead = MIN( COW_METADATA_STORAGE_CAPACITY - ( uploadBlock->position - ( metadata->bitfieldSize ) ), + ssize_t wantRead = (ssize_t)MIN( + COW_DATA_CLUSTER_SIZE - ( uploadBlock->position - ( metadata->bitfieldSize ) ), ( size * nmemb ) - len ); - off_t inBlockOffset = uploadBlock->position - metadata->bitfieldSize; - size_t lengthRead = pread( cow.fhd, ( ptr + len ), lenRead, uploadBlock->block->offset + inBlockOffset ); + off_t inClusterOffset = uploadBlock->position - metadata->bitfieldSize; + ssize_t lengthRead = pread( cow.fhd, ( ptr + len ), wantRead, uploadBlock->block->offset + inClusterOffset ); + if ( lengthRead == -1 ) { + logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno ); + return CURL_READFUNC_ABORT; + } - if ( lenRead != lengthRead ) { + if ( wantRead > lengthRead ) { // fill up since last block may not be a full block - lengthRead = lenRead; + memset( ptr + len + lengthRead, 0, wantRead - lengthRead ); + // TODO what about partial read? We should know how much data there actually is... + lengthRead = wantRead; } uploadBlock->position += lengthRead; len += lengthRead; @@ -236,7 +258,6 @@ size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void * /** * @brief Requests the merging of the image on the cow server. - */ bool mergeRequest() { @@ -316,8 +337,8 @@ void startMerge() * @param ulNow number of bytes uploaded by this transfer so far. * @return int always returns 0 to continue the callbacks. */ -int progress_callback( void *clientp, __attribute__( ( unused ) ) curl_off_t dlTotal, - __attribute__( ( unused ) ) curl_off_t dlNow, __attribute__( ( unused ) ) curl_off_t ulTotal, curl_off_t ulNow ) +int progress_callback( void *clientp, __attribute__((unused)) curl_off_t dlTotal, + __attribute__((unused)) curl_off_t dlNow, __attribute__((unused)) curl_off_t ulTotal, curl_off_t ulNow ) { CURL *eh = (CURL *)clientp; cow_curl_read_upload_t *curlUploadBlock; @@ -344,37 +365,44 @@ int progress_callback( void *clientp, __attribute__( ( unused ) ) curl_off_t dlT void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, char *speedBuffer ) { char buffer[300]; - char state[30]; + const char *state; + if ( uploadLoop ) { - snprintf( state, 30, "%s", "backgroundUpload" ); + state = "backgroundUpload"; } else if ( !uploadLoopDone ) { - snprintf( state, 30, "%s", "uploading" ); + state = "uploading"; } else { - snprintf( state, 30, "%s", "done" ); + state = "done"; } int len = snprintf( buffer, 300, "state=%s\n" "inQueue=%" PRIu64 "\n" - "modifiedBlocks=%" PRIu64 "\n" - "idleBlocks=%" PRIu64 "\n" - "totalBlocksUploaded=%" PRIu64 "\n" - "activeUploads:%i\n" - "%s=%s", - state, inQueue, modified, idle, totalBlocksUploaded, activeUploads, COW_SHOW_UL_SPEED ? "ulspeed" : "", + "modifiedClusters=%" PRIu64 "\n" + "idleClusters=%" PRIu64 "\n" + "totalClustersUploaded=%" PRIu64 "\n" + "activeUploads=:%i\n" + "%s%s", + state, inQueue, modified, idle, totalBlocksUploaded, activeUploads, + COW_SHOW_UL_SPEED ? "ulspeed=" : "", speedBuffer ); + if ( len == -1 ) { + logadd( LOG_ERROR, "snprintf error" ); + return; + } + if ( statStdout ) { logadd( LOG_INFO, "%s", buffer ); } if ( statFile ) { - if ( pwrite( cow.fhs, buffer, len, 43 ) != len ) { + // Pad with a bunch of newlines so we don't change the file size all the time + ssize_t extra = MIN( 20, sizeof(buffer) - len - 1 ); + memset( buffer + len, '\n', extra ); + if ( pwrite( cow.fhs, buffer, len + extra, 43 ) != len ) { logadd( LOG_WARNING, "Could not update cow status file" ); } - if ( ftruncate( cow.fhs, 43 + len ) ) { - logadd( LOG_WARNING, "Could not truncate cow status file" ); - } #ifdef COW_DUMP_BLOCK_UPLOADS if ( !uploadLoop && uploadLoopDone ) { dumpBlockUploads(); @@ -392,19 +420,19 @@ int cmpfunc( const void *a, const void *b ) */ void dumpBlockUploads() { - long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_L2_STORAGE_CAPACITY ); + long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); - cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_SIZE]; + cow_block_upload_statistics_t blockUploads[l1MaxOffset * COW_L2_TABLE_SIZE]; uint64_t currentBlock = 0; - for ( long unsigned int l1Offset = 0; l1Offset < l1MaxOffset; l1Offset++ ) { - if ( cow.l1[l1Offset] == -1 ) { + for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { + if ( cow.l1[l1Index] == -1 ) { continue; } - for ( int l2Offset = 0; l2Offset < COW_L2_SIZE; l2Offset++ ) { - cow_block_metadata_t *block = ( cow.firstL2[cow.l1[l1Offset]] + l2Offset ); + for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { + cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); blockUploads[currentBlock].uploads = block->uploads; - blockUploads[currentBlock].blocknumber = ( l1Offset * COW_L2_SIZE + l2Offset ); + blockUploads[currentBlock].blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); currentBlock++; } } @@ -441,7 +469,7 @@ bool addUpload( CURLM *cm, cow_curl_read_upload_t *curlUploadBlock, struct curl_ curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L ); curl_easy_setopt( - eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_METADATA_STORAGE_CAPACITY ) ); + eh, CURLOPT_POSTFIELDSIZE_LARGE, (long)( metadata->bitfieldSize + COW_DATA_CLUSTER_SIZE ) ); if ( COW_SHOW_UL_SPEED ) { curlUploadBlock->ulLast = 0; curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L ); @@ -475,7 +503,8 @@ bool finishUpload( CURLM *cm, CURLMsg *msg, struct curl_slist *headers ) long http_code = 0; res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code ); - if ( res != CURLE_OK || res2 != CURLE_OK || http_code != 200 || msg->msg != CURLMSG_DONE ) { + if ( res != CURLE_OK || res2 != CURLE_OK || http_code < 200 || http_code >= 300 + || msg->msg != CURLMSG_DONE ) { curlUploadBlock->fails++; logadd( LOG_ERROR, "COW_API_UPDATE failed %i/5: %s\n", curlUploadBlock->fails, curl_easy_strerror( msg->data.result ) ); @@ -521,7 +550,7 @@ bool MessageHandler( do { curl_multi_perform( cm, &activeUploads ); - while ( ( msg = curl_multi_info_read( cm, &msgsLeft ) ) ) { + while ( ( msg = curl_multi_info_read( cm, &msgsLeft ) ) != NULL ) { if ( !finishUpload( cm, msg, headers ) ) { status = false; } @@ -553,38 +582,44 @@ bool uploaderLoop( bool ignoreMinUploadDelay, CURLM *cm ) { bool success = true; struct curl_slist *headers = NULL; + const time_t now = time( NULL ); headers = curl_slist_append( headers, "Content-Type: application/octet-stream" ); - long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_L2_STORAGE_CAPACITY ); - for ( long unsigned int l1Offset = 0; l1Offset < l1MaxOffset; l1Offset++ ) { - if ( cow.l1[l1Offset] == -1 ) { - continue; + long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); + // Iterate over all blocks, L1 first + for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { + if ( cow.l1[l1Index] == -1 ) { + continue; // Not allocated } - for ( int l2Offset = 0; l2Offset < COW_L2_SIZE; l2Offset++ ) { - cow_block_metadata_t *block = ( cow.firstL2[cow.l1[l1Offset]] + l2Offset ); + // Now all L2 blocks + for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { + cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { - continue; + continue; // Not allocated } - if ( block->timeChanged != 0 ) { - if ( ( time( NULL ) - block->timeChanged > COW_MIN_UPLOAD_DELAY ) || ignoreMinUploadDelay ) { - do { - if ( !MessageHandler( cm, true, ignoreMinUploadDelay, headers ) ) { - success = false; - } - } while ( !( activeUploads < ( ignoreMinUploadDelay ? COW_MAX_PARALLEL_UPLOADS - : COW_MAX_PARALLEL_BACKGROUND_UPLOADS ) ) - && activeUploads ); - cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) ); - b->block = block; - b->blocknumber = ( l1Offset * COW_L2_SIZE + l2Offset ); - b->fails = 0; - b->position = 0; - b->time = block->timeChanged; - addUpload( cm, b, headers ); - if ( !ignoreMinUploadDelay && !uploadLoop ) { - goto DONE; - } + if ( block->timeChanged == 0 ) { + continue; // Not changed + } + if ( !ignoreMinUploadDelay && ( now - block->timeChanged < COW_MIN_UPLOAD_DELAY ) ) { + continue; // Last change not old enough + } + // Run curl mainloop at least one, but keep doing so while max concurrent uploads is reached + do { + if ( !MessageHandler( cm, true, ignoreMinUploadDelay, headers ) ) { + success = false; } + } while ( ( activeUploads >= ( ignoreMinUploadDelay ? COW_MAX_PARALLEL_UPLOADS + : COW_MAX_PARALLEL_BACKGROUND_UPLOADS ) ) + && activeUploads > 0 ); + cow_curl_read_upload_t *b = malloc( sizeof( cow_curl_read_upload_t ) ); + b->block = block; + b->blocknumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index ); + b->fails = 0; + b->position = 0; + b->time = block->timeChanged; + addUpload( cm, b, headers ); + if ( !ignoreMinUploadDelay && !uploadLoop ) { + goto DONE; } } } @@ -611,14 +646,14 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) int modified = 0; int inQueue = 0; int idle = 0; - long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_L2_STORAGE_CAPACITY ); + long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE ); uint64_t now = time( NULL ); - for ( long unsigned int l1Offset = 0; l1Offset < l1MaxOffset; l1Offset++ ) { - if ( cow.l1[l1Offset] == -1 ) { + for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) { + if ( cow.l1[l1Index] == -1 ) { continue; } - for ( int l2Offset = 0; l2Offset < COW_L2_SIZE; l2Offset++ ) { - cow_block_metadata_t *block = ( cow.firstL2[cow.l1[l1Offset]] + l2Offset ); + for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) { + cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { continue; } @@ -652,7 +687,7 @@ void *cowfile_statUpdater( __attribute__( ( unused ) ) void *something ) /** * @brief main loop for blockupload in the background */ -void *cowfile_uploader( __attribute__( ( unused ) ) void *something ) +static void *uploaderThreadMain( __attribute__((unused)) void *something ) { CURLM *cm; @@ -691,7 +726,7 @@ void *cowfile_uploader( __attribute__( ( unused ) ) void *something ) * @return true * @return false if failed to create or to write into the file */ -bool createCowStatsFile( char *path ) +static bool createCowStatsFile( char *path ) { char pathStatus[strlen( path ) + 12]; @@ -721,9 +756,10 @@ bool createCowStatsFile( char *path ) * * @param path where the files should be stored * @param image_Name name of the original file/image - * @param imageSizePtr + * @param imageSizePtr */ -bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, atomic_uint_fast64_t **imageSizePtr, +bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, + atomic_uint_fast64_t **imageSizePtr, char *serverAddress, bool sStdout, bool sfile ) { statStdout = sStdout; @@ -746,12 +782,10 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, at int maxPageSize = 8192; - - size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t ) + strlen( image_Name ); - + size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t ); cow.maxImageSize = COW_MAX_IMAGE_SIZE; - cow.l1Size = ( ( cow.maxImageSize + COW_L2_STORAGE_CAPACITY - 1LL ) / COW_L2_STORAGE_CAPACITY ); + cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); // size of l1 array + number of l2's * size of l2 size_t metadata_size = cow.l1Size * sizeof( l1 ) + cow.l1Size * sizeof( l2 ); @@ -760,14 +794,13 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, at size_t meta_data_start = ( ( metaDataSizeHeader + maxPageSize - 1 ) / maxPageSize ) * maxPageSize; size_t metadataFileSize = meta_data_start + metadata_size; - if ( pwrite( cow.fhm, "", 1, metadataFileSize ) != 1 ) { - logadd( LOG_ERROR, "Could not write cow meta_data_table to file. Bye.\n" ); + if ( ftruncate( cow.fhm, metadataFileSize ) != 0 ) { + logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). Bye.\n", errno ); return false; } cow.metadata_mmap = mmap( NULL, metadataFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fhm, 0 ); - if ( cow.metadata_mmap == MAP_FAILED ) { logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); return false; @@ -775,10 +808,9 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, at metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); metadata->magicValue = COW_FILE_META_MAGIC_VALUE; - metadata->version = cowFileVersion; - metadata->dataFileSize = ATOMIC_VAR_INIT( 0 ); - metadata->metadataFileSize = ATOMIC_VAR_INIT( 0 ); - metadata->metadataFileSize = metadataFileSize; + metadata->version = CURRENT_COW_VERSION; + metadata->dataFileSize = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); + metadata->metadataFileSize = ATOMIC_VAR_INIT( metadataFileSize ); metadata->blocksize = DNBD3_BLOCK_SIZE; metadata->originalImageSize = **imageSizePtr; metadata->imageSize = metadata->originalImageSize; @@ -802,12 +834,9 @@ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, at logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" ); return false; } - // move the dataFileSize to make room for the header - atomic_store( &metadata->dataFileSize, COW_METADATA_STORAGE_CAPACITY ); pthread_mutex_init( &cow.l2CreateLock, NULL ); - cowServerAddress = serverAddress; curl_global_init( CURL_GLOBAL_ALL ); curl = curl_easy_init(); @@ -842,11 +871,11 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" ); - if ( ( cow.fhm = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fhm = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" ); return false; } - if ( ( cow.fhd = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) { + if ( ( cow.fhd = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) { logadd( LOG_ERROR, "Could not open cow data file. Bye.\n" ); return false; } @@ -874,8 +903,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server return false; } struct stat st; - stat( pathMeta, &st ); - if ( (long)st.st_size < (long)header.metaDataStart + (long)header.nextL2 * (long)sizeof( l2 ) ) { + fstat( cow.fhm, &st ); + if ( st.st_size < (off_t)( header.metaDataStart + header.nextL2 * sizeof( l2 ) ) ) { logadd( LOG_ERROR, "cow meta file to small. Bye.\n" ); return false; } @@ -896,8 +925,8 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server return false; } struct stat st; - stat( pathData, &st ); - if ( (long)header.dataFileSize < st.st_size ) { + fstat( cow.fhd, &st ); + if ( (off_t)header.dataFileSize > st.st_size ) { logadd( LOG_ERROR, "cow data file to small. Bye.\n" ); return false; } @@ -909,8 +938,9 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server logadd( LOG_ERROR, "Error while mapping mmap:\n%s \n Bye.\n", strerror( errno ) ); return false; } - if ( header.version != cowFileVersion ) { - logadd( LOG_ERROR, "Error wrong file version got: %i expected: 1. Bye.\n", metadata->version ); + if ( header.version != CURRENT_COW_VERSION ) { + logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.\n", + metadata->version, CURRENT_COW_VERSION ); return false; } @@ -920,7 +950,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server *imageSizePtr = &metadata->imageSize; cow.l1 = (l1 *)( cow.metadata_mmap + metadata->metaDataStart ); cow.maxImageSize = metadata->maxImageSize; - cow.l1Size = ( ( cow.maxImageSize + COW_L2_STORAGE_CAPACITY - 1LL ) / COW_L2_STORAGE_CAPACITY ); + cow.l1Size = ( ( cow.maxImageSize + COW_FULL_L2_TABLE_DATA_SIZE - 1LL ) / COW_FULL_L2_TABLE_DATA_SIZE ); cow.firstL2 = (l2 *)( ( (char *)cow.l1 ) + cow.l1Size ); pthread_mutex_init( &cow.l2CreateLock, NULL ); @@ -933,7 +963,7 @@ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *server */ bool cowfile_startBackgroundThreads() { - if( pthread_create( &tidCowUploader, NULL, &cowfile_uploader, NULL ) != 0 ) { + if( pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL ) != 0 ) { logadd( LOG_ERROR, "Could not create cow uploader thread"); return false; } @@ -952,61 +982,63 @@ bool cowfile_startBackgroundThreads() { * @param buffer containing the data * @param size of the buffer * @param netSize which actually contributes to the fuse write request (can be different from size if partial full blocks are written) - * @param cowRequest - * @param block - * @param inBlockOffset + * @param cowRequest <---- !???? TODO + * @param block block being written to + * @param inClusterOffset offset in this cluster to be written to */ static void writeData( const char *buffer, ssize_t size, size_t netSize, atomic_int *errorCode, - atomic_size_t *bytesWorkedOn, cow_block_metadata_t *block, off_t inBlockOffset ) + atomic_size_t *bytesWorkedOn, cow_l2_entry_t *block, off_t inClusterOffset ) { + // TODO: Assert that size + inClusterOffset <= COW_DATA_CLUSTER_SIZE? ssize_t totalBytesWritten = 0; while ( totalBytesWritten < size ) { ssize_t bytesWritten = pwrite( cow.fhd, ( buffer + totalBytesWritten ), size - totalBytesWritten, - block->offset + inBlockOffset + totalBytesWritten ); + block->offset + inClusterOffset + totalBytesWritten ); if ( bytesWritten == -1 ) { - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inBlockOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inBlockOffset, block->offset ); *errorCode = errno; + logadd( LOG_ERROR, + "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, + netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); break; } else if ( bytesWritten == 0 ) { - logadd( LOG_ERROR, - "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inBlockOffset:%ld block->offset:%ld \n", size, - netSize, *errorCode, *bytesWorkedOn, inBlockOffset, block->offset ); *errorCode = EIO; + logadd( LOG_ERROR, + "size:%zu netSize:%zu errorCode:%i bytesWorkedOn:%zu inClusterOffset:%ld block->offset:%ld \n", size, + netSize, *errorCode, *bytesWorkedOn, inClusterOffset, block->offset ); break; } totalBytesWritten += bytesWritten; } atomic_fetch_add( bytesWorkedOn, netSize ); - setBitsInBitfield( block->bitfield, (int)( inBlockOffset / DNBD3_BLOCK_SIZE ), - (int)( ( inBlockOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 ); + setBitsInBitfield( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), + (int)( ( inClusterOffset + totalBytesWritten - 1 ) / DNBD3_BLOCK_SIZE ), 1 ); block->timeChanged = time( NULL ); } /** - * @brief Increases the metadata->dataFileSize by COW_METADATA_STORAGE_CAPACITY. + * @brief Increases the metadata->dataFileSize by COW_DATA_CLUSTER_SIZE. * The space is not reserved on disk. * * @param block for which the space should be reserved. */ -static bool allocateMetaBlockData( cow_block_metadata_t *block ) +static bool allocateMetaBlockData( cow_l2_entry_t *block ) { - block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_METADATA_STORAGE_CAPACITY ); + block->offset = (atomic_long)atomic_fetch_add( &metadata->dataFileSize, COW_DATA_CLUSTER_SIZE ); return true; } /** - * @brief Get the cow_block_metadata_t from l1Offset and l2Offset + * @brief Get the cow_l2_entry_t from l1Index and l2Index. + * l1 offset must be valid * - * @param l1Offset - * @param l2Offset - * @return cow_block_metadata_t* + * @param l1Index + * @param l2Index + * @return cow_l2_entry_t* */ -static cow_block_metadata_t *getBlock( int l1Offset, int l2Offset ) +static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index ) { - cow_block_metadata_t *block = ( cow.firstL2[cow.l1[l1Offset]] + l2Offset ); + cow_l2_entry_t *block = ( cow.firstL2[cow.l1[l1Index]] + l2Index ); if ( block->offset == -1 ) { allocateMetaBlockData( block ); } @@ -1014,15 +1046,15 @@ static cow_block_metadata_t *getBlock( int l1Offset, int l2Offset ) } /** - * @brief creates an new L2 Block and initializes the containing cow_block_metadata_t blocks + * @brief creates an new L2 Block and initializes the containing cow_l2_entry_t blocks * - * @param l1Offset + * @param l1Index */ -static bool createL2Block( int l1Offset ) +static bool createL2Block( int l1Index ) { pthread_mutex_lock( &cow.l2CreateLock ); - if ( cow.l1[l1Offset] == -1 ) { - for ( int i = 0; i < COW_L2_SIZE; i++ ) { + if ( cow.l1[l1Index] == -1 ) { + for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) { cow.firstL2[metadata->nextL2][i].offset = -1; cow.firstL2[metadata->nextL2][i].timeChanged = ATOMIC_VAR_INIT( 0 ); cow.firstL2[metadata->nextL2][i].uploads = ATOMIC_VAR_INIT( 0 ); @@ -1030,7 +1062,7 @@ static bool createL2Block( int l1Offset ) cow.firstL2[metadata->nextL2][i].bitfield[j] = ATOMIC_VAR_INIT( 0 ); } } - cow.l1[l1Offset] = metadata->nextL2; + cow.l1[l1Index] = metadata->nextL2; metadata->nextL2 += 1; } pthread_mutex_unlock( &cow.l2CreateLock ); @@ -1038,12 +1070,12 @@ static bool createL2Block( int l1Offset ) } /** - * @brief Is called once an fuse write request ist finished. + * @brief Is called once a fuse write request ist finished. * Calls the corrsponding fuse reply depending on the type and * success of the request. * * @param req fuse_req_t - * @param cowRequest + * @param cowRequest */ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) @@ -1052,7 +1084,9 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) fuse_reply_err( req, cowRequest->errorCode ); } else { - metadata->imageSize = MAX( metadata->imageSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset ); + uint64_t oldSize = metadata->imageSize; + uint64_t ns = MAX( oldSize, cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset ); + atomic_compare_exchange_strong( &metadata->imageSize, &oldSize, ns ); fuse_reply_write( req, cowRequest->bytesWorkedOn ); } free( cowRequest ); @@ -1060,18 +1094,19 @@ static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest ) /** * @brief Called after the padding data was received from the dnbd3 server. - * The data from the write request will be combined witch the data from the server + * The data from the write request will be combined with the data from the server * so that we get a full DNBD3_BLOCK and is then written on the disk. * @param sRequest */ static void writePaddedBlock( cow_sub_request_t *sRequest ) { //copy write Data - memcpy( ( sRequest->writeBuffer + ( sRequest->inBlockOffset % DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc, + // TODO Assert that we have enough space in writeBuffer at that offset + memcpy( ( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ), sRequest->writeSrc, sRequest->size ); writeData( sRequest->writeBuffer, DNBD3_BLOCK_SIZE, (ssize_t)sRequest->size, &sRequest->cowRequest->errorCode, &sRequest->cowRequest->bytesWorkedOn, sRequest->block, - ( sRequest->inBlockOffset - ( sRequest->inBlockOffset % DNBD3_BLOCK_SIZE ) ) ); + ( sRequest->inClusterOffset - ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) ) ); if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { @@ -1081,49 +1116,49 @@ static void writePaddedBlock( cow_sub_request_t *sRequest ) } /** - * @brief If an block does not start or finishes on an multiple of DNBD3_BLOCK_SIZE, the blocks needs to be - * padded. If this block is inside the original image size, the padding data will be read fro the server - * otherwise it will be padded with 0 since the it must be the block at the end of the image. + * @brief If a block does not start or finish on an multiple of DNBD3_BLOCK_SIZE, the blocks need to be + * padded. If this block is inside the original image size, the padding data will be read from the server. + * Otherwise it will be padded with 0 since the it must be the block at the end of the image. + * TODO: Properly document the arguments and what value range they can be, i.e. see below for the 4k case * */ static void padBlockFromRemote( fuse_req_t req, off_t offset, cow_request_t *cowRequest, const char *buffer, - size_t size, cow_block_metadata_t *block, off_t inBlockOffset ) + size_t size, cow_l2_entry_t *block, off_t inClusterOffset ) { - if ( offset > (off_t)metadata->originalImageSize ) { - //pad 0 and done - inBlockOffset -= inBlockOffset % DNBD3_BLOCK_SIZE; + // TODO: Is this *guaranteed* to be the case on the caller site? Add comment to ^ + assert( ( offset % DNBD3_BLOCK_SIZE ) + size <= DNBD3_BLOCK_SIZE ); + if ( offset >= (off_t)metadata->originalImageSize ) { + // Writing past the end of the image + inClusterOffset -= inClusterOffset % DNBD3_BLOCK_SIZE; char buf[DNBD3_BLOCK_SIZE] = { 0 }; - memcpy( buf + ( offset % 4096 ), buffer, size ); + memcpy( buf + ( offset % DNBD3_BLOCK_SIZE ), buffer, size ); + // At this point we should have a 4k block with user-space data to write, and possibly + // zero-padding at start and/or end - writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, block, - inBlockOffset ); + writeData( buf, DNBD3_BLOCK_SIZE, (ssize_t)size, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, + block, inClusterOffset ); return; } - cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, sizeof( char ) ); + // Need to fetch padding from upstream + cow_sub_request_t *sRequest = calloc( sizeof( cow_sub_request_t ) + DNBD3_BLOCK_SIZE, 1 ); sRequest->callback = writePaddedBlock; - sRequest->inBlockOffset = inBlockOffset; + sRequest->inClusterOffset = inClusterOffset; sRequest->block = block; sRequest->size = size; sRequest->writeSrc = buffer; sRequest->cowRequest = cowRequest; - off_t start = offset - ( offset % DNBD3_BLOCK_SIZE ); - sRequest->dRequest.length = DNBD3_BLOCK_SIZE; - sRequest->dRequest.offset = start; + sRequest->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, metadata->originalImageSize - offset ); + sRequest->dRequest.offset = offset - ( offset % DNBD3_BLOCK_SIZE ); sRequest->dRequest.fuse_req = req; - if ( ( (size_t)( offset + DNBD3_BLOCK_SIZE ) ) > metadata->originalImageSize ) { - sRequest->dRequest.length = - (uint32_t)MIN( DNBD3_BLOCK_SIZE, offset + DNBD3_BLOCK_SIZE - metadata->originalImageSize ); - } - atomic_fetch_add( &cowRequest->workCounter, 1 ); if ( !connection_read( &sRequest->dRequest ) ) { cowRequest->errorCode = EIO; - free( sRequest ); if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest ); } + free( sRequest ); return; } } @@ -1153,10 +1188,14 @@ void readRemoteData( cow_sub_request_t *sRequest ) if ( atomic_fetch_sub( &sRequest->cowRequest->workCounter, 1 ) == 1 ) { if ( sRequest->cowRequest->bytesWorkedOn < sRequest->cowRequest->fuseRequestSize ) { + // TODO: Is this a logic bug somewhere, reagarding accounting? + // Because connection_read() will always return exactly as many bytes as requested, + // or simply never finish. + // Otherwise, we should return EIO... logadd( LOG_ERROR, "pad read to small\n" ); } - fuse_reply_buf( - sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, sRequest->cowRequest->bytesWorkedOn ); + fuse_reply_buf( sRequest->dRequest.fuse_req, sRequest->cowRequest->readBuffer, + sRequest->cowRequest->bytesWorkedOn ); free( sRequest->cowRequest->readBuffer ); free( sRequest->cowRequest ); } @@ -1175,30 +1214,34 @@ void readRemoteData( cow_sub_request_t *sRequest ) void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi ) { // decrease - if ( metadata->imageSize > size ) { + if ( size < metadata->imageSize ) { if ( size < metadata->originalImageSize ) { metadata->originalImageSize = size; } + // TODO.... so.... + // originalImageSize = smallest we have seen + // imageSize = current + // ? // increase - } else if ( metadata->imageSize < size ) { + } else if ( size > metadata->imageSize ) { off_t offset = metadata->imageSize; - int l1Offset = getL1Offset( offset ); - int l2Offset = getL2Offset( offset ); - int l1EndOffset = getL1Offset( size ); - int l2EndOffset = getL2Offset( size ); - // special case first block - if ( cow.l1[l1Offset] != -1 ) { - cow_block_metadata_t *block = getBlock( l1Offset, l2Offset ); + int l1Index = offsetToL1Index( offset ); + int l2Index = offsetToL2Index( offset ); + int l1EndIndex = offsetToL1Index( size ); + int l2EndIndex = offsetToL2Index( size ); + // special case first block TODO: What is the special case? What is happening here? + if ( cow.l1[l1Index] != -1 ) { + cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); if ( metadata->imageSize % DNBD3_BLOCK_SIZE != 0 ) { - off_t inBlockOffset = metadata->imageSize % COW_METADATA_STORAGE_CAPACITY; + off_t inClusterOffset = metadata->imageSize % COW_DATA_CLUSTER_SIZE; size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE ); - if ( checkBit( block->bitfield, (int)( inBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { + if ( checkBit( block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { char buf[sizeToWrite]; memset( buf, 0, sizeToWrite ); - ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inBlockOffset ); + ssize_t bytesWritten = pwrite( cow.fhd, buf, sizeToWrite, block->offset + inClusterOffset ); if ( bytesWritten < (ssize_t)sizeToWrite ) { fuse_reply_err( req, bytesWritten == -1 ? errno : EIO ); @@ -1209,34 +1252,34 @@ void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_f } } // rest of block set bits 0 - l1Offset = getL1Offset( offset ); - l2Offset = getL2Offset( offset ); - block = getBlock( l1Offset, l2Offset ); - off_t inBlockOffset = offset % COW_METADATA_STORAGE_CAPACITY; + l1Index = offsetToL1Index( offset ); + l2Index = offsetToL2Index( offset ); + block = getL2Entry( l1Index, l2Index ); + off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE; setBitsInBitfield( - block->bitfield, (int)( inBlockOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); + block->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ), ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); block->timeChanged = time( NULL ); - l2Offset++; - if ( l2Offset >= COW_L2_SIZE ) { - l2Offset = 0; - l1Offset++; + l2Index++; + if ( l2Index >= COW_L2_TABLE_SIZE ) { + l2Index = 0; + l1Index++; } } // null all bitfields - while ( !( l1Offset > l1EndOffset || ( l1Offset == l1EndOffset && l2EndOffset < l2Offset ) ) ) { - if ( cow.l1[l1Offset] == -1 ) { - l1Offset++; - l2Offset = 0; + while ( !( l1Index > l1EndIndex || ( l1Index == l1EndIndex && l2EndIndex < l2Index ) ) ) { + if ( cow.l1[l1Index] == -1 ) { + l1Index++; + l2Index = 0; continue; } - cow_block_metadata_t *block = getBlock( l1Offset, l2Offset ); + cow_l2_entry_t *block = getL2Entry( l1Index, l2Index ); setBitsInBitfield( block->bitfield, 0, ( COW_BITFIELD_SIZE * 8 ) - 1, 0 ); block->timeChanged = time( NULL ); - l2Offset++; - if ( l2Offset >= COW_L2_SIZE ) { - l2Offset = 0; - l1Offset++; + l2Index++; + if ( l2Index >= COW_L2_TABLE_SIZE ) { + l2Index = 0; + l1Index++; } } } @@ -1247,7 +1290,7 @@ void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_f } /** - * @brief Implementation of a write request or an truncate. + * @brief Implementation of a write request. * * @param req fuse_req_t * @param cowRequest @@ -1265,64 +1308,70 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz off_t currentOffset = offset; off_t endOffset = offset + size; - // write data - - int l1Offset = getL1Offset( currentOffset ); - int l2Offset = getL2Offset( currentOffset ); + int l1Index = offsetToL1Index( currentOffset ); + int l2Index = offsetToL2Index( currentOffset ); while ( currentOffset < endOffset ) { - if ( cow.l1[l1Offset] == -1 ) { - createL2Block( l1Offset ); + if ( cow.l1[l1Index] == -1 ) { + createL2Block( l1Index ); } //loop over L2 array (metadata) - while ( currentOffset < (off_t)endOffset && l2Offset < COW_L2_SIZE ) { - cow_block_metadata_t *metaBlock = getBlock( l1Offset, l2Offset ); - + while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) { + cow_l2_entry_t *metaBlock = getL2Entry( l1Index, l2Index ); - size_t metaBlockStartOffset = l1Offset * COW_L2_STORAGE_CAPACITY + l2Offset * COW_METADATA_STORAGE_CAPACITY; - - size_t inBlockOffset = currentOffset - metaBlockStartOffset; - size_t sizeToWriteToBlock = - MIN( (size_t)( endOffset - currentOffset ), COW_METADATA_STORAGE_CAPACITY - inBlockOffset ); + // Calc absolute offset in image corresponding to current cluster + size_t clusterAbsoluteStartOffset = l1Index * COW_FULL_L2_TABLE_DATA_SIZE + l2Index * COW_DATA_CLUSTER_SIZE; + size_t inClusterOffset = currentOffset - clusterAbsoluteStartOffset; + // How many bytes we can write to this cluster before crossing a boundary, or before the write request is completed + size_t bytesToWriteToCluster = + MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset ); ///////////////////////// // lock for the half block probably needed if ( currentOffset % DNBD3_BLOCK_SIZE != 0 - && !checkBit( metaBlock->bitfield, (int)( inBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { - // write remote - size_t padSize = MIN( sizeToWriteToBlock, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) ); + && !checkBit( metaBlock->bitfield, (int)( inClusterOffset / DNBD3_BLOCK_SIZE ) ) ) { + // Block has not been written locally before, and write does not start on block boundary. + // Need to fetch the first couple bytes of the block from remote before writing the block to disk. + size_t writeSize = MIN( bytesToWriteToCluster, DNBD3_BLOCK_SIZE - ( (size_t)currentOffset % DNBD3_BLOCK_SIZE ) ); const char *sbuf = cowRequest->writeBuffer + ( ( currentOffset - offset ) ); - padBlockFromRemote( req, currentOffset, cowRequest, sbuf, padSize, metaBlock, (off_t)inBlockOffset ); - currentOffset += padSize; + padBlockFromRemote( req, currentOffset, cowRequest, sbuf, writeSize, metaBlock, (off_t)inClusterOffset ); + currentOffset += writeSize; continue; } - size_t endPaddedSize = 0; - if ( ( currentOffset + sizeToWriteToBlock ) % DNBD3_BLOCK_SIZE != 0 ) { - off_t currentEndOffset = currentOffset + sizeToWriteToBlock; - off_t padStartOffset = currentEndOffset - ( currentEndOffset % 4096 ); - off_t inBlockPadStartOffset = padStartOffset - metaBlockStartOffset; - if ( !checkBit( metaBlock->bitfield, (int)( inBlockPadStartOffset / DNBD3_BLOCK_SIZE ) ) ) { - const char *sbuf = cowRequest->writeBuffer + ( ( padStartOffset - offset ) ); - padBlockFromRemote( req, padStartOffset, cowRequest, sbuf, (currentEndOffset)-padStartOffset, metaBlock, - inBlockPadStartOffset ); - - - sizeToWriteToBlock -= (currentEndOffset)-padStartOffset; - endPaddedSize = (currentEndOffset)-padStartOffset; + size_t endPaddedSize = 0; // In case we need to skip over a pending pad request to remote + if ( ( currentOffset + bytesToWriteToCluster ) % DNBD3_BLOCK_SIZE != 0 + && metadata->originalImageSize > currentOffset + bytesToWriteToCluster ) { + // Write request does not end on block boundary, and ends before end of image + // End offset of this write + off_t clusterEndOffset = currentOffset + bytesToWriteToCluster; + // Start of last block of write, i.e. start of the last, incomplete block + off_t lastBlockStartOffset = clusterEndOffset - ( clusterEndOffset % DNBD3_BLOCK_SIZE ); + // Where that last block starts relative to its cluster + off_t inClusterBlockOffset = lastBlockStartOffset - clusterAbsoluteStartOffset; + if ( !checkBit( metaBlock->bitfield, (int)( inClusterBlockOffset / DNBD3_BLOCK_SIZE ) ) ) { + // Block indeed not modified before, need to fetch + const char *sbuf = cowRequest->writeBuffer + ( ( lastBlockStartOffset - offset ) ); + padBlockFromRemote( req, lastBlockStartOffset, cowRequest, sbuf, clusterEndOffset - lastBlockStartOffset, metaBlock, + inClusterBlockOffset ); + + + bytesToWriteToCluster -= clusterEndOffset - lastBlockStartOffset; + endPaddedSize = clusterEndOffset - lastBlockStartOffset; } } - writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)sizeToWriteToBlock, - sizeToWriteToBlock, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inBlockOffset ); + writeData( cowRequest->writeBuffer + ( ( currentOffset - offset ) ), (ssize_t)bytesToWriteToCluster, + bytesToWriteToCluster, &cowRequest->errorCode, &cowRequest->bytesWorkedOn, metaBlock, inClusterOffset ); - currentOffset += sizeToWriteToBlock; + currentOffset += bytesToWriteToCluster; + // Account for skipped-over bytes currentOffset += endPaddedSize; - l2Offset++; + l2Index++; } - l1Offset++; - l2Offset = 0; + l1Index++; + l2Index = 0; } if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { finishWriteRequest( req, cowRequest ); @@ -1337,16 +1386,17 @@ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, siz * @param offset from the start of the file * @param size of data to request * @param buffer into which the data is to be written - * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. + * @param workCounter workCounter is increased by one and later reduced by one again when the request is completed. TODO There is no such param, but cowRequest.. */ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest ) { // edgecase: Image size got reduced before on a non block border - if ( offset + size > (long int) metadata->originalImageSize ) { + if ( offset + size > (long int) metadata->originalImageSize ) { // TODO How does this check if it's a non block border? size_t padZeroSize = ( offset + size ) - metadata->originalImageSize; off_t padZeroOffset = metadata->originalImageSize - offset; - assert( offset > 0 ); - memset( ( buffer + padZeroOffset ), 0, padZeroOffset ); + assert( offset > 0 ); // TODO Should this be padZeroOffset? + // ... But isn't it possible that offset > originalImageSize, in which case it would be negative? + memset( ( buffer + padZeroOffset ), 0, padZeroSize ); atomic_fetch_add( &cowRequest->bytesWorkedOn, padZeroSize ); } @@ -1360,9 +1410,10 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer atomic_fetch_add( &cowRequest->workCounter, 1 ); if ( !connection_read( &sRequest->dRequest ) ) { - cowRequest->errorCode = EIO; + cowRequest->errorCode = EIO; // TODO We set an error... free( sRequest ); if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { + // .... but would still report success if this happens to be the last pending sub-request!? fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); } free( cowRequest->readBuffer ); @@ -1379,7 +1430,7 @@ static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer * @param offset * @return enum dataSource */ -enum dataSource getBlockDataSource( cow_block_metadata_t *block, off_t bitfieldOffset, off_t offset ) +enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset ) { if ( block != NULL && checkBit( block->bitfield, (int)bitfieldOffset ) ) { return local; @@ -1411,14 +1462,14 @@ void cowfile_read( fuse_req_t req, size_t size, off_t offset ) off_t lastReadOffset = offset; off_t endOffset = offset + size; off_t searchOffset = offset; - int l1Offset = getL1Offset( offset ); - int l2Offset = getL2Offset( offset ); - int bitfieldOffset = getBitfieldOffset( offset ); + int l1Index = offsetToL1Index( offset ); + int l2Index = offsetToL2Index( offset ); + int bitfieldOffset = getBitfieldOffsetBit( offset ); enum dataSource dataState; - cow_block_metadata_t *block = NULL; + cow_l2_entry_t *cluster = NULL; - if ( cow.l1[l1Offset] != -1 ) { - block = getBlock( l1Offset, l2Offset ); + if ( cow.l1[l1Index] != -1 ) { + cluster = getL2Entry( l1Index, l2Index ); } bool doRead = false; @@ -1428,28 +1479,38 @@ void cowfile_read( fuse_req_t req, size_t size, off_t offset ) if ( firstLoop ) { firstLoop = false; lastReadOffset = searchOffset; - dataState = getBlockDataSource( block, bitfieldOffset, searchOffset ); - } else if ( getBlockDataSource( block, bitfieldOffset, searchOffset ) != dataState ) { + // TODO: Why is this only set on first iteration and not for every block/cluster? + dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset ); + } else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) { + // TODO So data source changed, but we don't update the dataState var... How can this possibly work? doRead = true; } else { bitfieldOffset++; } if ( bitfieldOffset >= COW_BITFIELD_SIZE * 8 ) { + // Advance to next cluster in current l2 table bitfieldOffset = 0; - l2Offset++; - if ( l2Offset >= COW_L2_SIZE ) { - l2Offset = 0; - l1Offset++; + l2Index++; + if ( l2Index >= COW_L2_TABLE_SIZE ) { + // Advance to next l1 entry, reset l2 index + l2Index = 0; + l1Index++; } + // Also set flag that we need to update the 'cluster' struct at the end of this iteration + // TODO: Why do we update all the values above, but not the cluster struct? We access those + // variables in the code below, so we have updated offset and index, but operate on the + // old cluster struct. How does that make sense? updateBlock = true; if ( dataState == local ) { doRead = true; } } - // compute the original file offset from bitfieldOffset, l2Offset and l1Offset - searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Offset * COW_METADATA_STORAGE_CAPACITY - + l1Offset * COW_L2_STORAGE_CAPACITY; + // compute the original file offset from bitfieldOffset, l2Index and l1Index + // TODO ??? As stated above, this is using the updated values, so isn't this the next + // offset tather than original offset? + searchOffset = DNBD3_BLOCK_SIZE * ( bitfieldOffset ) + l2Index * COW_DATA_CLUSTER_SIZE + + l1Index * COW_FULL_L2_TABLE_DATA_SIZE; if ( doRead || searchOffset >= endOffset ) { ssize_t sizeToRead = MIN( searchOffset, endOffset ); if ( dataState == remote ) { @@ -1472,7 +1533,7 @@ void cowfile_read( fuse_req_t req, size_t size, off_t offset ) sizeToRead -= lastReadOffset; // Compute the offset in the data file where the read starts off_t localRead = - block->offset + ( ( lastReadOffset % COW_L2_STORAGE_CAPACITY ) % COW_METADATA_STORAGE_CAPACITY ); + cluster->offset + ( ( lastReadOffset % COW_FULL_L2_TABLE_DATA_SIZE ) % COW_DATA_CLUSTER_SIZE ); ssize_t totalBytesRead = 0; while ( totalBytesRead < sizeToRead ) { ssize_t bytesRead = @@ -1495,26 +1556,20 @@ void cowfile_read( fuse_req_t req, size_t size, off_t offset ) } if ( updateBlock ) { - if ( cow.l1[l1Offset] != -1 ) { - block = getBlock( l1Offset, l2Offset ); + if ( cow.l1[l1Index] != -1 ) { + cluster = getL2Entry( l1Index, l2Index ); } else { - block = NULL; + cluster = NULL; } updateBlock = false; } } fail:; if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) { - if ( cowRequest->errorCode != 0 ) { - if ( cowRequest->bytesWorkedOn < size ) { - logadd( LOG_ERROR, " read to small" ); - } - fuse_reply_err( req, cowRequest->errorCode ); - + if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn < size ) { + logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d)", cowRequest->errorCode ); + fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO ); } else { - if ( cowRequest->bytesWorkedOn < size ) { - logadd( LOG_ERROR, " read to small" ); - } fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn ); } free( cowRequest->readBuffer ); diff --git a/src/fuse/cowfile.h b/src/fuse/cowfile.h index d91d122..a07469d 100644 --- a/src/fuse/cowfile.h +++ b/src/fuse/cowfile.h @@ -1,23 +1,20 @@ #ifndef _COWFILE_H_ #define _COWFILE_H_ -#include <stdint.h> -#include <stdlib.h> -#include <stdatomic.h> -#include <dnbd3/shared/log.h> -#include <sys/mman.h> -#include <string.h> -#include <pthread.h> -#include <errno.h> -#include <curl/curl.h> -#include "main.h" #include "connection.h" +#include <dnbd3/config/cow.h> +#include <stdint.h> +#include <stdbool.h> +#include <stdatomic.h> +#include <stdlib.h> -#define COW_METADATA_STORAGE_CAPACITY ( COW_BITFIELD_SIZE * 8 * DNBD3_BLOCK_SIZE ) -#define COW_L2_SIZE 1024 -#define COW_L2_STORAGE_CAPACITY ( COW_L2_SIZE * COW_METADATA_STORAGE_CAPACITY ) -#define container_of( ptr, type, member ) ( (type *)( (char *)( ptr ) - (char *)&( ( (type *)NULL )->member ) ) ) +// Net storage capacity of a single cluster in the data file +#define COW_DATA_CLUSTER_SIZE ( COW_BITFIELD_SIZE * 8 * DNBD3_BLOCK_SIZE ) +// Number of entries per L2 table +#define COW_L2_TABLE_SIZE 1024 +// Net storage capacity in data file represented by a full L2 table +#define COW_FULL_L2_TABLE_DATA_SIZE ( COW_L2_TABLE_SIZE * COW_DATA_CLUSTER_SIZE ) _Static_assert( ATOMIC_INT_LOCK_FREE == 2, "ATOMIC INT not lock free" ); _Static_assert( ATOMIC_LONG_LOCK_FREE == 2, "ATOMIC LONG not lock free" ); @@ -39,7 +36,7 @@ typedef struct cowfile_metadata_header atomic_uint_least64_t imageSize; // 8byte int32_t version; // 4byte int32_t blocksize; // 4byte - uint64_t originalImageSize; // 8byte + uint64_t originalImageSize; // 8byte - the name implies this is the size of the image on the server, but apparently it changes if we truncate the image etc. better name? uint64_t metaDataStart; // 8byte int32_t bitfieldSize; // 4byte int32_t nextL2; // 4byte @@ -53,54 +50,62 @@ typedef struct cowfile_metadata_header _Static_assert( sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE, "cowfile_metadata_header is messed up" ); -#define COW_METADATA_METADATA_SIZE 64 -typedef struct cow_block_metadata +#define COW_L2_ENTRY_SIZE 64 +typedef struct cow_l2_entry { atomic_int_least64_t offset; atomic_uint_least64_t timeChanged; atomic_uint_least64_t uploads; - atomic_char bitfield[40]; -} cow_block_metadata_t; -_Static_assert( sizeof( cow_block_metadata_t ) == COW_METADATA_METADATA_SIZE, "cow_block_metadata_t is messed up" ); - - + atomic_char bitfield[COW_BITFIELD_SIZE]; +} cow_l2_entry_t; +_Static_assert( sizeof( cow_l2_entry_t ) == COW_L2_ENTRY_SIZE, "cow_l2_entry_t is messed up" ); + +/** + * Open request for reading/writing the virtual image we expose. + * TODO Please verify field comments + */ typedef struct cow_request { - size_t fuseRequestSize; - off_t fuseRequestOffset; - char *readBuffer; - const char *writeBuffer; - atomic_size_t bytesWorkedOn; - atomic_int workCounter; - atomic_int errorCode; - fuse_ino_t ino; - struct fuse_file_info *fi; + size_t fuseRequestSize; // Number of bytes to be read/written + off_t fuseRequestOffset; // Absolute offset into the image, as seen by user space + char *readBuffer; // Used only in read case? + const char *writeBuffer; // Used only in write case? + atomic_size_t bytesWorkedOn; // Used for ??? + atomic_int workCounter; // How many pending sub requests (see below) + atomic_int errorCode; // For reporting back to fuse + fuse_ino_t ino; // Inode of file, used for ??? + struct fuse_file_info *fi; // Used for ??? } cow_request_t; typedef struct cow_sub_request cow_sub_request_t; typedef void ( *cow_callback )( cow_sub_request_t *sRequest ); +/** + * A sub-request for above, which needs to be completed successfully + * before the parent cow_request can be completed. + * TODO Please verify field comments + */ typedef struct cow_sub_request { - size_t size; - off_t inBlockOffset; - const char *writeSrc; - char *buffer; - cow_block_metadata_t *block; - cow_callback callback; - cow_request_t *cowRequest; - dnbd3_async_t dRequest; - char writeBuffer[]; + size_t size; // size of this sub-request + off_t inClusterOffset; // offset relative to!? cow-block? DNBD3 block? cluster? + const char *writeSrc; // ??? + char *buffer; // ??? + cow_l2_entry_t *block; // the cluster inClusterOffset refers to + cow_callback callback; // Callback when we're done handling this + cow_request_t *cowRequest; // parent request + dnbd3_async_t dRequest; // Probably request to dnbd3-server for non-aligned writes (wrt 4k dnbd3 block) + char writeBuffer[]; // ??? } cow_sub_request_t; typedef struct cow_curl_read_upload { atomic_uint_least64_t time; - cow_block_metadata_t *block; + cow_l2_entry_t *block; size_t position; long unsigned int blocknumber; int fails; - curl_off_t ulLast; + int64_t ulLast; } cow_curl_read_upload_t; @@ -112,7 +117,7 @@ typedef struct cow_block_upload_statistics typedef int32_t l1; -typedef cow_block_metadata_t l2[COW_L2_SIZE]; +typedef cow_l2_entry_t l2[COW_L2_TABLE_SIZE]; bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, atomic_uint_fast64_t **imageSizePtr, char *serverAddress, bool sStdout, bool sFile ); @@ -133,4 +138,4 @@ int cow_printStats( char *buffer, const size_t len ); void cowfile_close(); -#endif /* COWFILE_H_ */
\ No newline at end of file +#endif /* COWFILE_H_ */ |