author    Simon Rettberg  2018-07-04 17:18:09 +0200
committer Simon Rettberg  2018-07-04 17:18:09 +0200
commit    ba052729b316a86d7127a5300582f029780db336 (patch)
tree      5b017e57cae150744004c5e205877006211f5925
parent    [SERVER] Refactor uplink/cache handling, improve crc checking (diff)
[SERVER] Use O_DIRECT for integrity checks
The idea is that for full image checks, we don't want to pollute the fs cache with gigabytes of data that won't be needed again soon. This would certainly hurt performance on servers that don't have hundreds of GBs of RAM. For single block checks during replication, this has the advantage that we don't check the block while it is still in memory, before it has ever hit the disk, but actually flush the data to disk, evict it from the page cache, and only then read it again, from disk.
TODO: Might be worth making this a config option
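A minimal sketch of the approach described above (not the literal patch): open a separate O_DIRECT descriptor for the scan, fall back to the image's normal buffered fd when direct I/O is unavailable, and in that fallback case flush and evict the checked range before re-reading it. The helper names open_for_integrity_check() and drop_range_from_cache() are illustrative only.

#define _GNU_SOURCE /* for O_DIRECT */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Hypothetical helper: obtain a read fd for an integrity scan.
 * Prefer O_DIRECT so bulk reads bypass the page cache; fall back to
 * the image's regular buffered fd if the filesystem rejects O_DIRECT. */
static int open_for_integrity_check( const char *path, int bufferedFd, int *isDirect )
{
	int fd = open( path, O_RDONLY | O_DIRECT );
	if ( fd != -1 ) {
		*isDirect = 1;
		return fd;
	}
	*isDirect = 0;
	return bufferedFd;
}

/* Hypothetical helper for the buffered fallback: flush the file to disk
 * and evict the checked range from the page cache, so the following
 * pread() really comes from the device and not from dirty/cached pages. */
static int drop_range_from_cache( int fd, uint64_t start, uint64_t end )
{
	if ( fsync( fd ) == -1 ) return -1;
	return posix_fadvise( fd, (off_t)start, (off_t)( end - start ), POSIX_FADV_DONTNEED );
}

Note that O_DIRECT reads additionally require suitably aligned buffers, offsets and lengths, which is why the patch below 4k-aligns its read buffer and only uses the direct fd when the read length is a multiple of 4096.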
-rw-r--r--  src/server/image.c      16
-rw-r--r--  src/server/integrity.c  81
2 files changed, 67 insertions(+), 30 deletions(-)
diff --git a/src/server/image.c b/src/server/image.c
index 673a269..9d36317 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -72,6 +72,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
assert( image != NULL );
// This should always be block borders due to how the protocol works, but better be safe
// than accidentally mark blocks as cached when they really aren't entirely cached.
+ assert( end <= image->virtualFilesize );
+ assert( start <= end );
if ( set ) {
// If we set as cached, move "inwards" in case we're not at 4k border
end &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
@@ -81,6 +83,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
start &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
end = (uint64_t)(end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
}
+ if ( start >= end )
+ return;
bool setNewBlocks = false;
uint64_t pos = start;
spin_lock( &image->lock );
@@ -1601,7 +1605,10 @@ bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks
*/
static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc)
{
- char buffer[262144];
+ // Make buffer 4k aligned in case fd has O_DIRECT set
+#define BSIZE 262144
+ char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE];
+ char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) );
// How many bytes to read from the input file
const uint64_t bytesFromFile = MIN( HASH_BLOCK_SIZE, realFilesize - ( block * HASH_BLOCK_SIZE) );
// Determine how many bytes we had to read if the file size were a multiple of 4k
@@ -1614,7 +1621,7 @@ static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_
*crc = crc32( 0, NULL, 0 );
// Calculate the crc32 by reading data from the file
while ( bytes < bytesFromFile ) {
- const size_t n = (size_t)MIN( sizeof(buffer), bytesFromFile - bytes );
+ const size_t n = (size_t)MIN( BSIZE, bytesFromFile - bytes );
const ssize_t r = pread( fd, buffer, n, readPos + bytes );
if ( r <= 0 ) {
logadd( LOG_WARNING, "CRC: Read error (errno=%d)", errno );
@@ -1625,16 +1632,17 @@ static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_
}
// If the virtual file size is different, keep going using nullbytes
if ( bytesFromFile < virtualBytesFromFile ) {
- memset( buffer, 0, sizeof(buffer) );
+ memset( buffer, 0, BSIZE );
bytes = (size_t)( virtualBytesFromFile - bytesFromFile );
while ( bytes != 0 ) {
- const size_t len = MIN( sizeof(buffer), bytes );
+ const size_t len = MIN( BSIZE, bytes );
*crc = crc32( *crc, (uint8_t*)buffer, len );
bytes -= len;
}
}
*crc = net_order_32( *crc );
return true;
+#undef BSIZE
}
/**
diff --git a/src/server/integrity.c b/src/server/integrity.c
index c9ac798..4d01ba7 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -8,6 +8,10 @@
#include <assert.h>
#include <sys/syscall.h>
#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
#define CHECK_QUEUE_SIZE 500
@@ -128,14 +132,6 @@ static void* integrity_main(void * data UNUSED)
}
if ( image == NULL ) continue;
// We have the image. Call image_release() some time
- // Make sure the image is open for reading (closeUnusedFd)
- if ( !image_ensureOpen( image ) ) {
- // TODO: Open new fd for file with O_DIRECT in case we do a full scan,
- // so we don't thrash the whole fs cache
- logadd( LOG_MINOR, "Cannot hash check block %d of %s -- no readFd", checkQueue[i].block, image->path );
- image_release( image );
- continue;
- }
bool full = checkQueue[i].full;
bool foundCorrupted = false;
spin_lock( &image->lock );
@@ -153,27 +149,60 @@ static void* integrity_main(void * data UNUSED)
}
memcpy( buffer, image->crc32, required );
spin_unlock( &image->lock );
+ // Open for direct I/O if possible; this prevents polluting the fs cache
+ int fd = open( image->path, O_RDONLY | O_DIRECT );
+ bool direct = fd != -1;
+ if ( unlikely( !direct ) ) {
+ // Try unbuffered; flush to disk for that
+ logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path );
+ image_ensureOpen( image );
+ fd = image->readFd;
+ }
int checkCount = full ? 5 : 1;
- while ( blocks[0] < numHashBlocks ) {
- bool complete = true;
- if ( full ) {
- // When checking full image, skip incomplete blocks, otherwise assume block is complete
- spin_lock( &image->lock );
- complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
- spin_unlock( &image->lock );
- }
- if ( complete && !image_checkBlocksCrc32( image->readFd, (uint32_t*)buffer, blocks, fileSize ) ) {
- logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name );
- image_updateCachemap( image, blocks[0] * HASH_BLOCK_SIZE, (blocks[0] + 1) * HASH_BLOCK_SIZE, false );
- // If this is not a full check, queue one
- if ( !full ) {
- logadd( LOG_INFO, "Queueing full check for %s", image->name );
- integrity_check( image, -1 );
+ if ( fd != -1 ) {
+ while ( blocks[0] < numHashBlocks && !_shutdown ) {
+ const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
+ const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
+ bool complete = true;
+ if ( full ) {
+ // When checking full image, skip incomplete blocks, otherwise assume block is complete
+ spin_lock( &image->lock );
+ complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
+ spin_unlock( &image->lock );
}
- foundCorrupted = true;
+ if ( fsync( fd ) == -1 ) {
+ logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path );
+ exit( 1 );
+ }
+ // Use direct I/O only if read length is multiple of 4096 to be on the safe side
+ int tfd;
+ if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
+ // Suitable for direct io
+ tfd = fd;
+ } else if ( !image_ensureOpen( image ) ) {
+ logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+ break;
+ } else {
+ tfd = image->readFd;
+ // Evict from cache so we have to re-read, making sure data was properly stored
+ posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED );
+ }
+ if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) {
+ logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name );
+ image_updateCachemap( image, start, end, false );
+ // If this is not a full check, queue one
+ if ( !full ) {
+ logadd( LOG_INFO, "Queueing full check for %s", image->name );
+ integrity_check( image, -1 );
+ }
+ foundCorrupted = true;
+ }
+ if ( complete && --checkCount == 0 ) break;
+ blocks[0]++;
+ }
+ if ( direct ) {
+ close( fd );
}
- if ( complete && --checkCount == 0 ) break;
- blocks[0]++;
}
pthread_mutex_lock( &integrityQueueLock );
if ( full ) {