summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSimon Rettberg2018-06-25 15:57:50 +0200
committerSimon Rettberg2018-06-25 15:57:50 +0200
commitb4c1fe882758c077fc9bd48599653b147dae3c8a (patch)
tree7c6aa29e51523069e52cd3f255acb3fe3777ce8b
parent[SERVER] Make sure image has read fd before reading (diff)
downloaddnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.gz
dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.xz
dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.zip
[SERVER] Try to re-open cacheFd if writing fails
In scenarios where the proxy is using an NFS server as storage (for whatever crazy reason) or when the cacheFd goes bad through e.g. a switchroot, try to re-open it instead of just disabling caching forever.
-rw-r--r--src/server/image.c45
-rw-r--r--src/server/image.h2
-rw-r--r--src/server/uplink.c18
3 files changed, 58 insertions, 7 deletions
diff --git a/src/server/image.c b/src/server/image.c
index 2df4565..02a383f 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -469,7 +469,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
if ( candidate->cache_map != NULL ) {
// -- Incomplete - rw check
if ( candidate->cacheFd == -1 ) { // Make sure file is open for writing
- candidate->cacheFd = open( candidate->path, O_RDWR );
+ image_reopenCacheFd( candidate, false );
// It might have failed - still offer proxy mode, we just can't cache
if ( candidate->cacheFd == -1 ) {
logadd( LOG_WARNING, "Cannot re-open %s for writing - replication disabled", candidate->path );
@@ -484,6 +484,49 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
}
/**
+ * Open the given image's main image file in
+ * rw mode, assigning it to the cacheFd struct member.
+ *
+ * @param force If cacheFd was previously assigned a file descriptor (not == -1),
+ * it will be closed first. Otherwise, nothing will happen and true will be returned
+ * immediately.
+ */
+bool image_reopenCacheFd(dnbd3_image_t *image, const bool force)
+{
+ if ( image->cacheFd != -1 && !force )
+ return true;
+ const int nfd = open( image->path, O_RDWR );
+ if ( nfd == -1 ) {
+ if ( force ) {
+ // Opening new one failed, force is enabled -- close old one
+ spin_lock( &image->lock );
+ int ofd = image->cacheFd;
+ image->cacheFd = -1;
+ spin_unlock( &image->lock );
+ if ( ofd != -1 ) {
+ close( ofd );
+ }
+ }
+ return false;
+ }
+ // Open succeeded, now switch out while holding lock to make it look somewhat atomic
+ spin_lock( &image->lock );
+ int closeFd = -1;
+ if ( force || image->cacheFd == -1 ) {
+ closeFd = image->cacheFd;
+ image->cacheFd = nfd;
+ } else {
+ closeFd = nfd;
+ }
+ spin_unlock( &image->lock );
+ if ( closeFd != -1 ) { // Failed
+ close( closeFd );
+ }
+ return true; // We either replaced the fd in force mode or the old one was -1, or
+ // force was false and cacheFd != -1. Either way we consider that success
+}
+
+/**
* Lock the image by increasing its users count
* Returns the image on success, NULL if it is not found in the image list
* Every call to image_lock() needs to be followed by a call to image_release() at some point.
diff --git a/src/server/image.h b/src/server/image.h
index a06c8c7..dd8b818 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -21,6 +21,8 @@ bool image_ensureOpen(dnbd3_image_t *image);
dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
+bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
+
dnbd3_image_t* image_getOrLoad(char *name, uint16_t revision);
dnbd3_image_t* image_lock(dnbd3_image_t *image);
diff --git a/src/server/uplink.c b/src/server/uplink.c
index debd091..8a6a33a 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -594,9 +594,12 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
link->bytesReceived += inReply.size;
spin_unlock( &link->image->lock );
// 1) Write to cache file
+ if ( link->image->cacheFd == -1 ) {
+ image_reopenCacheFd( link->image, false );
+ }
if ( link->image->cacheFd != -1 ) {
int err = 0;
- bool tryFree = true;
+ bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
uint32_t done = 0;
ret = 0;
while ( done < inReply.size ) {
@@ -606,10 +609,16 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
if ( err == EINTR ) continue;
if ( err == ENOSPC || err == EDQUOT ) {
// try to free 256MiB
- if ( !tryFree || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
- tryFree = false;
+ if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
+ tryAgain = false;
continue; // Success, retry write
}
+ if ( err == EBADF || err == EINVAL || err == EIO ) {
+ if ( !tryAgain || !image_reopenCacheFd( link->image, true ) )
+ break;
+ tryAgain = false;
+ continue; // Write handle to image successfully re-opened, try again
+ }
logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
break;
}
@@ -624,9 +633,6 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
// Only disable caching if something seems severely wrong, not on ENOSPC since that might get better
logadd( LOG_WARNING, "Error writing received data for %s:%d; disabling caching.",
link->image->name, (int)link->image->rid );
- const int fd = link->image->cacheFd;
- link->image->cacheFd = -1;
- close( fd );
}
}
// 2) Figure out which clients are interested in it