[SERVER] Try to re-open cacheFd if writing fails

In scenarios where the proxy is using an NFS server as storage (for whatever crazy reason) or when the cacheFd goes bad through e.g. a switchroot, try to re-open it instead of just disabling caching forever.
author: Simon Rettberg 2018-06-25 15:57:50 +0200
committer: Simon Rettberg 2018-06-25 15:57:50 +0200
commit: b4c1fe882758c077fc9bd48599653b147dae3c8a (patch)
tree: 7c6aa29e51523069e52cd3f255acb3fe3777ce8b
parent: [SERVER] Make sure image has read fd before reading (diff)
download: dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.gz
dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.xz
dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.zip
3 files changed, 58 insertions, 7 deletions
diff --git a/src/server/image.c b/src/server/image.c
index 2df4565..02a383f 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -469,7 +469,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 	if ( candidate->cache_map != NULL ) {
 		// -- Incomplete - rw check
 		if ( candidate->cacheFd == -1 ) { // Make sure file is open for writing
-			candidate->cacheFd = open( candidate->path, O_RDWR );
+			image_reopenCacheFd( candidate, false );
 			// It might have failed - still offer proxy mode, we just can't cache
 			if ( candidate->cacheFd == -1 ) {
 				logadd( LOG_WARNING, "Cannot re-open %s for writing - replication disabled", candidate->path );
@@ -484,6 +484,49 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 }
 
 /**
+ * Open the given image's main image file in
+ * rw mode, assigning it to the cacheFd struct member.
+ *
+ * @param force If cacheFd was previously assigned a file descriptor (not == -1),
+ * it will be closed first. Otherwise, nothing will happen and true will be returned
+ * immediately.
+ */
+bool image_reopenCacheFd(dnbd3_image_t *image, const bool force)
+{
+	if ( image->cacheFd != -1 && !force )
+		return true;
+	const int nfd = open( image->path, O_RDWR );
+	if ( nfd == -1 ) {
+		if ( force ) {
+			// Opening new one failed, force is enabled -- close old one
+			spin_lock( &image->lock );
+			int ofd = image->cacheFd;
+			image->cacheFd = -1;
+			spin_unlock( &image->lock );
+			if ( ofd != -1 ) {
+				close( ofd );
+			}
+		}
+		return false;
+	}
+	// Open succeeded, now switch out while holding lock to make it look somewhat atomic
+	spin_lock( &image->lock );
+	int closeFd = -1;
+	if ( force || image->cacheFd == -1 ) {
+		closeFd = image->cacheFd;
+		image->cacheFd = nfd;
+	} else {
+		closeFd = nfd;
+	}
+	spin_unlock( &image->lock );
+	if ( closeFd != -1 ) { // Failed
+		close( closeFd );
+	}
+	return true; // We either replaced the fd in force mode or the old one was -1, or
+	// force was false and cacheFd != -1. Either way we consider that success
+}
+
+/**
  * Lock the image by increasing its users count
  * Returns the image on success, NULL if it is not found in the image list
  * Every call to image_lock() needs to be followed by a call to image_release() at some point.
diff --git a/src/server/image.h b/src/server/image.h
index a06c8c7..dd8b818 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -21,6 +21,8 @@ bool image_ensureOpen(dnbd3_image_t *image);
 
 dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
 
+bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
+
 dnbd3_image_t* image_getOrLoad(char *name, uint16_t revision);
 
 dnbd3_image_t* image_lock(dnbd3_image_t *image);
diff --git a/src/server/uplink.c b/src/server/uplink.c
index debd091..8a6a33a 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -594,9 +594,12 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 		link->bytesReceived += inReply.size;
 		spin_unlock( &link->image->lock );
 		// 1) Write to cache file
+		if ( link->image->cacheFd == -1 ) {
+			image_reopenCacheFd( link->image, false );
+		}
 		if ( link->image->cacheFd != -1 ) {
 			int err = 0;
-			bool tryFree = true;
+			bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
 			uint32_t done = 0;
 			ret = 0;
 			while ( done < inReply.size ) {
@@ -606,10 +609,16 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 					if ( err == EINTR ) continue;
 					if ( err == ENOSPC || err == EDQUOT ) {
 						// try to free 256MiB
-						if ( !tryFree || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
-						tryFree = false;
+						if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
+						tryAgain = false;
 						continue; // Success, retry write
 					}
+					if ( err == EBADF || err == EINVAL || err == EIO ) {
+						if ( !tryAgain || !image_reopenCacheFd( link->image, true ) )
+							break;
+						tryAgain = false;
+						continue; // Write handle to image successfully re-opened, try again
+					}
 					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
 					break;
 				}
@@ -624,9 +633,6 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 				// Only disable caching if something seems severely wrong, not on ENOSPC since that might get better
 				logadd( LOG_WARNING, "Error writing received data for %s:%d; disabling caching.",
 						link->image->name, (int)link->image->rid );
-				const int fd = link->image->cacheFd;
-				link->image->cacheFd = -1;
-				close( fd );
 			}
 		}
 		// 2) Figure out which clients are interested in it
author	Simon Rettberg	2018-06-25 15:57:50 +0200
committer	Simon Rettberg	2018-06-25 15:57:50 +0200
commit	b4c1fe882758c077fc9bd48599653b147dae3c8a (patch)
tree	7c6aa29e51523069e52cd3f255acb3fe3777ce8b
parent	[SERVER] Make sure image has read fd before reading (diff)
download	dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.gz dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.tar.xz dnbd3-b4c1fe882758c077fc9bd48599653b147dae3c8a.zip