summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Simon Rettberg 2021-03-26 13:44:47 +0100
committer Simon Rettberg 2021-03-26 13:44:47 +0100
commit 3a7891e26aec9e5d15d21a78cb317926d96f51e9 (patch)
tree 9c221489ad96ca3f6d7207a4b1c171ef0554b294
parent [KERNEL] Use sockaddr instead of dnbd3_host_t where possible (diff)
download dnbd3-3a7891e26aec9e5d15d21a78cb317926d96f51e9.tar.gz
dnbd3-3a7891e26aec9e5d15d21a78cb317926d96f51e9.tar.xz
dnbd3-3a7891e26aec9e5d15d21a78cb317926d96f51e9.zip
[KERNEL] Implement best_count logic for load balancing
Similar logic already exists in the fuse client: count how many times in a row a server was the fastest when measuring RTTs, and lower the switching threshold further the higher that count gets.
-rw-r--r-- inc/dnbd3/config/client.h 2
-rw-r--r-- src/kernel/blk.c 24
-rw-r--r-- src/kernel/dnbd3_main.c 7
-rw-r--r-- src/kernel/dnbd3_main.h 4
-rw-r--r-- src/kernel/net.c 35
5 files changed, 49 insertions, 23 deletions
diff --git a/inc/dnbd3/config/client.h b/inc/dnbd3/config/client.h
index f35f673..49d4676 100644
--- a/inc/dnbd3/config/client.h
+++ b/inc/dnbd3/config/client.h
@@ -8,7 +8,7 @@
#define SOCKET_TIMEOUT_CLIENT_DATA 2
#define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1
-#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
+#define RTT_THRESHOLD_FACTOR(us) (((us) * 3) / 4) // 3/4 = current to best must be 25% worse
#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
// This must be a power of two:
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index c313d63..96a330c 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -201,7 +201,10 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
/* specified server is known, so try to switch to it */
new_addr = alt_server->host;
mutex_unlock(&dev->alt_servers_lock);
- if (!is_same_server(&dev->cur_server.host, &new_addr)) {
+ if (is_same_server(&dev->cur_server.host, &new_addr)) {
+ /* specified server is current server, so do not switch */
+ result = 0;
+ } else {
struct sockaddr_storage old_server;
dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
@@ -240,13 +243,12 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
/* switch succeeded, fake very low RTT so we don't switch away again soon */
mutex_lock(&dev->alt_servers_lock);
if (is_same_server(&alt_server->host, &new_addr)) {
- alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = 4;
+ alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2]
+ = alt_server->rtts[3] = 4;
+ alt_server->best_count = 100;
}
mutex_unlock(&dev->alt_servers_lock);
}
- } else {
- /* specified server is already working, so do not switch */
- result = 0;
}
}
}
@@ -273,20 +275,18 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
if (cmd == IOCTL_ADD_SRV) {
result = dnbd3_add_server(dev, host);
- if (result == -EEXIST) {
+ if (result == -EEXIST)
dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
- } else if (result == -ENOSPC) {
+ else if (result == -ENOSPC)
dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
- } else {
+ else
dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
- }
} else { // IOCTL_REM_SRV
result = dnbd3_rem_server(dev, host);
- if (result == -ENOENT) {
+ if (result == -ENOENT)
dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
- } else {
+ else
dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
- }
}
break;
}
diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c
index 9b5591d..3f492b1 100644
--- a/src/kernel/dnbd3_main.c
+++ b/src/kernel/dnbd3_main.c
@@ -43,12 +43,12 @@ int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *de
memset(dest, 0, sizeof(*dest));
if (host->type == HOST_IP4) {
- sin4 = (struct sockaddr_in*)dest;
+ sin4 = (struct sockaddr_in *)dest;
sin4->sin_family = AF_INET;
memcpy(&(sin4->sin_addr), host->addr, 4);
sin4->sin_port = host->port;
} else if (host->type == HOST_IP6) {
- sin6 = (struct sockaddr_in6*)dest;
+ sin6 = (struct sockaddr_in6 *)dest;
sin6->sin6_family = AF_INET6;
memcpy(&(sin6->sin6_addr), host->addr, 16);
sin6->sin6_port = host->port;
@@ -65,6 +65,7 @@ int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr
case AF_INET: {
const struct sockaddr_in *sinx = (const struct sockaddr_in *)x;
const struct sockaddr_in *siny = (const struct sockaddr_in *)y;
+
if (sinx->sin_port != siny->sin_port)
return 0;
if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
@@ -74,6 +75,7 @@ int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr
case AF_INET6: {
const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x;
const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y;
+
if (sinx->sin6_port != siny->sin6_port)
return 0;
if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
@@ -160,6 +162,7 @@ int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host)
alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2]
= alt_server->rtts[3] = RTT_UNREACHABLE;
alt_server->failures = 0;
+ alt_server->best_count = 0;
result = 0;
}
}
diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h
index 42b9f58..71c3cf8 100644
--- a/src/kernel/dnbd3_main.h
+++ b/src/kernel/dnbd3_main.h
@@ -40,6 +40,7 @@ typedef struct {
unsigned long rtts[4]; // Last four round trip time measurements in µs
uint16_t protocol_version; // dnbd3 protocol version of this server
uint8_t failures; // How many times the server was unreachable
+ uint8_t best_count; // Number of times server measured best
struct sockaddr_storage host; // Address of server
} dnbd3_alt_server_t;
@@ -93,7 +94,8 @@ extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_stor
extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev);
-extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr, dnbd3_device_t *const dev);
+extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev);
extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host);
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 49a4fe7..1789f8d 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -127,6 +127,7 @@ static int dnbd3_net_discover(void *data)
dnbd3_request_t dnbd3_request;
dnbd3_reply_t dnbd3_reply;
+ dnbd3_alt_server_t *alt;
struct sockaddr_storage host_compare, best_server;
struct msghdr msg;
struct kvec iov[2];
@@ -140,7 +141,7 @@ static int dnbd3_net_discover(void *data)
ktime_t start = 0, end = 0;
unsigned long rtt, best_rtt = 0;
unsigned long irqflags;
- int i, j, isize, fails;
+ int i, j, isize, fails, rtt_threshold;
int turn = 0;
int ready = 0, do_change = 0;
char check_order[NUMBER_SERVERS];
@@ -153,7 +154,9 @@ static int dnbd3_net_discover(void *data)
init_msghdr(msg);
- buf = kmalloc(4096, GFP_KERNEL);
+ BUILD_BUG_ON(sizeof(serialized_buffer_t) > DNBD3_BLOCK_SIZE);
+
+ buf = kmalloc(DNBD3_BLOCK_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -380,8 +383,10 @@ static int dnbd3_net_discover(void *data)
rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1]
+ dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3])
/ 4;
+ dev->alt_servers[i].failures = 0;
+ if (dev->alt_servers[i].best_count > 1)
+ dev->alt_servers[i].best_count -= 2;
}
- dev->alt_servers[i].failures = 0;
mutex_unlock(&dev->alt_servers_lock);
if (best_rtt > rtt) {
@@ -413,6 +418,8 @@ error:
if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
++dev->alt_servers[i].failures;
dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
+ if (dev->alt_servers[i].best_count > 2)
+ dev->alt_servers[i].best_count -= 3;
}
mutex_unlock(&dev->alt_servers_lock);
if (is_same_server(&dev->cur_server.host, &host_compare))
@@ -436,8 +443,21 @@ error:
continue;
}
+ // If best server was repeatedly measured best, lower the switching threshold more
+ mutex_lock(&dev->alt_servers_lock);
+ alt = get_existing_alt_from_addr(&best_server, dev);
+ if (alt != NULL) {
+ if (alt->best_count < 148)
+ alt->best_count += 3;
+ rtt_threshold = 1500 - (alt->best_count * 10);
+ } else {
+ rtt_threshold = 1500;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+
do_change = ready && !is_same_server(&best_server, &dev->cur_server.host)
- && (ktime_to_us(start) & 3) != 0 && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + 1500;
+ && (ktime_to_us(start) & 3) != 0
+ && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold;
if (ready && !do_change && best_sock != NULL) {
spin_lock_irqsave(&dev->blk_lock, irqflags);
@@ -456,8 +476,9 @@ error:
// take server with lowest rtt
// if a (dis)connect is already in progress, we do nothing, this is not panic mode
if (do_change && atomic_cmpxchg(&dev->connection_lock, 0, 1) == 0) {
- dev_info(dnbd3_device_to_dev(dev), "server %pISpc is faster (%lluµs vs. %lluµs)\n", &best_server,
- (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
+ dev_info(dnbd3_device_to_dev(dev), "server %pISpc is faster (%lluµs vs. %lluµs)\n",
+ &best_server,
+ (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
kfree(buf);
dev->better_sock = best_sock; // Take shortcut by continuing to use open connection
put_task_struct(dev->thread_discover);
@@ -880,7 +901,7 @@ static struct socket *dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage
}
if (ret < 0) {
dev_dbg(dnbd3_device_to_dev(dev), "%pISpc: connect timed out (%d, %dms)\n",
- ret, (int)ktime_ms_delta(ktime_get_real(), start));
+ addr, ret, (int)ktime_ms_delta(ktime_get_real(), start));
goto error;
}
}