From 3a7891e26aec9e5d15d21a78cb317926d96f51e9 Mon Sep 17 00:00:00 2001 From: Simon Rettberg Date: Fri, 26 Mar 2021 13:44:47 +0100 Subject: [KERNEL] Implement best_count logic for load balancing Similar logic already exists in the fuse client: Count how many times in a row a server was fastest when measuring RTTs, and lower the switching threshold more the higher the count gets. --- inc/dnbd3/config/client.h | 2 +- src/kernel/blk.c | 24 ++++++++++++------------ src/kernel/dnbd3_main.c | 7 +++++-- src/kernel/dnbd3_main.h | 4 +++- src/kernel/net.c | 35 ++++++++++++++++++++++++++++------- 5 files changed, 49 insertions(+), 23 deletions(-) diff --git a/inc/dnbd3/config/client.h b/inc/dnbd3/config/client.h index f35f673..49d4676 100644 --- a/inc/dnbd3/config/client.h +++ b/inc/dnbd3/config/client.h @@ -8,7 +8,7 @@ #define SOCKET_TIMEOUT_CLIENT_DATA 2 #define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1 -#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse +#define RTT_THRESHOLD_FACTOR(us) (((us) * 3) / 4) // 3/4 = current to best must be 25% worse #define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse #define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds // This must be a power of two: diff --git a/src/kernel/blk.c b/src/kernel/blk.c index c313d63..96a330c 100644 --- a/src/kernel/blk.c +++ b/src/kernel/blk.c @@ -201,7 +201,10 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int /* specified server is known, so try to switch to it */ new_addr = alt_server->host; mutex_unlock(&dev->alt_servers_lock); - if (!is_same_server(&dev->cur_server.host, &new_addr)) { + if (is_same_server(&dev->cur_server.host, &new_addr)) { + /* specified server is current server, so do not switch */ + result = 0; + } else { struct sockaddr_storage old_server; dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n", @@ -240,13 +243,12 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int /* switch succeeded, fake very low RTT so we don't switch away again soon */ mutex_lock(&dev->alt_servers_lock); if (is_same_server(&alt_server->host, &new_addr)) { - alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = 4; + alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] + = alt_server->rtts[3] = 4; + alt_server->best_count = 100; } mutex_unlock(&dev->alt_servers_lock); } - } else { - /* specified server is already working, so do not switch */ - result = 0; } } } @@ -273,20 +275,18 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int if (cmd == IOCTL_ADD_SRV) { result = dnbd3_add_server(dev, host); - if (result == -EEXIST) { + if (result == -EEXIST) dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr); - } else if (result == -ENOSPC) { + else if (result == -ENOSPC) dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr); - } else { + else dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr); - } } else { // IOCTL_REM_SRV result = dnbd3_rem_server(dev, host); - if (result == -ENOENT) { + if (result == -ENOENT) dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr); - } else { + else dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr); - } } break; } diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c index 9b5591d..3f492b1 100644 --- a/src/kernel/dnbd3_main.c +++ b/src/kernel/dnbd3_main.c @@ -43,12 +43,12 @@ int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *de memset(dest, 0, sizeof(*dest)); if (host->type == HOST_IP4) { - sin4 = (struct sockaddr_in*)dest; + sin4 = (struct sockaddr_in *)dest; sin4->sin_family = AF_INET; memcpy(&(sin4->sin_addr), host->addr, 4); sin4->sin_port = host->port; } else if (host->type == HOST_IP6) { - sin6 = (struct sockaddr_in6*)dest; + sin6 = (struct sockaddr_in6 *)dest; sin6->sin6_family = AF_INET6; memcpy(&(sin6->sin6_addr), host->addr, 16); sin6->sin6_port = host->port; @@ -65,6 +65,7 @@ int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr case AF_INET: { const struct sockaddr_in *sinx = (const struct sockaddr_in *)x; const struct sockaddr_in *siny = (const struct sockaddr_in *)y; + if (sinx->sin_port != siny->sin_port) return 0; if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) @@ -74,6 +75,7 @@ int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr case AF_INET6: { const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x; const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y; + if (sinx->sin6_port != siny->sin6_port) return 0; if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) @@ -160,6 +162,7 @@ int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host) alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE; alt_server->failures = 0; + alt_server->best_count = 0; result = 0; } } diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h index 42b9f58..71c3cf8 100644 --- a/src/kernel/dnbd3_main.h +++ b/src/kernel/dnbd3_main.h @@ -40,6 +40,7 @@ typedef struct { unsigned long rtts[4]; // Last four round trip time measurements in µs uint16_t protocol_version; // dnbd3 protocol version of this server uint8_t failures; // How many times the server was unreachable + uint8_t best_count; // Number of times server measured best struct sockaddr_storage host; // Address of server } dnbd3_alt_server_t; @@ -93,7 +94,8 @@ extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_stor extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev); -extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr, dnbd3_device_t *const dev); +extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr, + dnbd3_device_t *const dev); extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host); diff --git a/src/kernel/net.c b/src/kernel/net.c index 49a4fe7..1789f8d 100644 --- a/src/kernel/net.c +++ b/src/kernel/net.c @@ -127,6 +127,7 @@ static int dnbd3_net_discover(void *data) dnbd3_request_t dnbd3_request; dnbd3_reply_t dnbd3_reply; + dnbd3_alt_server_t *alt; struct sockaddr_storage host_compare, best_server; struct msghdr msg; struct kvec iov[2]; @@ -140,7 +141,7 @@ static int dnbd3_net_discover(void *data) ktime_t start = 0, end = 0; unsigned long rtt, best_rtt = 0; unsigned long irqflags; - int i, j, isize, fails; + int i, j, isize, fails, rtt_threshold; int turn = 0; int ready = 0, do_change = 0; char check_order[NUMBER_SERVERS]; @@ -153,7 +154,9 @@ static int dnbd3_net_discover(void *data) init_msghdr(msg); - buf = kmalloc(4096, GFP_KERNEL); + BUILD_BUG_ON(sizeof(serialized_buffer_t) > DNBD3_BLOCK_SIZE); + + buf = kmalloc(DNBD3_BLOCK_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -380,8 +383,10 @@ static int dnbd3_net_discover(void *data) rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4; + dev->alt_servers[i].failures = 0; + if (dev->alt_servers[i].best_count > 1) + dev->alt_servers[i].best_count -= 2; } - dev->alt_servers[i].failures = 0; mutex_unlock(&dev->alt_servers_lock); if (best_rtt > rtt) { @@ -413,6 +418,8 @@ error: if (is_same_server(&dev->alt_servers[i].host, &host_compare)) { ++dev->alt_servers[i].failures; dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE; + if (dev->alt_servers[i].best_count > 2) + dev->alt_servers[i].best_count -= 3; } mutex_unlock(&dev->alt_servers_lock); if (is_same_server(&dev->cur_server.host, &host_compare)) @@ -436,8 +443,21 @@ error: continue; } + // If best server was repeatedly measured best, lower the switching threshold more + mutex_lock(&dev->alt_servers_lock); + alt = get_existing_alt_from_addr(&best_server, dev); + if (alt != NULL) { + if (alt->best_count < 148) + alt->best_count += 3; + rtt_threshold = 1500 - (alt->best_count * 10); + } else { + rtt_threshold = 1500; + } + mutex_unlock(&dev->alt_servers_lock); + do_change = ready && !is_same_server(&best_server, &dev->cur_server.host) - && (ktime_to_us(start) & 3) != 0 && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + 1500; + && (ktime_to_us(start) & 3) != 0 + && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold; if (ready && !do_change && best_sock != NULL) { spin_lock_irqsave(&dev->blk_lock, irqflags); @@ -456,8 +476,9 @@ error: // take server with lowest rtt // if a (dis)connect is already in progress, we do nothing, this is not panic mode if (do_change && atomic_cmpxchg(&dev->connection_lock, 0, 1) == 0) { - dev_info(dnbd3_device_to_dev(dev), "server %pISpc is faster (%lluµs vs. %lluµs)\n", &best_server, - (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt); + dev_info(dnbd3_device_to_dev(dev), "server %pISpc is faster (%lluµs vs. %lluµs)\n", + &best_server, + (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt); kfree(buf); dev->better_sock = best_sock; // Take shortcut by continuing to use open connection put_task_struct(dev->thread_discover); @@ -880,7 +901,7 @@ static struct socket *dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage } if (ret < 0) { dev_dbg(dnbd3_device_to_dev(dev), "%pISpc: connect timed out (%d, %dms)\n", - ret, (int)ktime_ms_delta(ktime_get_real(), start)); + addr, ret, (int)ktime_ms_delta(ktime_get_real(), start)); goto error; } } -- cgit v1.2.3-55-g7522