/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2019 Frederic Robra <frederic@robra.org>
* Parts copyright 2011-2012 Johann Latocha <johann@latocha.de>
*
* This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
* on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
* express or implied. See the GPL for the specific language
* governing rights and limitations.
*
* You should have received a copy of the GPL along with this
* program. If not, go to http://www.gnu.org/licenses/gpl.html
* or write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
*/
#include <net/sock.h>
#include <linux/wait.h>
#include "net.h"
#include "utils.h"
#include "clientconfig.h"
/* Driver-private request operations, mapped onto the block layer's driver in/out ops. */
#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
#define DNBD3_REQ_OP_CONNECT REQ_OP_DRV_OUT
/* Mark req as a special request and store cmd in cmd_flags, shifted left by REQ_FLAG_BITS. */
#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = DNBD3_REQ_OP_SPECIAL | ((cmd) << REQ_FLAG_BITS)
/* Mark req as a connect request carrying CMD_SELECT_IMAGE. */
#define dnbd3_connect(req) (req)->cmd_flags = DNBD3_REQ_OP_CONNECT | ((CMD_SELECT_IMAGE) << REQ_FLAG_BITS)
/* Read back the command stored above REQ_FLAG_BITS in cmd_flags. */
#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS)
/* Turn req into a read of RTT_BLOCK_SIZE bytes starting at sector 0 (the rtt test block). */
#define dnbd3_test_block_to_req(req) \
do { \
(req)->cmd_flags = REQ_OP_READ; \
(req)->__data_len = RTT_BLOCK_SIZE; \
(req)->__sector = 0; \
} while (0)
/* Create a kernel socket in init_net; HOST_IP4 selects AF_INET, every other value AF_INET6. */
#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
/* SOCKET_TIMEOUT_CLIENT_DATA multiplied by HZ (presumably a timeout in jiffies - not used in the visible code). */
#define REQUEST_TIMEOUT (HZ * SOCKET_TIMEOUT_CLIENT_DATA)
/*
 * Prepare the msghdr h for a plain send/receive: no address, no control data,
 * flags MSG_WAITALL | MSG_NOSIGNAL. h is the object itself, not a pointer.
 * Only the fields listed here are written.
 */
#define init_msghdr(h) do { \
h.msg_name = NULL; \
h.msg_namelen = 0; \
h.msg_control = NULL; \
h.msg_controllen = 0; \
h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
} while (0)
/* Wait queue head; it is not referenced by the code visible in this file. */
static DECLARE_WAIT_QUEUE_HEAD(send_wq);
/* Forward declarations for the connect/disconnect helpers defined further down. */
static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server);
static int __dnbd3_socket_connect(struct dnbd3_server * server, struct dnbd3_sock *sock);
static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock);
/**
 * dnbd3_print_server_list - log the initial server and every configured alternative server
 * @dev: device whose server list is printed
 *
 * A slot of dev->alt_servers counts as configured when its host.type is
 * non-zero. This is the same test the rest of this file applies to
 * alt_servers; looking at the first address byte instead would hide
 * servers whose address starts with a zero byte (for example IPv6 ::1).
 */
static void dnbd3_print_server_list(struct dnbd3_device *dev)
{
	int i;

	info_server(dev, &dev->initial_server, "initial server is");
	for (i = 0; i < NUMBER_SERVERS; i++) {
		if (dev->alt_servers[i].host.type == 0) {
			continue; /* unused slot */
		}
		info_server(dev, &dev->alt_servers[i], "alternative server is");
	}
}
/*
 * Pack a device minor, a dnbd3 command and a socket number into one 64-bit
 * value: minor shifted left by 32, command shifted left by 16, socket number
 * in the lowest 16 bits.
 */
static inline uint64_t dnbd3_to_wq_signal(int minor, uint16_t dnbd3_cmd, uint16_t sock_nr)
{
	uint64_t packed = (uint64_t) minor << 32;

	packed |= (uint32_t) dnbd3_cmd << 16;
	packed |= sock_nr;
	return packed;
}
/* Build a 64-bit request handle: tag in the upper 32 bits, cookie in the lower 32 bits. */
static inline uint64_t dnbd3_to_handle(uint32_t tag, uint32_t cookie)
{
	uint64_t handle = tag;

	handle <<= 32;
	return handle | cookie;
}
/* Extract the tag, i.e. the upper 32 bits, from a request handle. */
static inline uint32_t dnbd3_tag_from_handle(uint64_t handle)
{
	const uint64_t upper = handle >> 32;

	return (uint32_t) upper;
}
/* Extract the cookie, i.e. the lower 32 bits, from a request handle. */
static inline uint32_t dnbd3_cookie_from_handle(uint64_t handle)
{
	return (uint32_t)(handle & 0xFFFFFFFFULL);
}
/**
 * dnbd3_send_request - translate a block request into a dnbd3 request and send it
 * @sock: socket to send on; sock->pending and sock->cookie are updated
 * @req: request whose operation selects the dnbd3 command
 * @cmd: per-request driver data that receives the cookie, or NULL
 *
 * Supported operations are REQ_OP_READ (CMD_GET_BLOCK), DNBD3_REQ_OP_SPECIAL
 * (command taken from the request flags, no payload) and DNBD3_REQ_OP_CONNECT
 * (CMD_SELECT_IMAGE with a serialized payload).
 *
 * sock->pending is set to @req on entry and only cleared after a complete
 * send; on every failure path it keeps pointing at @req.
 *
 * Return: the number of bytes sent on success, -EIO for an unsupported
 * operation, otherwise the (short or negative) result of kernel_sendmsg().
 */
int dnbd3_send_request(struct dnbd3_sock *sock, struct request *req, struct dnbd3_cmd *cmd)
{
	dnbd3_request_t dnbd3_request;
	struct msghdr msg;
	struct kvec iov[2];
	size_t iov_num = 1;
	size_t send_len;
	int result;
	uint32_t tag;
	uint64_t handle;
	serialized_buffer_t payload_buffer;

	sock->pending = req;
	init_msghdr(msg);
	/*
	 * Not every case below fills every header field (offset is only set
	 * for reads). Zero the header so no uninitialised stack bytes are
	 * sent to the server.
	 */
	memset(&dnbd3_request, 0, sizeof(dnbd3_request));
	dnbd3_request.magic = dnbd3_packet_magic;
	switch (req_op(req)) {
	case REQ_OP_READ:
		debug_sock(sock, "request operation read");
		dnbd3_request.cmd = CMD_GET_BLOCK;
		dnbd3_request.offset = blk_rq_pos(req) << 9; // *512
		dnbd3_request.size = blk_rq_bytes(req); // bytes left to complete entire request
		break;
	case DNBD3_REQ_OP_SPECIAL:
		debug_sock(sock, "request operation special");
		dnbd3_request.cmd = dnbd3_priv_to_cmd(req);
		dnbd3_request.size = 0;
		break;
	case DNBD3_REQ_OP_CONNECT:
		debug_sock(sock, "request operation connect to %s", sock->device->imgname);
		dnbd3_request.cmd = CMD_SELECT_IMAGE;
		serializer_reset_write(&payload_buffer);
		serializer_put_uint16(&payload_buffer, PROTOCOL_VERSION);
		serializer_put_string(&payload_buffer, sock->device->imgname);
		serializer_put_uint16(&payload_buffer, sock->device->rid);
		serializer_put_uint8(&payload_buffer, 0); // is_server = false
		iov[1].iov_base = &payload_buffer;
		dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&payload_buffer);
		iov_num = 2;
		break;
	default:
		return -EIO;
	}
	sock->cookie++;
	if (cmd != NULL) {
		/* the handle carries the unique blk-mq tag and the cookie */
		cmd->cookie = sock->cookie;
		tag = blk_mq_unique_tag(req);
		handle = dnbd3_to_handle(tag, sock->cookie);
	} else {
		handle = sock->cookie;
	}
	memcpy(&dnbd3_request.handle, &handle, sizeof(handle));
	fixup_request(dnbd3_request);
	iov[0].iov_base = &dnbd3_request;
	iov[0].iov_len = sizeof(dnbd3_request);
	send_len = iov_num == 1 ? sizeof(dnbd3_request) : iov[0].iov_len + iov[1].iov_len;
	result = kernel_sendmsg(sock->sock, &msg, iov, iov_num, send_len);
	if (result != send_len) {
		error_sock(sock, "connection to server lost");
		sock->server->failures++;
		goto error;
	}
	sock->pending = NULL;
error:
	return result;
}
/**
 * dnbd3_send_request_cmd - send a single dnbd3 command that has no block request behind it
 * @sock: socket to send on
 * @dnbd3_cmd: CMD_KEEPALIVE, CMD_GET_SERVERS, CMD_SELECT_IMAGE or CMD_GET_BLOCK
 *             (CMD_GET_BLOCK requests the rtt test block)
 *
 * A temporary struct request is allocated, marked with the command and handed
 * to dnbd3_send_request() while sock->tx_lock is held. The request is freed
 * before returning.
 *
 * Return: number of bytes sent (> 0) on success, -EIO if the allocation
 * failed, -EINVAL for an unsupported command, otherwise the result of
 * dnbd3_send_request().
 */
static int dnbd3_send_request_cmd(struct dnbd3_sock *sock, uint16_t dnbd3_cmd)
{
	int result;
	struct request *req = kmalloc(sizeof(*req), GFP_KERNEL);

	if (!req) {
		error_sock(sock, "kmalloc failed");
		return -EIO;
	}
	switch (dnbd3_cmd) {
	case CMD_KEEPALIVE:
	case CMD_GET_SERVERS:
		dnbd3_cmd_to_priv(req, dnbd3_cmd);
		break;
	case CMD_SELECT_IMAGE:
		dnbd3_connect(req);
		break;
	case CMD_GET_BLOCK:
		dnbd3_test_block_to_req(req);
		break;
	default:
		warn_sock(sock, "unsupported command for blocking %d", dnbd3_cmd);
		result = -EINVAL;
		goto out_free;
	}
	mutex_lock(&sock->tx_lock);
	result = dnbd3_send_request(sock, req, NULL);
	/*
	 * dnbd3_send_request() leaves sock->pending pointing at req when it
	 * fails. req is freed below, so do not leave a dangling pointer.
	 */
	if (sock->pending == req) {
		sock->pending = NULL;
	}
	mutex_unlock(&sock->tx_lock);
out_free:
	kfree(req);
	return result;
}
/**
 * dnbd3_receive_cmd - receive and validate one dnbd3 reply header
 * @sock: socket to read from
 * @reply: buffer that receives the header
 *
 * Return: number of bytes received (> 0) when a complete header with the
 * right magic and a non-zero command was read, the result of kernel_recvmsg()
 * if that is <= 0, or -EIO for a short or invalid header.
 */
static int dnbd3_receive_cmd(struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	int result;
	struct msghdr msg;
	struct kvec iov;

	init_msghdr(msg);
	iov.iov_base = reply;
	iov.iov_len = sizeof(dnbd3_reply_t);
	result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
	if (result <= 0) {
		return result;
	}
	if ((size_t) result != sizeof(dnbd3_reply_t)) {
		/* the rest of *reply was not written, do not interpret it */
		error_sock(sock, "receive cmd got incomplete header");
		return -EIO;
	}
	/*
	 * Fix up the header that was just received. The argument is wrapped in
	 * its own parentheses so it stays a single operand however the macro
	 * expands it.
	 */
	fixup_reply((*reply));
	// check error
	if (reply->magic != dnbd3_packet_magic) {
		error_sock(sock, "receive cmd wrong magic packet");
		return -EIO;
	}
	if (reply->cmd == 0) {
		error_sock(sock, "receive command was 0");
		return -EIO;
	}
	return result;
}
/**
 * dnbd3_receive_cmd_get_block_mq - receive the payload of a CMD_GET_BLOCK reply into its request
 * @dev: device the reply belongs to
 * @sock: socket to read the payload from
 * @reply: reply header; its handle identifies the request (tag and cookie)
 *
 * The request is looked up through the blk-mq tag stored in the handle and
 * checked against the cookie stored in its dnbd3_cmd. The payload is then
 * received segment by segment, with all signals except SIGKILL blocked during
 * each receive. The request is ended with status 0 once every segment was
 * filled.
 *
 * Return: length of the last received segment (> 0) on success, -EIO for an
 * unknown or mismatching request or a short read, otherwise the result of
 * kernel_recvmsg() (0 or negative). The request is not ended on failure.
 */
static int dnbd3_receive_cmd_get_block_mq(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct dnbd3_cmd *cmd;
	struct msghdr msg;
	struct request *req = NULL;
	struct kvec iov;
	struct req_iterator iter;
	struct bio_vec bvec_inst;
	struct bio_vec *bvec = &bvec_inst;
	sigset_t blocked, oldset;
	void *kaddr;
	uint32_t tag, cookie;
	uint16_t hwq;
	int result = 0;
	uint64_t handle;

	init_msghdr(msg);
	memcpy(&handle, &reply->handle, sizeof(handle));
	cookie = dnbd3_cookie_from_handle(handle);
	tag = dnbd3_tag_from_handle(handle);
	hwq = blk_mq_unique_tag_to_hwq(tag);
	if (hwq < dev->tag_set.nr_hw_queues) {
		req = blk_mq_tag_to_rq(dev->tag_set.tags[hwq], blk_mq_unique_tag_to_tag(tag));
	}
	if (!req || !blk_mq_request_started(req)) {
		dev_err(disk_to_dev(dev->disk), "unexpected reply (%d) %p\n", tag, req);
		return -EIO;
	}
	cmd = blk_mq_rq_to_pdu(req);
	mutex_lock(&cmd->lock);
	if (cmd->cookie != cookie) {
		dev_err(disk_to_dev(dev->disk), "double reply on req %p, cookie %u, handle cookie %u\n",
				req, cmd->cookie, cookie);
		mutex_unlock(&cmd->lock);
		return -EIO;
	}
	rq_for_each_segment(bvec_inst, req, iter) {
		siginitsetinv(&blocked, sigmask(SIGKILL));
		sigprocmask(SIG_SETMASK, &blocked, &oldset);
		kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
		iov.iov_base = kaddr;
		iov.iov_len = bvec->bv_len;
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
		kunmap(bvec->bv_page);
		sigprocmask(SIG_SETMASK, &oldset, NULL);
		if (result != bvec->bv_len) {
			error_sock(sock, "could not receive form net to block layer");
			mutex_unlock(&cmd->lock);
			/*
			 * A short read would otherwise be returned as a
			 * positive value, which the caller treats as success.
			 * 0 and negative results are passed through unchanged.
			 */
			if (result > 0) {
				result = -EIO;
			}
			return result;
		}
	}
	mutex_unlock(&cmd->lock);
	blk_mq_end_request(req, 0);
	return result;
}
/**
 * dnbd3_receive_cmd_get_block_test - receive and discard the payload of the rtt test block
 * @sock: socket to read from
 * @reply: reply header; reply->size bytes are read
 *
 * Return: RTT_BLOCK_SIZE when exactly that many bytes were received,
 * -ENOMEM if the temporary buffer could not be allocated, -EIO on a short
 * read, otherwise the result of kernel_recvmsg() (0 or negative).
 */
static int dnbd3_receive_cmd_get_block_test(struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct msghdr msg;
	struct kvec iov;
	int result;
	char *buf = kmalloc(reply->size, GFP_KERNEL);

	if (!buf) {
		error_sock(sock, "kmalloc failed");
		return -ENOMEM;
	}
	init_msghdr(msg);
	iov.iov_base = buf;
	iov.iov_len = reply->size;
	result = kernel_recvmsg(sock->sock, &msg, &iov, 1, reply->size, msg.msg_flags);
	if (result != RTT_BLOCK_SIZE) {
		error_sock(sock, "receive test block failed");
		/* callers treat every positive value as success */
		if (result > 0) {
			result = -EIO;
		}
	}
	kfree(buf);
	return result;
}
/**
 * dnbd3_receive_cmd_get_servers - receive the payload of a CMD_GET_SERVERS reply
 * @dev: device whose new_servers list is filled
 * @sock: socket to read from
 * @reply: reply header; after the server entries were read it is reused as
 *         scratch buffer to drain surplus payload
 *
 * At most NUMBER_SERVERS entries are stored in dev->new_servers and counted
 * in dev->new_servers_num. If dev->use_server_provided_alts is not set, or if
 * the server sent more than fits, the remaining payload is read and thrown
 * away so the stream stays in sync. dev->device_lock is held for the whole
 * operation and released on every return path.
 *
 * Return: a positive value on success (1 if there was nothing to receive),
 * -EIO on a short read of the server entries, otherwise the result of
 * kernel_recvmsg() (0 or negative).
 */
static int dnbd3_receive_cmd_get_servers(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct msghdr msg;
	struct kvec iov;
	/* return true if did not receive servers, not an error*/
	int result = 1;
	int count, remaining;

	init_msghdr(msg);
	debug_sock(sock, "get servers received");
	mutex_lock(&dev->device_lock);
	if (!dev->use_server_provided_alts) {
		remaining = reply->size;
		goto consume_payload;
	}
	dev->new_servers_num = 0;
	count = MIN(NUMBER_SERVERS, reply->size / sizeof(dnbd3_server_entry_t));
	if (count != 0) {
		iov.iov_base = dev->new_servers;
		iov.iov_len = count * sizeof(dnbd3_server_entry_t);
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags);
		if (result <= 0) {
			error_sock(sock, "failed to receive get servers %d", result);
			goto out_unlock;
		} else if (result != (count * sizeof(dnbd3_server_entry_t))) {
			error_sock(sock, "failed to get servers");
			result = -EIO;
			goto out_unlock;
		}
		dev->new_servers_num = count;
	}
	// If there were more servers than accepted, remove the remaining data from the socket buffer
	remaining = reply->size - (count * sizeof(dnbd3_server_entry_t));
consume_payload:
	while (remaining > 0) {
		count = MIN(sizeof(dnbd3_reply_t), remaining); // Abuse the reply struct as the receive buffer
		iov.iov_base = reply;
		iov.iov_len = count;
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, count, msg.msg_flags);
		if (result <= 0) {
			error_sock(sock, "failed to receive payload from get servers");
			goto out_unlock;
		}
		/* account for what was drained, otherwise the loop never ends */
		remaining -= result;
	}
out_unlock:
	mutex_unlock(&dev->device_lock);
	return result;
}
/**
 * dnbd3_receive_cmd_latest_rid - receive the payload of a CMD_LATEST_RID reply
 * @dev: device whose update_available flag is refreshed
 * @sock: socket to read from
 * @reply: reply header; its size has to be exactly 2
 *
 * Sets dev->update_available when the received revision id is greater than
 * dev->rid.
 *
 * Return: number of bytes received (> 0) on success, -EIO for a wrong payload
 * size, otherwise the result of kernel_recvmsg() (0 or negative).
 */
static int dnbd3_receive_cmd_latest_rid(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct msghdr msg;
	struct kvec iov;
	uint16_t latest_rid;
	int received;

	init_msghdr(msg);
	debug_sock(sock, "latest rid received");

	if (reply->size != 2) {
		error_sock(sock, "failed to get latest rid, wrong size");
		return -EIO;
	}

	iov.iov_base = &latest_rid;
	iov.iov_len = sizeof(latest_rid);
	received = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
	if (received <= 0) {
		error_sock(sock, "failed to receive latest rid");
		return received;
	}

	latest_rid = net_order_16(latest_rid);
	debug_sock(sock, "latest rid of %s is %d (currently using %d)", dev->imgname, (int)latest_rid, (int)dev->rid);
	dev->update_available = latest_rid > dev->rid;
	return received;
}
/**
 * dnbd3_receive_cmd_select_image - receive and check the payload of a CMD_SELECT_IMAGE reply
 * @dev: device the image belongs to
 * @sock: socket to read from; sock->server->protocol_version is updated
 * @reply: reply header; reply->size bytes of payload are read
 *
 * The payload carries protocol version, image name, revision id and image
 * size. The first reported size is stored in dev->reported_size and used to
 * set the disk capacity; later replies must report the same size.
 *
 * Return: number of payload bytes received (> 0) on success, -EIO if the
 * payload is too large, incomplete or fails a check, otherwise the result of
 * kernel_recvmsg() (0 or negative).
 */
static int dnbd3_receive_cmd_select_image(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct kvec iov;
	uint16_t rid;
	char *name;
	int result;
	struct msghdr msg;
	serialized_buffer_t payload_buffer;
	uint64_t reported_size;

	init_msghdr(msg);
	debug_sock(sock, "select image received");
	/*
	 * reply->size comes from the server and is used as the receive length
	 * into a buffer on the stack, so it must never exceed that buffer.
	 * NOTE(review): sizeof(payload_buffer) is the whole serializer object;
	 * if its usable data area is smaller, tighten this bound.
	 */
	if (reply->size > sizeof(payload_buffer)) {
		error_sock(sock, "select image payload is larger than the receive buffer");
		return -EIO;
	}
	// receive reply payload
	iov.iov_base = &payload_buffer;
	iov.iov_len = reply->size;
	result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
	if (result <= 0) {
		error_sock(sock, "failed to receive select image %d", result);
		return result;
	} else if (result != reply->size) {
		error_sock(sock, "could not read CMD_SELECT_IMAGE payload on handshake, size is %d and should be %d",
				result, reply->size);
		return -EIO;
	}
	// handle/check reply payload
	serializer_reset_read(&payload_buffer, reply->size);
	sock->server->protocol_version = serializer_get_uint16(&payload_buffer);
	if (sock->server->protocol_version < MIN_SUPPORTED_SERVER) {
		error_sock(sock, "server version is lower than min supported version");
		return -EIO;
	}
	//TODO compare RID
	name = serializer_get_string(&payload_buffer);
	rid = serializer_get_uint16(&payload_buffer);
	/*
	 * Rejected only when both rid and name differ (unchanged behaviour).
	 * The NULL test keeps strcmp() from being called without a string in
	 * case the serializer could not extract one.
	 */
	if (dev->rid != rid && (name == NULL || strcmp(name, dev->imgname) != 0)) {
		error_sock(sock, "server offers image '%s', requested '%s'", name ? name : "(null)", dev->imgname);
		return -EIO;
	}
	reported_size = serializer_get_uint64(&payload_buffer);
	if (!dev->reported_size) {
		if (reported_size < 4096) {
			error_sock(sock, "reported size by server is < 4096");
			return -EIO;
		}
		dev->reported_size = reported_size;
		set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
	} else if (dev->reported_size != reported_size) {
		error_sock(sock, "reported size by server is %llu but should be %llu", reported_size, dev->reported_size);
		return -EIO;
	}
	return result;
}
static void dnbd3_receive_worker(struct work_struct *work)
{
struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, receive_worker);
struct dnbd3_device *dev = sock->device;
dnbd3_reply_t dnbd3_reply;
int result;
debug_sock(sock, "receive worker is starting");
while(1) { // loop until socket returns 0
result = dnbd3_receive_cmd(sock, &dnbd3_reply);
if (result == -EAGAIN) {
continue;
} else if (result <= 0) {
error_sock(sock, "connection to server lost %d", result);
goto error;
}
switch (dnbd3_reply.cmd) {
case CMD_GET_BLOCK:
result = dnbd3_receive_cmd_get_block_mq(dev, sock, &dnbd3_reply);
if (result <= 0) {
error_sock(sock, "receive cmd get block mq failed %d", result);
goto error;
}
continue; // we do not need to wake up anyone, wait for next cmd (block)
case CMD_GET_SERVERS:
result = dnbd3_receive_cmd_get_servers(dev, sock, &dnbd3_reply);
if (result <= 0) {
error_sock(sock, "receive cmd get servers failed %d", result);
goto error;
}
break;
case CMD_LATEST_RID:
result = dnbd3_receive_cmd_latest_rid(dev, sock, &dnbd3_reply);
if (result <= 0) {
error_sock(sock, "receive cmd latest rid failed %d", result);
goto error;
}
break;
case CMD_KEEPALIVE:
if (dnbd3_reply.size != 0) {
error_sock(sock, "got keep alive packet with payload");
goto error;
}
debug_sock(sock, "keep alive received");
break;
case CMD_SELECT_IMAGE:
result = dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply);
if (result <= 0) {
error_sock(sock, "receive cmd select image failed %d", result);
goto error;
}
break;
default:
warn_sock(sock, "unknown command received");
break;
}
error:
if (result == 0) {
info_sock(sock, "result is 0, socket seems to be down");
sock->panic = true;
break; //the socket seems to be down
} else if (result < 0) {
sock->server->failures++; // discovery takes care of to many failures
warn_sock(sock, "receive error happened %d, total failures %d", result, sock->server->failures);
}
debug_sock(sock, "receive completed, waiting for next receive");
}
debug_sock(sock, "receive work queue is stopped");
}
static void dnbd3_timer(struct timer_list *arg)
{
struct dnbd3_device *dev = container_of(arg, struct dnbd3_device, timer);
int i;
queue_work(dnbd3_wq, &dev->panic_worker);
if (dev->timer_count % TIMER_INTERVAL_KEEPALIVE_PACKET == 0) {
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (dnbd3_is_sock_alive(dev->socks[i])) {
queue_work(dnbd3_wq, &dev->socks[i].keepalive_worker);
}
}
}
if (dev->timer_count % TIMER_INTERVAL_PROBE_NORMAL == 4) { // wait for 4 seconds
queue_work(dnbd3_wq, &dev->discovery_worker);
}
dev->timer_count++;
dev->timer.expires = jiffies + HZ;
add_timer(&dev->timer);
}
/**
 * dnbd3_keepalive_worker - work item that sends one CMD_KEEPALIVE on a socket
 * @work: the keepalive_worker member of a struct dnbd3_sock
 */
static void dnbd3_keepalive_worker(struct work_struct *work)
{
	struct dnbd3_sock *keepalive_sock;

	keepalive_sock = container_of(work, struct dnbd3_sock, keepalive_worker);
	debug_sock(keepalive_sock, "starting keepalive worker");
	/* the result of the send is not evaluated here */
	(void) dnbd3_send_request_cmd(keepalive_sock, CMD_KEEPALIVE);
}
static struct dnbd3_server *dnbd3_find_best_alt_server(struct dnbd3_device *dev) {
int i, j;
uint64_t rtt = 0;
uint64_t best_rtt = RTT_UNREACHABLE;
uint64_t current_best_rtt = RTT_UNREACHABLE;
struct dnbd3_server *best_alt_server = NULL;
struct dnbd3_server *better_alt_server = NULL;
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (dnbd3_is_sock_alive(dev->socks[i])) {
rtt = (dev->socks[i].server->rtts[0] + dev->socks[i].server->rtts[1] + dev->socks[i].server->rtts[2] + dev->socks[i].server->rtts[3]) / 4;
if (rtt <= current_best_rtt) {
current_best_rtt = rtt;
}
}
}
best_rtt = RTT_THRESOULD_LIMIT(current_best_rtt);
debug_dev(dev, "best connected rtt is %llu, searching for rtt better than %llu", current_best_rtt, best_rtt);
for (i = 0; i < NUMBER_SERVERS; i++) {
if (dev->alt_servers[i].host.type != 0) {
rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4;
if (rtt <= best_rtt) {
better_alt_server = &dev->alt_servers[i];
for (j = 0; j < NUMBER_CONNECTIONS; j++) { // check if already connected
if (better_alt_server == dev->socks[j].server) {
better_alt_server = NULL; // found already connected server
break;
}
}
if (better_alt_server) {
best_alt_server = better_alt_server;
best_rtt = rtt;
}
}
}
}
if (best_alt_server) {
info_server(dev, best_alt_server, "found best alt server with rtt %llu", best_rtt);
} else {
debug_dev(dev, "did not find any alternative server");
}
return best_alt_server;
}
/**
 * dnbd3_better_rtt - check whether a new server is fast enough to replace an existing one
 * @new_server: candidate server
 * @existing_server: server currently in use
 *
 * Both servers are rated by the average of their four stored rtt samples.
 *
 * Return: true if the candidate's average is below
 * RTT_THRESHOLD_FACTOR() of the existing server's average.
 */
static bool dnbd3_better_rtt(struct dnbd3_server *new_server, struct dnbd3_server *existing_server)
{
	uint64_t candidate_avg = (new_server->rtts[0] + new_server->rtts[1]
			+ new_server->rtts[2] + new_server->rtts[3]) / 4;
	uint64_t current_avg = (existing_server->rtts[0] + existing_server->rtts[1]
			+ existing_server->rtts[2] + existing_server->rtts[3]) / 4;

	return candidate_avg < RTT_THRESHOLD_FACTOR(current_avg);
}
static void dnbd3_adjust_connections(struct dnbd3_device *dev) {
int i;
int sock_alive = 0;
uint64_t rtt;
uint64_t best_rtt = RTT_UNREACHABLE;
struct dnbd3_server *server, *existing_server;
// connect empty sockets
sock_alive = 0;
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (!dnbd3_is_sock_alive(dev->socks[i])) {
server = dnbd3_find_best_alt_server(dev);
if (server) {
if (dnbd3_socket_connect(dev, server) == 0) {
sock_alive++;
} else {
warn_server(dev, server, "failed to connect");
}
}
} else {
sock_alive++;
}
}
// replace socket with better server
if (sock_alive == NUMBER_CONNECTIONS) {
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (dnbd3_is_sock_alive(dev->socks[i])) {
server = dnbd3_find_best_alt_server(dev);
existing_server = dev->socks[i].server;
if (server && dnbd3_better_rtt(server, dev->socks[i].server)) {
dnbd3_socket_disconnect(dev, NULL, &dev->socks[i]);
if (dnbd3_socket_connect(dev, server) != 0) {
warn_server(dev, server, "failed to connect");
dnbd3_socket_connect(dev, existing_server);
}
}
}
}
}
// remove a socket if it is much slower than the others
if (sock_alive > 1) {
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (dnbd3_is_sock_alive(dev->socks[i])) {
rtt = (dev->socks[i].server->rtts[0] + dev->socks[i].server->rtts[1] + dev->socks[i].server->rtts[2] + dev->socks[i].server->rtts[3]) / 4;
if (rtt <= best_rtt) {
best_rtt = rtt;
}
}
}
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (dnbd3_is_sock_alive(dev->socks[i])) {
rtt = (dev->socks[i].server->rtts[0] + dev->socks[i].server->rtts[1] + dev->socks[i].server->rtts[2] + dev->socks[i].server->rtts[3]) / 4;
if (rtt > RTT_THRESOULD_LIMIT(best_rtt)) {
info_sock(&dev->socks[i], "removing connection with rtt %llu", rtt);
dnbd3_socket_disconnect(dev, NULL, &dev->socks[i]);
sock_alive--;
}
}
}
}
debug_dev(dev, "connected to %d/%d sockets", sock_alive, NUMBER_CONNECTIONS);
}
static int dnbd3_panic_connect(struct dnbd3_device *dev) {
int result, i;
result = dnbd3_socket_connect(dev, &dev->initial_server);
if (result) {
for (i = 0; i < NUMBER_SERVERS; i++) {
if (dev->alt_servers[i].host.type != 0) {
result = dnbd3_socket_connect(dev, &dev->alt_servers[i]);
if (!result) {
info_server(dev, &dev->alt_servers[i], "found server to connect to");
break;
}
}
}
}
return result;
}
/**
 * dnbd3_panic_worker - work item that recovers from failed sockets
 * @work: the panic_worker member of a struct dnbd3_device
 *
 * If a socket is flagged as panicked it is disconnected and, when a different
 * alternative server is available, a connection to that server is opened
 * instead. If no socket is panicked but none is alive either, the best
 * alternative server is tried. If there is still no alive socket afterwards,
 * dnbd3_panic_connect() tries every known server.
 */
static void dnbd3_panic_worker(struct work_struct *work)
{
	struct dnbd3_device *dev = container_of(work, struct dnbd3_device, panic_worker);
	struct dnbd3_sock *broken = NULL;
	struct dnbd3_server *replacement, *broken_server;
	int alive = 0;
	int n;

	/* the last panicked slot wins; panicked slots are not counted as alive */
	for (n = 0; n < NUMBER_CONNECTIONS; n++) {
		if (dev->socks[n].panic) {
			broken = &dev->socks[n];
			continue;
		}
		if (dnbd3_is_sock_alive(dev->socks[n])) {
			alive++;
		}
	}

	if (broken) {
		warn_sock(broken, "panicked, connections still alive %d", alive);
		broken_server = broken->server;
		replacement = dnbd3_find_best_alt_server(dev);
		dnbd3_socket_disconnect(dev, broken_server, broken);
		if (replacement != NULL && replacement != broken_server) {
			info_server(dev, replacement, "found replacement");
			if (!dnbd3_socket_connect(dev, replacement)) {
				alive++;
			}
		} else if (alive > 0) {
			info_dev(dev, "found no replacement server but still connected to %d servers", alive);
		}
	} else if (alive == 0) {
		replacement = dnbd3_find_best_alt_server(dev);
		if (replacement != NULL) {
			info_server(dev, replacement, "reconnect to server");
			if (!dnbd3_socket_connect(dev, replacement)) {
				alive++;
			}
		}
	}

	if (alive != 0) {
		return;
	}

	warn_dev(dev, "did not find a good server, trying to connect to any available server");
	if (dnbd3_panic_connect(dev)) {
		error_dev(dev, "could not connect to any server");
	} else {
		info_dev(dev, "found server to connect to");
	}
}
/**
 * dnbd3_meassure_rtt - measure the round trip time to a server
 * @dev: device the measurement is made for
 * @server: server to measure; one slot of its rtts array is overwritten
 *
 * Opens a temporary connection, performs the CMD_SELECT_IMAGE handshake and
 * then times how long it takes to request and receive the test block. The
 * result in microseconds, or RTT_UNREACHABLE if any step failed, is stored in
 * server->rtts[dev->discovery_count % 4]. The temporary socket is closed
 * before returning.
 *
 * Return: a positive value on success; 0 or a negative value on failure, in
 * which case server->failures is incremented.
 */
static int dnbd3_meassure_rtt(struct dnbd3_device *dev, struct dnbd3_server *server)
{
	struct timeval start, end;
	dnbd3_reply_t reply;
	int result;
	uint64_t rtt = RTT_UNREACHABLE;
	struct dnbd3_sock sock = {
		.sock_nr = NUMBER_CONNECTIONS,
		.sock = NULL,
		.device = dev,
		.server = server
	};

	/* dnbd3_send_request_cmd() takes sock.tx_lock, so it has to be a valid mutex */
	mutex_init(&sock.tx_lock);
	result = __dnbd3_socket_connect(server, &sock);
	if (result) {
		error_sock(&sock, "socket connect failed in rtt measurement");
		goto error;
	}
	result = dnbd3_send_request_cmd(&sock, CMD_SELECT_IMAGE);
	if (result <= 0) {
		error_sock(&sock, "request select image failed in rtt measurement");
		goto error;
	}
	result = dnbd3_receive_cmd(&sock, &reply);
	if (result <= 0) {
		error_sock(&sock, "receive select image failed in rtt measurement");
		goto error;
	}
	if (reply.magic != dnbd3_packet_magic || reply.cmd != CMD_SELECT_IMAGE || reply.size < 4) {
		error_sock(&sock, "receive select image wrong header in rtt measurement");
		result = -EIO;
		goto error;
	}
	result = dnbd3_receive_cmd_select_image(dev, &sock, &reply);
	if (result <= 0) {
		error_sock(&sock, "receive data select image failed in rtt measurement");
		goto error;
	}
	do_gettimeofday(&start); // start rtt measurement
	result = dnbd3_send_request_cmd(&sock, CMD_GET_BLOCK);
	if (result <= 0) {
		error_sock(&sock, "request test block failed in rtt measurement");
		goto error;
	}
	result = dnbd3_receive_cmd(&sock, &reply);
	/* only look at the header if it was actually received */
	if (result <= 0 || reply.magic != dnbd3_packet_magic
			|| reply.cmd != CMD_GET_BLOCK || reply.size != RTT_BLOCK_SIZE) {
		error_sock(&sock, "receive header cmd test block failed in rtt measurement");
		result = -EIO;
		goto error;
	}
	result = dnbd3_receive_cmd_get_block_test(&sock, &reply);
	if (result <= 0) {
		error_sock(&sock, "receive test block failed in rtt measurement");
		goto error;
	}
	do_gettimeofday(&end); // end rtt measurement
	rtt = (uint64_t)((end.tv_sec - start.tv_sec) * 1000000ull + (end.tv_usec - start.tv_usec));
	info_sock(&sock, "new rrt is %llu", rtt);
error:
	/* on failure rtt still holds RTT_UNREACHABLE */
	sock.server->rtts[dev->discovery_count % 4] = rtt;
	if (result <= 0) {
		server->failures++;
	}
	if (sock.sock) {
		kernel_sock_shutdown(sock.sock, SHUT_RDWR);
		sock.server = NULL;
		sock_release(sock.sock);
		sock.sock = NULL;
	}
	mutex_destroy(&sock.tx_lock);
	return result;
}
/*
 * Work item (dev->discovery_worker): merges the servers stored in
 * dev->new_servers into dev->alt_servers, measures the rtt of every
 * configured alternative server and finally lets
 * dnbd3_adjust_connections() rebalance the sockets.
 */
static void dnbd3_discovery_worker(struct work_struct *work)
{
struct dnbd3_device *dev = container_of(work, struct dnbd3_device, discovery_worker);
int i, j;
struct dnbd3_server *existing_server, *free_server, *failed_server;
dnbd3_server_entry_t *new_server;
debug_dev(dev, "starting discovery worker new server num is %d", dev->new_servers_num);
/* NOTE(review): new_servers_num is tested here before device_lock is taken */
if (dev->new_servers_num) {
mutex_lock(&dev->device_lock);
for (i = 0; i < dev->new_servers_num; i++) {
new_server = &dev->new_servers[i];
/* entries with any other host type are ignored */
if (new_server->host.type == HOST_IP4 || new_server->host.type == HOST_IP6) {
existing_server = NULL;
free_server = NULL;
failed_server = NULL;
// find servers in alt servers
/*
 * Classify every slot: same type/port/address as the new entry,
 * unused (host.type == 0), or more than 20 failures. There is no
 * break, so for each class the last matching slot is remembered.
 */
for (j = 0; j < NUMBER_SERVERS; j++) {
if ((new_server->host.type == dev->alt_servers[j].host.type)
&& (new_server->host.port == dev->alt_servers[j].host.port)
&& (0 == memcmp(new_server->host.addr, dev->alt_servers[j].host.addr,
(new_server->host.type == HOST_IP4 ? 4 : 16)))) {
existing_server = &dev->alt_servers[j];
} else if (dev->alt_servers[j].host.type == 0) {
free_server = &dev->alt_servers[j];
} else if (dev->alt_servers[j].failures > 20) {
failed_server = &dev->alt_servers[j];
}
}
/* already known: drop it if removal is requested, otherwise leave it untouched */
if (existing_server) {
if (new_server->failures == 1) { // remove is requested
info_server(dev, new_server, "remove server is requested");
dnbd3_socket_disconnect(dev, existing_server, NULL); // TODO what to do when only one connection?
existing_server->host.type = 0;
}
// existing_server->failures = 0; // reset failure count
continue;
} else if (free_server) {
/* unknown server: prefer an unused slot ... */
free_server->host = new_server->host;
} else if (failed_server) {
/* ... otherwise overwrite a slot with more than 20 failures */
failed_server->host = new_server->host;
free_server = failed_server;
} else {
//no server found to replace
continue;
}
/* free_server now points at the slot that received the new host; reset its statistics */
info_server(dev, free_server, "got new alt server");
free_server->failures = 0;
free_server->protocol_version = 0;
free_server->rtts[0] = free_server->rtts[1] = free_server->rtts[2] = free_server->rtts[3] = RTT_UNREACHABLE;
}
}
/* all pending entries are processed */
dev->new_servers_num = 0;
mutex_unlock(&dev->device_lock);
}
// measure rtt for all alt servers
for (i = 0; i < NUMBER_SERVERS; i++) {
existing_server = &dev->alt_servers[i];
if (existing_server->host.type) {
/* NOTE(review): dnbd3_meassure_rtt() also increments failures when it fails */
if (dnbd3_meassure_rtt(dev, existing_server) <= 0) {
existing_server->failures++;
warn_server(dev, existing_server, "failed to meassure rtt");
}
}
}
dnbd3_adjust_connections(dev);
/* discovery_count selects the rtts slot that the next measurement overwrites */
dev->discovery_count++;
}
static int __dnbd3_socket_connect(struct dnbd3_server *server, struct dnbd3_sock *sock)
{
int result = 0;
struct timeval timeout;
if (server->host.port == 0 || server->host.type == 0) {
error_sock(sock, "host or port not set");
return -EIO;
}
if (sock->sock) {
warn_sock(sock, "already connected");
return -EIO;
}
timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
timeout.tv_usec = 0;
if ((result = dnbd3_sock_create(server->host.type, SOCK_STREAM, IPPROTO_TCP, &sock->sock)) < 0) {
error_sock(sock, "could not create socket");
goto error;
}
kernel_setsockopt(sock->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
kernel_setsockopt(sock->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
sock->sock->sk->sk_allocation = GFP_NOIO;
if (server->host.type == HOST_IP4) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
memcpy(&(sin.sin_addr), server->host.addr, 4);
sin.sin_port = server->host.port;
if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0) {
error_sock(sock, "connection to host failed");
goto error;
}
} else {
struct sockaddr_in6 sin;
memset(&sin, 0, sizeof(sin));
sin.sin6_family = AF_INET6;
memcpy(&(sin.sin6_addr), server->host.addr, 16);
sin.sin6_port = server->host.port;
if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0){
error_sock(sock, "connection to host failed");
goto error;
}
}
return 0;
error:
if (sock->sock) {
sock_release(sock->sock);
sock->sock = NULL;
}
return result;
}
/**
 * dnbd3_socket_connect - connect a dnbd3 device to a server
 * @dev: device to connect
 * @server: server to connect to
 *
 * Takes the first socket slot of @dev that is not alive, connects it to
 * @server and performs the CMD_SELECT_IMAGE handshake. On success the receive
 * worker of the socket is started, the number of hardware queues is set to
 * the number of connected sockets and CMD_GET_SERVERS is requested. The first
 * connected socket also starts the device timer and sets up the discovery and
 * panic workers.
 *
 * On failure server->failures is incremented once, the socket is closed and
 * the slot no longer references @server, so the server is not mistaken for a
 * connected one later on.
 *
 * Return: 0 on success, -EIO on failure.
 */
static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server)
{
	int i;
	int sock_alive = 0;
	int result = -EIO;
	dnbd3_reply_t reply;
	struct dnbd3_sock *sock = NULL;

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (!dnbd3_is_sock_alive(dev->socks[i])) {
			sock = &dev->socks[i];
			break;
		}
	}
	if (sock == NULL) {
		warn_server(dev, server, "could not connect to socket, to many connections");
		return -EIO;
	}
	sock->server = server;
	debug_sock(sock, "socket connect");
	mutex_init(&sock->tx_lock);
	mutex_lock(&sock->tx_lock);
	result = __dnbd3_socket_connect(server, sock);
	mutex_unlock(&sock->tx_lock);
	sock->panic = false;
	if (!sock->sock) {
		error_sock(sock, "socket is not connected");
		result = -EIO;
		goto error; /* the failure is counted at the error label */
	}
	result = dnbd3_send_request_cmd(sock, CMD_SELECT_IMAGE);
	if (result <= 0) {
		error_sock(sock, "connection to image %s failed", dev->imgname);
		result = -EIO;
		goto error;
	}
	result = dnbd3_receive_cmd(sock, &reply);
	if (result <= 0) {
		error_sock(sock, "receive cmd to image %s failed", dev->imgname);
		result = -EIO;
		goto error;
	}
	if (reply.magic != dnbd3_packet_magic || reply.cmd != CMD_SELECT_IMAGE || reply.size < 4) {
		error_sock(sock, "receive select image wrong header %s", dev->imgname);
		result = -EIO;
		goto error;
	}
	result = dnbd3_receive_cmd_select_image(dev, sock, &reply);
	if (result <= 0) {
		error_sock(sock, "receive cmd select image %s failed", dev->imgname);
		result = -EIO;
		goto error;
	}
	debug_sock(sock, "connected to image %s, filesize %llu", dev->imgname, dev->reported_size);
	// start the receiver
	INIT_WORK(&sock->receive_worker, dnbd3_receive_worker);
	queue_work(dnbd3_wq, &sock->receive_worker);
	INIT_WORK(&sock->keepalive_worker, dnbd3_keepalive_worker);
	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (dev->socks[i].sock && dev->socks[i].server) {
			sock_alive++;
		}
	}
	if (sock_alive == 1) { // first socket to connect, start timer and workers
		debug_sock(sock, "first connection to server, starting workers");
		INIT_WORK(&dev->discovery_worker, dnbd3_discovery_worker);
		INIT_WORK(&dev->panic_worker, dnbd3_panic_worker);
		timer_setup(&dev->timer, dnbd3_timer, 0);
		dev->timer.expires = jiffies + HZ;
		add_timer(&dev->timer);
	}
	blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive);
	// request alternative servers receiver will handle this
	if (dnbd3_send_request_cmd(sock, CMD_GET_SERVERS) <= 0) {
		error_sock(sock, "failed to get servers in discovery");
	}
	return 0;
error:
	server->failures++;
	if (sock->sock) {
		kernel_sock_shutdown(sock->sock, SHUT_RDWR);
		/*
		 * NOTE(review): no receive worker was queued by this attempt;
		 * confirm receive_worker is always initialised before this
		 * can run on a fresh slot.
		 */
		cancel_work_sync(&sock->receive_worker);
		sock_release(sock->sock);
		sock->sock = NULL;
	}
	/* the slot is unused again, do not leave it pointing at the server */
	sock->server = NULL;
	mutex_destroy(&sock->tx_lock);
	return result;
}
/*
 * Disconnect one socket of a device.
 *
 * dev:    device the socket belongs to
 * server: only used when sock is NULL; the socket slot referencing this
 *         server is looked up
 * sock:   socket to disconnect, or NULL to look it up via server
 *
 * Returns 0 on success, -EIO if no socket with an open kernel socket was
 * found.
 */
static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock)
{
int i;
int sock_alive = 0;
/* look up the socket by server if none was given, and count the alive sockets */
for (i = 0; i < NUMBER_CONNECTIONS; i++) {
if (sock == NULL && dev->socks[i].server == server) {
sock = &dev->socks[i];
}
if (dnbd3_is_sock_alive(dev->socks[i])) {
sock_alive++;
}
}
if (!sock || !sock->sock) {
warn_dev(dev, "could not find socket to disconnect");
return -EIO;
}
/* NOTE(review): sock_alive - 1 is 0 or negative when at most one socket is alive - confirm this is intended */
blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive - 1);
if (sock_alive <= 1) {
info_sock(sock, "shutting down last socket and stopping timer");
del_timer_sync(&dev->timer);
// dev->timer_count = 0;
// dev->discovery_count = 0;
// cancel_work_sync(&dev->discovery_worker); // do not wait
// cancel_work_sync(&dev->panic_worker); // do not wait for panic_worker, probably we are called from panic_worker
}
/* make sure no keepalive is being sent while the socket goes down */
cancel_work_sync(&sock->keepalive_worker);
debug_sock(sock, "socket disconnect");
mutex_lock(&sock->tx_lock);
/*
* Important sequence to shut down socket
* 1. kernel_sock_shutdown
* socket shutdown, receiver which hangs in kernel_recvmsg returns 0
* 2. cancel_work_sync(receiver)
* wait for the receiver to finish, so the socket is not used anymore
* 3. sock_release
* release the socket and set to NULL
*/
if (sock->sock) {
kernel_sock_shutdown(sock->sock, SHUT_RDWR);
}
mutex_unlock(&sock->tx_lock);
/* tx_lock is initialised again by the next dnbd3_socket_connect() on this slot */
mutex_destroy(&sock->tx_lock);
cancel_work_sync(&sock->receive_worker);
if (sock->sock) {
sock_release(sock->sock);
sock->sock = NULL;
}
/* mark the slot as unused */
sock->server = NULL;
sock->panic = false;
return 0;
}
/**
 * dnbd3_net_disconnect - stop the timer and workers and disconnect all sockets of a device
 * @dev: device to disconnect
 *
 * Return: 0 if every open socket was disconnected, -EIO if at least one
 * dnbd3_socket_disconnect() failed.
 */
int dnbd3_net_disconnect(struct dnbd3_device *dev)
{
	int status = 0;
	int n;

	del_timer_sync(&dev->timer);
	cancel_work_sync(&dev->discovery_worker);
	cancel_work_sync(&dev->panic_worker); // be sure it does not recover while disconnecting

	for (n = 0; n < NUMBER_CONNECTIONS; n++) {
		if (!dev->socks[n].sock) {
			continue;
		}
		if (dnbd3_socket_disconnect(dev, NULL, &dev->socks[n]) != 0) {
			status = -EIO;
		}
	}
	return status;
}
/**
 * dnbd3_net_connect - establish the first connection of a device
 * @dev: device to connect
 *
 * Connects to dev->alt_servers[0] and prints the server list on success. On
 * failure dev->imgname and the server reference of the first socket slot are
 * reset to NULL.
 *
 * Return: 0 on success, -ENOENT if the connection failed.
 */
int dnbd3_net_connect(struct dnbd3_device *dev)
{
	debug_dev(dev, "connecting to server");

	// alt_server[0] is the initial server
	if (dnbd3_socket_connect(dev, &dev->alt_servers[0]) != 0) {
		error_dev(dev, "failed to connect to initial server");
		dev->imgname = NULL;
		dev->socks[0].server = NULL;
		return -ENOENT;
	}

	dnbd3_print_server_list(dev);
	return 0;
}