/* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2019 Frederic Robra * Parts copyright 2011-2012 Johann Latocha * * This file may be licensed under the terms of of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See the GPL for the specific language * governing rights and limitations. * * You should have received a copy of the GPL along with this * program. If not, go to http://www.gnu.org/licenses/gpl.html * or write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ #include #include #include "net.h" #include "utils.h" #include "clientconfig.h" #define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN #define DNBD3_REQ_OP_CONNECT REQ_OP_DRV_OUT #define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = DNBD3_REQ_OP_SPECIAL | ((cmd) << REQ_FLAG_BITS) #define dnbd3_connect(req) (req)->cmd_flags = DNBD3_REQ_OP_CONNECT | ((CMD_SELECT_IMAGE) << REQ_FLAG_BITS) #define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS) #define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock) #define REQUEST_TIMEOUT (HZ * SOCKET_TIMEOUT_CLIENT_DATA) #define init_msghdr(h) do { \ h.msg_name = NULL; \ h.msg_namelen = 0; \ h.msg_control = NULL; \ h.msg_controllen = 0; \ h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \ } while (0) static DECLARE_WAIT_QUEUE_HEAD(send_wq); static atomic64_t send_wq_signal; static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server); static int __dnbd3_socket_connect(struct dnbd3_server * server, struct dnbd3_sock *sock); static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock); static void dnbd3_print_server_list(struct dnbd3_device *dev) { int i; print_server(KERN_INFO, dev, &dev->initial_server, "initial server is"); for (i = 0; i < NUMBER_SERVERS; i++) { if (dev->alt_servers[i].host.addr[0] != 0) { print_server(KERN_INFO, dev, &dev->alt_servers[i], "alternative server is"); } } } static inline uint64_t dnbd3_to_wq_signal(int minor, uint16_t dnbd3_cmd, uint16_t sock_nr) { return ((uint64_t) minor << 32) | ((uint32_t) dnbd3_cmd << 16) | sock_nr; } static uint64_t dnbd3_to_handle(uint32_t arg0, uint32_t arg1) { return ((uint64_t) arg0 << 32) | arg1; } static uint32_t dnbd3_arg0_from_handle(uint64_t handle) { return (uint32_t)(handle >> 32); } static uint32_t dnbd3_arg1_from_handle(uint64_t handle) { return (uint32_t) handle; } int dnbd3_send_request(struct dnbd3_sock *sock, struct request *req, struct dnbd3_cmd *cmd) { dnbd3_request_t dnbd3_request; struct msghdr msg; struct kvec iov[2]; size_t iov_num = 1; size_t send_len; int result; uint32_t tag; uint64_t handle; serialized_buffer_t payload_buffer; sock->pending = req; init_msghdr(msg); dnbd3_request.magic = dnbd3_packet_magic; switch (req_op(req)) { case REQ_OP_READ: debug_sock(sock, "request operation read"); dnbd3_request.cmd = CMD_GET_BLOCK; dnbd3_request.offset = blk_rq_pos(req) << 9; // *512 dnbd3_request.size = blk_rq_bytes(req); // bytes left to complete entire request break; case DNBD3_REQ_OP_SPECIAL: debug_sock(sock, "request operation special"); dnbd3_request.cmd = dnbd3_priv_to_cmd(req); dnbd3_request.size = 0; break; case DNBD3_REQ_OP_CONNECT: debug_sock(sock, "request operation connect to %s", sock->device->imgname); dnbd3_request.cmd = CMD_SELECT_IMAGE; serializer_reset_write(&payload_buffer); serializer_put_uint16(&payload_buffer, PROTOCOL_VERSION); serializer_put_string(&payload_buffer, sock->device->imgname); serializer_put_uint16(&payload_buffer, sock->device->rid); serializer_put_uint8(&payload_buffer, 0); // is_server = false iov[1].iov_base = &payload_buffer; dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&payload_buffer); iov_num = 2; break; default: return -EIO; } sock->cookie++; if (cmd != NULL) { cmd->cookie = sock->cookie; tag = blk_mq_unique_tag(req); handle = dnbd3_to_handle(tag, sock->cookie);// ((uint64_t) tag << 32) | sock->cookie; } else { handle = sock->cookie; } memcpy(&dnbd3_request.handle, &handle, sizeof(handle)); fixup_request(dnbd3_request); iov[0].iov_base = &dnbd3_request; iov[0].iov_len = sizeof(dnbd3_request); send_len = iov_num == 1 ? sizeof(dnbd3_request) : iov[0].iov_len + iov[1].iov_len; if ((result = kernel_sendmsg(sock->sock, &msg, iov, iov_num, send_len)) != send_len) { error_sock(sock, "connection to server lost"); goto error; } sock->pending = NULL; result = 0; error: return result; } int dnbd3_send_request_blocking(struct dnbd3_sock *sock, int dnbd3_cmd) { int result = 0; uint64_t handle; struct request *req = kmalloc(sizeof(struct request), GFP_KERNEL); debug_sock(sock, "request starting blocking request"); if (!req) { error_sock(sock, "kmalloc failed"); goto error; } switch (dnbd3_cmd) { case CMD_KEEPALIVE: case CMD_GET_SERVERS: dnbd3_cmd_to_priv(req, dnbd3_cmd); break; case CMD_SELECT_IMAGE: dnbd3_connect(req); break; default: warn_sock(sock, "unsupported command for blocking %d", dnbd3_cmd); result = -EINVAL; goto error; } mutex_lock(&sock->lock); result = dnbd3_send_request(sock, req, NULL); if (result) { mutex_unlock(&sock->lock); goto error; } atomic64_set(&send_wq_signal, 0); handle = dnbd3_to_wq_signal(sock->device->minor, dnbd3_cmd, sock->sock_nr); mutex_unlock(&sock->lock); if (wait_event_interruptible_timeout(send_wq, atomic64_read(&send_wq_signal) == handle, REQUEST_TIMEOUT) <= 0) { // timeout or interrupt warn_sock(sock, "request timed out, cmd %d", dnbd3_cmd); result = -EIO; goto error; } error: if (req) { kfree(req); } return result; } static int dnbd3_receive_cmd(struct dnbd3_sock *sock, dnbd3_reply_t *reply) { int result; struct msghdr msg; struct kvec iov; init_msghdr(msg); iov.iov_base = reply; iov.iov_len = sizeof(dnbd3_reply_t); result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (result <= 0) { return result; } fixup_reply(dnbd3_reply); // check error if (reply->magic != dnbd3_packet_magic) { error_sock(sock, "receive cmd wrong magic packet"); return -EIO; } if (reply->cmd == 0) { error_sock(sock, "receive command was 0"); return -EIO; } return result; } static int dnbd3_receive_cmd_get_block_mq(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply) { struct dnbd3_cmd *cmd; struct msghdr msg; struct request *req = NULL; struct kvec iov; struct req_iterator iter; struct bio_vec bvec_inst; struct bio_vec *bvec = &bvec_inst; sigset_t blocked, oldset; void *kaddr; uint32_t tag, cookie; uint16_t hwq; int result = 0; uint64_t handle; init_msghdr(msg); memcpy(&handle, &reply->handle, sizeof(handle)); cookie = dnbd3_arg1_from_handle(handle); tag = dnbd3_arg0_from_handle(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < dev->tag_set.nr_hw_queues) { req = blk_mq_tag_to_rq(dev->tag_set.tags[hwq], blk_mq_unique_tag_to_tag(tag)); } if (!req || !blk_mq_request_started(req)) { dev_err(disk_to_dev(dev->disk), "unexpected reply (%d) %p\n", tag, req); return -EIO; } cmd = blk_mq_rq_to_pdu(req); mutex_lock(&cmd->lock); if (cmd->cookie != cookie) { dev_err(disk_to_dev(dev->disk), "double reply on req %p, cookie %u, handle cookie %u\n", req, cmd->cookie, cookie); mutex_unlock(&cmd->lock); return -EIO; } rq_for_each_segment(bvec_inst, req, iter) { siginitsetinv(&blocked, sigmask(SIGKILL)); sigprocmask(SIG_SETMASK, &blocked, &oldset); kaddr = kmap(bvec->bv_page) + bvec->bv_offset; iov.iov_base = kaddr; iov.iov_len = bvec->bv_len; result = kernel_recvmsg(sock->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags); if (result != bvec->bv_len) { kunmap(bvec->bv_page); sigprocmask(SIG_SETMASK, &oldset, NULL ); error_sock(sock, "could not receive form net to block layer"); mutex_unlock(&cmd->lock); return result; } kunmap(bvec->bv_page); sigprocmask(SIG_SETMASK, &oldset, NULL ); } mutex_unlock(&cmd->lock); blk_mq_end_request(req, 0); return result; } static int dnbd3_receive_cmd_get_servers(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply) { struct msghdr msg; struct kvec iov; /* return true if did not receive servers, not an error*/ int result = 1; int count, remaining; init_msghdr(msg); debug_sock(sock, "get servers received"); mutex_lock(&dev->device_lock); if (!dev->use_server_provided_alts) { remaining = reply->size; goto consume_payload; } dev->new_servers_num = 0; count = MIN(NUMBER_SERVERS, reply->size / sizeof(dnbd3_server_entry_t)); if (count != 0) { iov.iov_base = dev->new_servers; iov.iov_len = count * sizeof(dnbd3_server_entry_t); result = kernel_recvmsg(sock->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags); if (result <= 0) { error_sock(sock, "failed to receive get servers %d", result); return result; } else if (result != (count * sizeof(dnbd3_server_entry_t))) { error_sock(sock, "failed to get servers"); mutex_unlock(&dev->device_lock); return -EIO; } dev->new_servers_num = count; } // If there were more servers than accepted, remove the remaining data from the socket buffer remaining = reply->size - (count * sizeof(dnbd3_server_entry_t)); consume_payload: while (remaining > 0) { count = MIN(sizeof(dnbd3_reply_t), remaining); // Abuse the reply struct as the receive buffer iov.iov_base = reply; iov.iov_len = count; result = kernel_recvmsg(sock->sock, &msg, &iov, 1, count, msg.msg_flags); if (result <= 0) { error_sock(sock, "failed to receive payload from get servers"); mutex_unlock(&dev->device_lock); return result; } } mutex_unlock(&dev->device_lock); return result; } static int dnbd3_receive_cmd_latest_rid(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply) { struct kvec iov; uint16_t rid; int result; struct msghdr msg; init_msghdr(msg); debug_sock(sock, "latest rid received"); if (reply->size != 2) { error_sock(sock, "failed to get latest rid, wrong size"); return -EIO; } iov.iov_base = &rid; iov.iov_len = sizeof(rid); result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (result <= 0) { error_sock(sock, "failed to receive latest rid"); return result; } rid = net_order_16(rid); debug_sock(sock, "latest rid of %s is %d (currently using %d)", dev->imgname, (int)rid, (int)dev->rid); dev->update_available = (rid > dev->rid ? 1 : 0); return result; } static int dnbd3_receive_cmd_select_image(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply) { struct kvec iov; uint16_t rid; char *name; int result; struct msghdr msg; serialized_buffer_t payload_buffer; uint64_t reported_size; init_msghdr(msg); debug_sock(sock, "select image received"); // receive reply payload iov.iov_base = &payload_buffer; iov.iov_len = reply->size; result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); if (result <= 0) { error_sock(sock, "failed to receive select image %d", result); return result; } else if (result != reply->size) { error_sock(sock, "could not read CMD_SELECT_IMAGE payload on handshake, size is %d and should be %d", result, reply->size); return -EIO; } // handle/check reply payload serializer_reset_read(&payload_buffer, reply->size); sock->server->protocol_version = serializer_get_uint16(&payload_buffer); if (sock->server->protocol_version < MIN_SUPPORTED_SERVER) { error_sock(sock, "server version is lower than min supported version"); return -EIO; } //TODO compare RID name = serializer_get_string(&payload_buffer); rid = serializer_get_uint16(&payload_buffer); if (dev->rid != rid && strcmp(name, dev->imgname) != 0) { error_sock(sock, "server offers image '%s', requested '%s'", name, dev->imgname); return -EIO; } reported_size = serializer_get_uint64(&payload_buffer); if (!dev->reported_size) { if (reported_size < 4096) { error_sock(sock, "reported size by server is < 4096"); return -EIO; } dev->reported_size = reported_size; set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */ } else if (dev->reported_size != reported_size) { error_sock(sock, "reported size by server is %llu but should be %llu", reported_size, dev->reported_size); return -EIO; } return result; } static void dnbd3_receive_worker(struct work_struct *work) { struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, receive_worker); struct dnbd3_device *dev = sock->device; dnbd3_reply_t dnbd3_reply; uint64_t handle; int result; while(1) { // loop until socket returns 0 result = dnbd3_receive_cmd(sock, &dnbd3_reply); if (result == -EAGAIN) { continue; } else if (result <= 0) { error_sock(sock, "connection to server lost %d", result); goto error; } switch (dnbd3_reply.cmd) { case CMD_GET_BLOCK: result = dnbd3_receive_cmd_get_block_mq(dev, sock, &dnbd3_reply); if (result <= 0) { error_sock(sock, "receive cmd get block mq failed %d", result); goto error; } continue; // we do not need to wake up anyone, wait for next cmd (block) case CMD_GET_SERVERS: result = dnbd3_receive_cmd_get_servers(dev, sock, &dnbd3_reply); if (result <= 0) { error_sock(sock, "receive cmd get servers failed %d", result); goto error; } break; case CMD_LATEST_RID: result = dnbd3_receive_cmd_latest_rid(dev, sock, &dnbd3_reply); if (result <= 0) { error_sock(sock, "receive cmd latest rid failed %d", result); goto error; } break; case CMD_KEEPALIVE: if (dnbd3_reply.size != 0) { error_sock(sock, "got keep alive packet with payload"); goto error; } debug_sock(sock, "keep alive received"); break; case CMD_SELECT_IMAGE: result = dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply); if (result <= 0) { error_sock(sock, "receive cmd select image failed %d", result); goto error; } break; default: warn_sock(sock, "unknown command eeceived"); break; } error: handle = dnbd3_to_wq_signal(dev->minor, dnbd3_reply.cmd, sock->sock_nr); atomic64_set(&send_wq_signal, handle); wake_up_interruptible(&send_wq); if (result == 0) { info_sock(sock, "result is 0, socket seems to be down"); sock->panic = 1; break; //the socket seems to be down } else if (result < 0) { sock->server->failures++; // discovery takes care of to many failures warn_sock(sock, "receive error happened %d, total failures %d", result, sock->server->failures); } debug_sock(sock, "receive completed, waiting for next receive"); } debug_sock(sock, "receive work queue is stopped"); } static void dnbd3_timer(struct timer_list *arg) { struct dnbd3_device *dev = container_of(arg, struct dnbd3_device, timer); int i; queue_work(dnbd3_wq, &dev->panic_worker); if (dev->timer_count % TIMER_INTERVAL_KEEPALIVE_PACKET == 0) { for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { queue_work(dnbd3_wq, &dev->socks[i].keepalive_worker); } } } if (dev->timer_count % TIMER_INTERVAL_PROBE_NORMAL == 0) { queue_work(dnbd3_wq, &dev->discovery_worker); } dev->timer_count++; dev->timer.expires = jiffies + HZ; add_timer(&dev->timer); } static void dnbd3_keepalive_worker(struct work_struct *work) { struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, keepalive_worker); debug_sock(sock, "starting keepalive worker"); dnbd3_send_request_blocking(sock, CMD_KEEPALIVE); } static struct dnbd3_server *dnbd3_find_best_alt_server(struct dnbd3_device *dev) { int i, j; uint64_t rtt = 0; uint64_t best_rtt = RTT_UNREACHABLE; uint64_t current_best_rtt = RTT_UNREACHABLE; struct dnbd3_server *best_alt_server = NULL; struct dnbd3_server *better_alt_server = NULL; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { rtt = (dev->socks[i].server->rtts[0] + dev->socks[i].server->rtts[1] + dev->socks[i].server->rtts[2] + dev->socks[i].server->rtts[3]) / 4; if (rtt <= current_best_rtt) { current_best_rtt = rtt; } } } best_rtt = current_best_rtt * 10; // TODO add DEFINE to control this debug_dev(dev, "best connected rtt is %llu, searching for rtt better than %llu", current_best_rtt, best_rtt); for (i = 0; i < NUMBER_SERVERS; i++) { if (dev->alt_servers[i].host.type != 0) { rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4; if (rtt <= best_rtt) { better_alt_server = &dev->alt_servers[i]; for (j = 0; j < NUMBER_CONNECTIONS; j++) { // check if already connected if (better_alt_server == dev->socks[j].server) { better_alt_server = NULL; // found already connected server break; } } if (better_alt_server) { best_alt_server = better_alt_server; best_rtt = rtt; } } } } if (best_alt_server) { print_server(KERN_INFO, dev, best_alt_server, "found best alt server with rtt %llu", rtt); } else { debug_dev(dev, "did not find any alternative server"); } return best_alt_server; } static bool dnbd3_better_rtt(struct dnbd3_server *new_server, struct dnbd3_server *existing_server) { uint64_t new_rtt = (new_server->rtts[0] + new_server->rtts[1] + new_server->rtts[2] + new_server->rtts[3]) / 4; uint64_t existing_rtt = (existing_server->rtts[0] + existing_server->rtts[1] + existing_server->rtts[2] + existing_server->rtts[3]) / 4; if (((new_rtt * 2)/3) < existing_rtt) { return true; } return false; } static void dnbd3_panic_worker(struct work_struct *work) { struct dnbd3_device *dev = container_of(work, struct dnbd3_device, panic_worker); struct dnbd3_sock *panicked_sock = NULL; struct dnbd3_server *new_server, *panicked_server; int i; int sock_alive = 0; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dev->socks[i].panic) { panicked_sock = &dev->socks[i]; } else if (dnbd3_is_sock_alive(dev->socks[i])) { sock_alive++; } } if (panicked_sock) { warn_sock(panicked_sock, "panicked, connections still alive %d", sock_alive); panicked_server = panicked_sock->server; new_server = dnbd3_find_best_alt_server(dev); dnbd3_socket_disconnect(dev, panicked_server, panicked_sock); if (new_server != NULL && new_server != panicked_server) { print_server(KERN_INFO, dev, new_server, "found replacement"); dnbd3_socket_connect(dev, new_server); } else if (sock_alive > 0) { info_dev(dev, "found no replacement server but still connected to %d servers", sock_alive); } else { error_dev(dev, "could not reconnect to server"); } } else if (sock_alive == 0) { new_server = dnbd3_find_best_alt_server(dev); if (new_server != NULL) { print_server(KERN_INFO, dev, new_server, "reconnect to server"); dnbd3_socket_connect(dev, new_server); } else { error_dev(dev, "could not reconnect to server"); } } } static void dnbd3_discovery_worker(struct work_struct *work) { struct dnbd3_device *dev = container_of(work, struct dnbd3_device, discovery_worker); struct dnbd3_sock *sock = &dev->socks[dev->discovery_count % NUMBER_CONNECTIONS]; // just take the next int i, j; struct dnbd3_server *existing_server, *free_server, *failed_server; dnbd3_server_entry_t *new_server; struct kvec iov; struct timeval start, end; dnbd3_request_t dnbd3_request; dnbd3_reply_t dnbd3_reply; struct msghdr msg; char *buf; struct request *req = NULL; uint64_t rtt; serialized_buffer_t *payload; if (!dnbd3_is_sock_alive(*sock)) { sock = NULL; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { sock = &dev->socks[i]; } } if (!sock) { error_dev(dev, "discovery failed, no socket available"); } } debug_sock(sock, "starting discovery worker"); dnbd3_send_request_blocking(sock, CMD_GET_SERVERS); debug_sock(sock, "new server num is %d", dev->new_servers_num); if (dev->new_servers_num) { mutex_lock(&dev->device_lock); for (i = 0; i < dev->new_servers_num; i++) { new_server = &dev->new_servers[i]; if (new_server->host.type == HOST_IP4 || new_server->host.type == HOST_IP6) { existing_server = NULL; free_server = NULL; failed_server = NULL; // find servers in alt servers for (j = 0; j < NUMBER_SERVERS; j++) { if ((new_server->host.type == dev->alt_servers[j].host.type) && (new_server->host.port == dev->alt_servers[j].host.port) && (0 == memcmp(new_server->host.addr, dev->alt_servers[j].host.addr, (new_server->host.type == HOST_IP4 ? 4 : 16)))) { existing_server = &dev->alt_servers[j]; } else if (dev->alt_servers[j].host.type == 0) { free_server = &dev->alt_servers[j]; } else if (dev->alt_servers[j].failures > 20) { failed_server = &dev->alt_servers[j]; } } if (existing_server) { if (new_server->failures == 1) { // remove is requested print_server(KERN_INFO, dev, new_server, "remove server is requested"); dnbd3_socket_disconnect(dev, existing_server, NULL); // TODO what to do when only one connection? existing_server->host.type = 0; } // existing_server->failures = 0; // reset failure count continue; } else if (free_server) { free_server->host = new_server->host; } else if (failed_server) { failed_server->host = new_server->host; free_server = failed_server; } else { //no server found to replace continue; } print_server(KERN_INFO, dev, free_server, "got new alt server"); free_server->failures = 0; free_server->protocol_version = 0; free_server->rtts[0] = free_server->rtts[1] = free_server->rtts[2] = free_server->rtts[3] = RTT_UNREACHABLE; } } dev->new_servers_num = 0; mutex_unlock(&dev->device_lock); } buf = kmalloc(RTT_BLOCK_SIZE, GFP_KERNEL); if (!buf) { error_dev(dev, "kmalloc failed"); goto error; } payload = (serialized_buffer_t *)buf; req = kmalloc(sizeof(struct request), GFP_KERNEL); if (!req) { error_dev(dev, "kmalloc failed"); goto error; } sock = kmalloc(sizeof(struct dnbd3_sock), GFP_KERNEL); if (!sock) { error_dev(dev, "kmalloc failed"); goto error; } sock->sock_nr = NUMBER_CONNECTIONS; // measure rtt for all alt servers for (i = 0; i < NUMBER_SERVERS; i++) { existing_server = &dev->alt_servers[i]; if (existing_server->host.type) { sock->sock = NULL; sock->device = dev; sock->server = existing_server; if (__dnbd3_socket_connect(existing_server, sock)) { error_sock(sock, "socket connect failed in rtt measurement"); goto rtt_error; } dnbd3_connect(req); if (dnbd3_send_request(sock, req, NULL)) { error_sock(sock, "request select image failed in rtt measurement"); goto rtt_error; } if (dnbd3_receive_cmd(sock, &dnbd3_reply) <= 0) { error_sock(sock, "receive select image failed in rtt measurement"); goto rtt_error; } if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4) { error_sock(sock, "receive select image wrong header in rtt measurement"); goto rtt_error; } if (dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply) <= 0) { error_sock(sock, "receive data select image failed in rtt measurement"); goto rtt_error; } // Request block dnbd3_request.magic = dnbd3_packet_magic; dnbd3_request.cmd = CMD_GET_BLOCK; // Do *NOT* pick a random block as it has proven to cause severe // cache thrashing on the server dnbd3_request.offset = 0; dnbd3_request.size = RTT_BLOCK_SIZE; fixup_request(dnbd3_request); iov.iov_base = &dnbd3_request; iov.iov_len = sizeof(dnbd3_request); init_msghdr(msg); // start rtt measurement do_gettimeofday(&start); if (kernel_sendmsg(sock->sock, &msg, &iov, 1, sizeof(dnbd3_request)) <= 0) { error_sock(sock, "request test block failed in rtt measurement"); goto rtt_error; } // receive net reply iov.iov_base = &dnbd3_reply; iov.iov_len = sizeof(dnbd3_reply); if ((j = kernel_recvmsg(sock->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags)) != sizeof(dnbd3_reply)) { error_sock(sock, "receive header test block failed in rtt measurement %d %ld", j, sizeof(dnbd3_reply)); goto rtt_error; } fixup_reply(dnbd3_reply); if (dnbd3_reply.magic != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE) { error_sock(sock, "receive header cmd test block failed in rtt measurement"); goto rtt_error; } // receive data iov.iov_base = buf; iov.iov_len = RTT_BLOCK_SIZE; if (kernel_recvmsg(sock->sock, &msg, &iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE) { error_sock(sock, "receive test block failed in rtt measurement"); goto rtt_error; } do_gettimeofday(&end); // end rtt measurement rtt = (uint64_t)((end.tv_sec - start.tv_sec) * 1000000ull + (end.tv_usec - start.tv_usec)); debug_sock(sock, "new rrt is %llu", rtt); existing_server->rtts[dev->discovery_count % 4] = rtt; rtt_error: if (sock->sock) { kernel_sock_shutdown(sock->sock, SHUT_RDWR); sock->server = NULL; } if (sock->sock) { sock_release(sock->sock); sock->sock = NULL; } } } error: if (buf) { kfree(buf); buf = NULL; } if (req) { kfree(req); req = NULL; } if (sock) { kfree(sock); sock = NULL; } // connect empty sockets j = 0; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (!dnbd3_is_sock_alive(dev->socks[i])) { free_server = dnbd3_find_best_alt_server(dev); if (free_server) { if (dnbd3_socket_connect(dev, free_server) == 0) { j++; } else { print_server(KERN_WARNING, dev, free_server, "failed to connect"); } } } else { j++; } } // replace socket with better server if (j == NUMBER_CONNECTIONS) { for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { free_server = dnbd3_find_best_alt_server(dev); if (free_server && dnbd3_better_rtt(free_server, dev->socks[i].server)) { dnbd3_socket_disconnect(dev, NULL, &dev->socks[i]); if (dnbd3_socket_connect(dev, free_server) != 0) { print_server(KERN_WARNING, dev, free_server, "failed to connect"); } } } } } debug_dev(dev, "connected to %d / %d sockets", j, NUMBER_CONNECTIONS); dev->discovery_count++; } static int __dnbd3_socket_connect(struct dnbd3_server *server, struct dnbd3_sock *sock) { int result = 0; struct timeval timeout; if (server->host.port == 0 || server->host.type == 0) { error_sock(sock, "host or port not set"); return -EIO; } if (sock->sock) { warn_sock(sock, "already connected"); return -EIO; } timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA; timeout.tv_usec = 0; if ((result = dnbd3_sock_create(server->host.type, SOCK_STREAM, IPPROTO_TCP, &sock->sock)) < 0) { error_sock(sock, "could not create socket"); goto error; } kernel_setsockopt(sock->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); kernel_setsockopt(sock->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); sock->sock->sk->sk_allocation = GFP_NOIO; if (server->host.type == HOST_IP4) { struct sockaddr_in sin; memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; memcpy(&(sin.sin_addr), server->host.addr, 4); sin.sin_port = server->host.port; if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0) { error_sock(sock, "connection to host failed"); goto error; } } else { struct sockaddr_in6 sin; memset(&sin, 0, sizeof(sin)); sin.sin6_family = AF_INET6; memcpy(&(sin.sin6_addr), server->host.addr, 16); sin.sin6_port = server->host.port; if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0){ error_sock(sock, "connection to host failed"); goto error; } } return 0; error: if (sock->sock) { sock_release(sock->sock); sock->sock = NULL; } return result; } static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server) { int i; int sock_alive = 0; int result = -EIO; struct dnbd3_sock *sock = NULL; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (!dnbd3_is_sock_alive(dev->socks[i])) { sock = &dev->socks[i]; break; } } if (sock == NULL) { print_server(KERN_ERR, dev, server, "could not connect to socket, to many connections"); return -EIO; } sock->server = server; debug_sock(sock, "socket connect"); mutex_init(&sock->lock); mutex_lock(&sock->lock); __dnbd3_socket_connect(server, sock); mutex_unlock(&sock->lock); if (!sock->sock) { error_sock(sock, "socket is not connected"); result = -EIO; goto error; } // start the receiver INIT_WORK(&sock->receive_worker, dnbd3_receive_worker); queue_work(dnbd3_wq, &sock->receive_worker); result = dnbd3_send_request_blocking(sock, CMD_SELECT_IMAGE); if (result) { error_sock(sock, "connection to image %s failed", dev->imgname); goto error; } debug_sock(sock, "connected to image %s, filesize %llu", dev->imgname, dev->reported_size); INIT_WORK(&sock->keepalive_worker, dnbd3_keepalive_worker); for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dev->socks[i].sock && dev->socks[i].server) { sock_alive++; } } if (sock_alive == 1) { // first socket to connect, start timer and workers debug_sock(sock, "first connection to server, starting workers"); INIT_WORK(&dev->discovery_worker, dnbd3_discovery_worker); INIT_WORK(&dev->panic_worker, dnbd3_panic_worker); timer_setup(&dev->timer, dnbd3_timer, 0); dev->timer.expires = jiffies + HZ; add_timer(&dev->timer); } blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive); return 0; error: if (sock->sock) { sock_release(sock->sock); sock->sock = NULL; } return result; } static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock) { int i; int sock_alive = 0; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (sock == NULL && dev->socks[i].server == server) { sock = &dev->socks[i]; } if (dnbd3_is_sock_alive(dev->socks[i])) { sock_alive++; } } if (!sock || !sock->sock) { warn_dev(dev, "could not find socket to disconnect"); return -EIO; } blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive - 1); if (sock_alive <= 1) { info_sock(sock, "shutting down last socket and stopping discovery"); del_timer_sync(&dev->timer); dev->timer_count = 0; dev->discovery_count = 0; cancel_work_sync(&dev->discovery_worker); // cancel_work_sync(&dev->panic_worker); // do not wait for panic_worker, probably we are called from panic_worker } cancel_work_sync(&sock->keepalive_worker); debug_sock(sock, "socket disconnect"); mutex_lock(&sock->lock); /* * Important sequence to shut down socket * 1. kernel_sock_shutdown * socket shutdown, receiver which hangs in kernel_recvmsg returns 0 * 2. cancel_work_sync(receiver) * wait for the receiver to finish, so the socket is not usesd anymore * 3. sock_release * release the socket and set to NULL */ if (sock->sock) { kernel_sock_shutdown(sock->sock, SHUT_RDWR); } mutex_unlock(&sock->lock); mutex_destroy(&sock->lock); cancel_work_sync(&sock->receive_worker); if (sock->sock) { sock_release(sock->sock); sock->sock = NULL; } sock->panic = 0; sock->server = NULL; return 0; } int dnbd3_net_disconnect(struct dnbd3_device *dev) { int i; int result = 0; for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dev->socks[i].sock) { if (dnbd3_socket_disconnect(dev, NULL, &dev->socks[i])) { result = -EIO; } } } return result; } int dnbd3_net_connect(struct dnbd3_device *dev) { int result; debug_dev(dev, "connecting to server"); // alt_server[0] is the initial server if (dnbd3_socket_connect(dev, &dev->alt_servers[0]) == 0) { dnbd3_print_server_list(dev); result = 0; } else { error_dev(dev, "failed to connect to initial server"); result = -ENOENT; dev->imgname = NULL; dev->socks[0].server = NULL; } return result; }