From 23e585f4af267808c2ed0c69430207d7d7716a5b Mon Sep 17 00:00:00 2001 From: Frederic Robra Date: Sun, 14 Jul 2019 20:04:45 +0200 Subject: device now errors if connection get closed --- src/kernel/core.c | 30 ++++++++++-- src/kernel/dnbd3.h | 10 ++-- src/kernel/net.c | 131 ++++++++++++++++++++++++++++++----------------------- 3 files changed, 106 insertions(+), 65 deletions(-) diff --git a/src/kernel/core.c b/src/kernel/core.c index 8f02900..4704b0d 100644 --- a/src/kernel/core.c +++ b/src/kernel/core.c @@ -77,8 +77,10 @@ static int dnbd3_handle_cmd(struct dnbd3_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct dnbd3_device *dev = cmd->dnbd3; - struct dnbd3_sock *sock; + struct dnbd3_sock *sock = NULL; int ret = -1; + int i; + int sock_alive = 0; printk(KERN_DEBUG "dnbd3: handle request at position %lu and size %d, device %i\n", blk_rq_pos(req), blk_rq_bytes(req), dev->minor); @@ -89,11 +91,24 @@ static int dnbd3_handle_cmd(struct dnbd3_cmd *cmd, int index) // return -EINVAL; // } - sock = &dev->socks[index]; - if (!sock->sock) { + for (i = 0; i < NUMBER_CONNECTIONS; i++) { + if (dev->socks[i].sock && dev->socks[i].server) { + if (index == sock_alive) { + sock = &dev->socks[index]; + } + sock_alive++; + } + } + + if (!sock) { printk(KERN_INFO "dnbd3: index is %d but no socket was found\n", index); dev_err_ratelimited(disk_to_dev(dev->disk), "attempted send on invalid socket\n"); - blk_mq_start_request(req); + if (sock_alive > 0) { + blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive); + blk_mq_start_request(req); + return -EINVAL; + } + blk_mq_end_request(req, BLK_STS_IOERR); return -EINVAL; } @@ -188,6 +203,7 @@ static enum blk_eh_timer_return dnbd3_xmit_timeout(struct request *req, bool res dev_err_ratelimited(disk_to_dev(dev->disk), "connection timed out\n"); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); + blk_mq_end_request(req, BLK_STS_TIMEOUT); return BLK_EH_DONE; } @@ -292,6 +308,8 @@ static int dnbd3_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd kfree(dev->imgname); dev->imgname = NULL; } + dev->rid = 0; + dev->reported_size = 0; break; case IOCTL_SWITCH: @@ -506,6 +524,10 @@ static void dnbd3_dev_remove(struct dnbd3_device *dnbd3) dnbd3_net_disconnect(dnbd3); disk->private_data = NULL; put_disk(disk); + if (dnbd3->imgname) { + kfree(dnbd3->imgname); + dnbd3->imgname = NULL; + } } mutex_destroy(&dnbd3->device_lock); } diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h index ae00cca..feb09ae 100644 --- a/src/kernel/dnbd3.h +++ b/src/kernel/dnbd3.h @@ -50,13 +50,12 @@ struct dnbd3_sock { struct mutex lock; struct request *pending; struct dnbd3_server *server; - uint32_t heartbeat_count; uint32_t cookie; uint8_t panic;//, discover, panic_count; struct dnbd3_device *device; - struct work_struct keepalive; + struct work_struct keepalive_worker; struct timer_list keepalive_timer; - struct work_struct receive; + struct work_struct receive_worker; }; struct dnbd3_device { @@ -85,8 +84,9 @@ struct dnbd3_device { uint64_t reported_size; struct work_struct panic_worker; - struct work_struct discovery; // if in irq and need to send request - struct timer_list discovery_timer; + struct work_struct discovery_worker; // if in irq and need to send request + struct timer_list timer; + uint32_t timer_count; }; diff --git a/src/kernel/net.c b/src/kernel/net.c index 0b83a43..e4ab608 100644 --- a/src/kernel/net.c +++ b/src/kernel/net.c @@ -53,7 +53,7 @@ static volatile uint64_t send_wq_signal; //TODO make atomic atomic_64_t static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server); static int __dnbd3_socket_connect(struct dnbd3_server * server, struct dnbd3_sock *sock); -static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock, bool wait_for_receiver); +static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock); static void dnbd3_print_host(struct dnbd3_host_t *host, char *msg) @@ -440,7 +440,7 @@ static int dnbd3_receive_cmd_select_image(struct dnbd3_device *dev, struct dnbd3 } static void dnbd3_receive_worker(struct work_struct *work) { - struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, receive); + struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, receive_worker); struct dnbd3_device *dev = sock->device; dnbd3_reply_t dnbd3_reply; uint64_t handle; @@ -504,6 +504,7 @@ error: wake_up_interruptible(&send_wq); if (result == 0) { printk(KERN_INFO "dnbd3: result is 0, socket seems to be down\n"); + sock->panic = 1; // dnbd3_socket_disconnect(dev, NULL, sock, false);//TODO use panic or something or start worker to reconnect? break; //the socket seems to be down } else if (result < 0) { @@ -516,28 +517,36 @@ error: } -void dnbd3_keepalive_timer(struct timer_list *arg) +static void dnbd3_timer(struct timer_list *arg) { - struct dnbd3_sock *sock = container_of(arg, struct dnbd3_sock, keepalive_timer); - queue_work(dnbd3_wq, &sock->keepalive); - sock->keepalive_timer.expires = KEEPALIVE_TIMER; - add_timer(&sock->keepalive_timer); + struct dnbd3_device *dev = container_of(arg, struct dnbd3_device, timer); + int i; + + queue_work(dnbd3_wq, &dev->panic_worker); + + if (dev->timer_count % TIMER_INTERVAL_KEEPALIVE_PACKET == 0) { + for (i = 0; i < NUMBER_CONNECTIONS; i++) { + if (dev->socks[i].sock && dev->socks[i].server) { + queue_work(dnbd3_wq, &dev->socks[i].keepalive_worker); + } + } + } + if (dev->timer_count % TIMER_INTERVAL_PROBE_NORMAL == 0) { + queue_work(dnbd3_wq, &dev->discovery_worker); + } + + + dev->timer_count++; + dev->timer.expires = jiffies + HZ; + add_timer(&dev->timer); } + static void dnbd3_keepalive_worker(struct work_struct *work) { - struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, keepalive); + struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, keepalive_worker); printk(KERN_DEBUG "dnbd3: starting keepalive worker\n"); dnbd3_send_request_blocking(sock, CMD_KEEPALIVE); - sock->heartbeat_count++; -} - -static void dnbd3_discovery_timer(struct timer_list *arg) -{ - struct dnbd3_device *dev = container_of(arg, struct dnbd3_device, discovery_timer); - queue_work(dnbd3_wq, &dev->discovery); - dev->discovery_timer.expires = DISCOVERY_TIMER; - add_timer(&dev->discovery_timer); } static struct dnbd3_server *dnbd3_find_best_alt_server(struct dnbd3_device *dev) { @@ -578,24 +587,30 @@ static void dnbd3_panic_worker(struct work_struct *work) if (panicked_sock) { printk(KERN_WARNING "dnbd3: socket %d panicked, connections still alive %d\n", panicked_sock->sock_nr, sock_alive); panicked_server = panicked_sock->server; - dnbd3_socket_disconnect(dev, panicked_server, panicked_sock, true); + dnbd3_socket_disconnect(dev, panicked_server, panicked_sock); new_server = dnbd3_find_best_alt_server(dev); if (new_server != NULL && new_server != panicked_server) { - printk(KERN_INFO "dnbd3: found replacement server"); + printk(KERN_INFO "dnbd3: found replacement server\n"); dnbd3_socket_connect(dev, new_server); } else if (sock_alive > 0) { printk(KERN_INFO "dnbd3: found no replacement server but still connected to %d servers\n", sock_alive); } else { printk(KERN_ERR "dnbd3: could not reconnect to server\n"); } + } else if (sock_alive == 0) { + new_server = dnbd3_find_best_alt_server(dev); + if (new_server != NULL) { + printk(KERN_INFO "dnbd3: reconnect to server\n"); + dnbd3_socket_connect(dev, new_server); + } } } static void dnbd3_discovery_worker(struct work_struct *work) { - struct dnbd3_device *dev = container_of(work, struct dnbd3_device, discovery); + struct dnbd3_device *dev = container_of(work, struct dnbd3_device, discovery_worker); struct dnbd3_sock *sock = NULL; int i, j; struct dnbd3_server *existing_server, *free_server, *failed_server; @@ -643,10 +658,10 @@ static void dnbd3_discovery_worker(struct work_struct *work) if (existing_server) { if (new_server->failures == 1) { // remove is requested dnbd3_print_host(&existing_server->host, "remove server"); - dnbd3_socket_disconnect(dev, existing_server, NULL, true); // TODO what to do when only one connection? + dnbd3_socket_disconnect(dev, existing_server, NULL); // TODO what to do when only one connection? existing_server->host.type = 0; } - existing_server->failures = 0; // reset failure count +// existing_server->failures = 0; // reset failure count continue; } else if (free_server) { free_server->host = new_server->host; @@ -682,6 +697,8 @@ static void dnbd3_discovery_worker(struct work_struct *work) printk(KERN_ERR "dnbd3: kmalloc failed\n"); goto error; } + mutex_init(&sock->lock); + mutex_lock(&sock->lock); // measure rtt for all alt servers for (i = 0; i < NUMBER_SERVERS; i++) { existing_server = &dev->alt_servers[i]; @@ -689,8 +706,6 @@ static void dnbd3_discovery_worker(struct work_struct *work) sock->sock = NULL; sock->device = dev; sock->server = existing_server; - - init_msghdr(msg); if (__dnbd3_socket_connect(existing_server, sock)) { printk(KERN_ERR "dnbd3: socket connect failed in rtt measurement\n"); goto rtt_error; @@ -700,13 +715,19 @@ static void dnbd3_discovery_worker(struct work_struct *work) printk(KERN_ERR "dnbd3: request select image failed in rtt measurement\n"); goto rtt_error; } + if (dnbd3_receive_cmd(sock, &dnbd3_reply) <= 0) { - printk(KERN_ERR "dnbd3: receive cmd failed in rtt measurement %d\n", j); + printk(KERN_ERR "dnbd3: receive select image failed in rtt measurement\n"); + goto rtt_error; + + } + if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4) { + printk(KERN_ERR "dnbd3: receive select image wrong header in rtt measurement\n"); goto rtt_error; } if (dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply) <= 0) { - printk(KERN_ERR "dnbd3: receive select image failed in rtt measurement %d\n", j); + printk(KERN_ERR "dnbd3: receive data select image failed in rtt measurement\n"); goto rtt_error; } @@ -767,6 +788,8 @@ rtt_error: } } } + mutex_unlock(&sock->lock); + mutex_destroy(&sock->lock); error: if (buf) { kfree(buf); @@ -841,6 +864,7 @@ error: static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server) { int i; + int sock_alive = 0; int result = -EIO; struct dnbd3_sock *sock = NULL; for (i = 0; i < NUMBER_CONNECTIONS; i++) { @@ -869,8 +893,8 @@ static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *s } // start the receiver - INIT_WORK(&sock->receive, dnbd3_receive_worker); - queue_work(dnbd3_wq, &sock->receive); + INIT_WORK(&sock->receive_worker, dnbd3_receive_worker); + queue_work(dnbd3_wq, &sock->receive_worker); result = dnbd3_send_request_blocking(sock, CMD_SELECT_IMAGE); if (result) { @@ -881,15 +905,14 @@ static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *s printk(KERN_DEBUG "dnbd3: connected to image %s, filesize %llu\n", dev->imgname, dev->reported_size); - // add heartbeat timer and scheduler for the command - INIT_WORK(&sock->keepalive, dnbd3_keepalive_worker); - sock->heartbeat_count = 0; - timer_setup(&sock->keepalive_timer, dnbd3_keepalive_timer, 0); - sock->keepalive_timer.expires = KEEPALIVE_TIMER; - add_timer(&sock->keepalive_timer); + INIT_WORK(&sock->keepalive_worker, dnbd3_keepalive_worker); - -// kfree(req); + for (i = 0; i < NUMBER_CONNECTIONS; i++) { + if (dev->socks[i].sock && dev->socks[i].server) { + sock_alive++; + } + } + blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive); return 0; error: if (sock->sock) { @@ -900,7 +923,7 @@ error: } -static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock, bool wait_for_receiver) +static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock) { int i; int sock_alive = 0; @@ -908,7 +931,7 @@ static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server if (sock == NULL && dev->socks[i].server == server) { sock = &dev->socks[i]; } - if (dev->socks[i].sock) { + if (dev->socks[i].sock && dev->socks[i].server) { sock_alive++; } } @@ -916,15 +939,16 @@ static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server printk(KERN_WARNING "dnbd3: could not find socket to disconnect\n"); return -EIO; } + blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive - 1); if (sock_alive <= 1) { printk(KERN_INFO "dnbd3: shutting down last socket and stopping discovery\n"); - del_timer_sync(&dev->discovery_timer); - cancel_work_sync(&dev->discovery); + del_timer_sync(&dev->timer); + dev->timer_count = 0; + cancel_work_sync(&dev->discovery_worker); +// cancel_work_sync(&dev->panic_worker); // do not wait for panic_worker, probably we are called from panic_worker } - printk(KERN_DEBUG "dnbd3: stopping keepalive\n"); - del_timer_sync(&sock->keepalive_timer); - cancel_work_sync(&sock->keepalive); + cancel_work_sync(&sock->keepalive_worker); printk(KERN_DEBUG "dnbd3: socket disconnect device %i\n", dev->minor); mutex_lock(&sock->lock); @@ -945,17 +969,14 @@ static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server mutex_unlock(&sock->lock); mutex_destroy(&sock->lock); - if (wait_for_receiver) { - printk(KERN_DEBUG "dnbd3: cancel receiver work device %i\n", dev->minor); - cancel_work_sync(&sock->receive); - } + printk(KERN_DEBUG "dnbd3: cancel receiver work device %i\n", dev->minor); + cancel_work_sync(&sock->receive_worker); if (sock->sock) { sock_release(sock->sock); sock->sock = NULL; } sock->panic = 0; - sock->heartbeat_count = 0; return 0; } @@ -966,7 +987,7 @@ int dnbd3_net_disconnect(struct dnbd3_device *dev) for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dev->socks[i].sock) { - if (dnbd3_socket_disconnect(dev, NULL, &dev->socks[i], true)) { + if (dnbd3_socket_disconnect(dev, NULL, &dev->socks[i])) { result = -EIO; } } @@ -982,13 +1003,11 @@ int dnbd3_net_connect(struct dnbd3_device *dev) if (dnbd3_socket_connect(dev, &dev->alt_servers[0]) == 0) { dnbd3_print_server_list(dev); - INIT_WORK(&dev->discovery, dnbd3_discovery_worker); - timer_setup(&dev->discovery_timer, dnbd3_discovery_timer, 0); - dev->discovery_timer.expires = DISCOVERY_TIMER; - add_timer(&dev->discovery_timer); - - // let it discover alt servers - queue_work(dnbd3_wq, &dev->discovery); + INIT_WORK(&dev->discovery_worker, dnbd3_discovery_worker); + INIT_WORK(&dev->panic_worker, dnbd3_panic_worker); + timer_setup(&dev->timer, dnbd3_timer, 0); + dev->timer.expires = jiffies + HZ; + add_timer(&dev->timer); result = 0; } else { -- cgit v1.2.3-55-g7522