path: root/src/kernel/blk.c
author     Simon Rettberg  2022-02-12 23:56:35 +0100
committer  Simon Rettberg  2022-02-18 21:34:55 +0100
commit     eb2876f6542af2bfa47c7a6905ecc4f81f1d2ad3 (patch)
tree       17ebb5fd2d4770a4dd67f857f2488221cd46874c /src/kernel/blk.c
parent     [KERNEL] Add missing include to fix compile on 4.14.x (diff)
[KERNEL] Refactor to use workqueues and blk-mq only
Using workqueues frees us from having to manage the lifecycle of three dedicated threads. Discovery (alt server checks) and sending keepalive packets are now done as work items on the power-efficient system queue; sending and receiving happen via dedicated workqueues with higher priority.

blk-mq has also been around for quite a while in the kernel, so switching to it doesn't hurt backwards compatibility. Now that the code is refactored to work the way blk-mq is designed, backwards compatibility has even improved, while at the same time freeing us from an arsenal of macros that were required to make the blk-mq port look and feel like the old implementation. For example, the code now compiles on CentOS 7 with kernel 3.10 without requiring special macros to detect the heavily modified RedHat kernel with all its backported features.

A few other design limitations have been rectified along the way. For example, switching to another server no longer internally disconnects from the current one first, which theoretically could lead to a non-working setup: if the new server isn't reachable and then, because of some transient network error, switching back also fails, the device is left dangling. And since the discover thread was torn down by the disconnect call, the connection would not repair itself eventually either. We now establish the new connection in parallel to the old one, and only if that succeeds do we replace the old one with it, similar to how the automatic alt-server switch already works.
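The parallel-switch approach described above can be illustrated with a minimal sketch. This is not the driver's actual code: dnbd3_connect_to() and dnbd3_swap_socket() are hypothetical stand-ins for whatever dnbd3_new_connection() does internally.

    /*
     * Hedged sketch only: illustrates the "connect in parallel, swap on
     * success" idea from the commit message above. dnbd3_connect_to() and
     * dnbd3_swap_socket() are hypothetical helpers, not the real API.
     */
    static int dnbd3_new_connection_sketch(dnbd3_device_t *dev,
                                           struct sockaddr_storage *addr)
    {
            struct socket *sock = NULL;
            int ret;

            /* Bring up the new connection while the old one keeps serving I/O */
            ret = dnbd3_connect_to(dev, addr, &sock);
            if (ret != 0)
                    return ret; /* old connection untouched, device still works */

            /* Only after the new connection is up: swap out the old socket */
            dnbd3_swap_socket(dev, sock);
            return 0;
    }

The key property is that failure leaves the device exactly as it was, so no "switch back" path is needed at all.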
Diffstat (limited to 'src/kernel/blk.c')
-rw-r--r--  src/kernel/blk.c  332
1 file changed, 145 insertions(+), 187 deletions(-)
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index ccaa6c1..5795c03 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -34,25 +34,16 @@ static int dnbd3_close_device(dnbd3_device_t *dev)
if (dev->imgname)
dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");
- /* quickly fail all requests */
- dnbd3_blk_fail_all_requests(dev);
- dev->panic = 0;
- dev->discover = 0;
+ dev->panic = false;
result = dnbd3_net_disconnect(dev);
kfree(dev->imgname);
dev->imgname = NULL;
/* new requests might have been queued up, */
/* but now that imgname is NULL no new ones can show up */
- dnbd3_blk_fail_all_requests(dev);
-#ifdef DNBD3_BLK_MQ
blk_mq_freeze_queue(dev->queue);
-#endif
set_capacity(dev->disk, 0);
-#ifdef DNBD3_BLK_MQ
blk_mq_unfreeze_queue(dev->queue);
-#endif
- dev->reported_size = 0;
return result;
}
@@ -65,8 +56,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
#endif
char *imgname = NULL;
dnbd3_ioctl_t *msg = NULL;
- unsigned long irqflags;
- int i = 0;
+ int i = 0, j;
u8 locked = 0;
if (arg != 0) {
@@ -97,7 +87,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
switch (cmd) {
case IOCTL_OPEN:
- if (atomic_cmpxchg(&dev->connection_lock, 0, 1) != 0) {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
break;
}
@@ -121,7 +111,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
dev_info(dnbd3_device_to_dev(dev), "opening device.\n");
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
- // set optimal request size for the queue
+ // set optimal request size for the queue to half the read-ahead
blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
// set readahead from optimal request size of the queue
@@ -158,8 +148,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
if (dev->alt_servers[i].host.ss_family == 0)
continue; // Empty slot
- dev->cur_server.host = dev->alt_servers[i].host;
- result = dnbd3_net_connect(dev);
+ result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
if (result == 0) {
/* connection established, store index of server and exit loop */
result = i;
@@ -168,7 +157,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
}
if (result >= 0) {
- /* probing was successful */
+ /* connection was successful */
dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
&dev->cur_server.host);
imgname = NULL; // Prevent kfree at the end
@@ -180,7 +169,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
break;
case IOCTL_CLOSE:
- if (atomic_cmpxchg(&dev->connection_lock, 0, 1) != 0) {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
break;
}
@@ -189,7 +178,7 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
break;
case IOCTL_SWITCH:
- if (atomic_cmpxchg(&dev->connection_lock, 0, 1) != 0) {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
break;
}
@@ -216,40 +205,12 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
/* specified server is current server, so do not switch */
result = 0;
} else {
- struct sockaddr_storage old_server;
-
dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
&new_addr);
- /* save current working server */
- /* lock device to get consistent copy of current working server */
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- old_server = dev->cur_server.host;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
-
- /* disconnect old server */
- dnbd3_net_disconnect(dev);
-
- /* connect to new specified server (switching) */
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->cur_server.host = new_addr;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- result = dnbd3_net_connect(dev);
+ result = dnbd3_new_connection(dev, &new_addr, false);
if (result != 0) {
- /* reconnect with old server if switching has failed */
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->cur_server.host = old_server;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- if (dnbd3_net_connect(dev) != 0) {
- /* we couldn't reconnect to the old server */
- /* device is dangling now and needs another SWITCH call */
- dev_warn(
- dnbd3_device_to_dev(dev),
- "switching failed and could not switch back to old server - dangling device\n");
- result = -ECONNABORTED;
- } else {
- /* switching didn't work but we are back to the old server */
- result = -EAGAIN;
- }
+ /* switching didn't work */
+ result = -EAGAIN;
} else {
/* switch succeeded */
/* fake RTT so we don't switch away again soon */
@@ -257,12 +218,13 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
for (i = 0; i < NUMBER_SERVERS; ++i) {
alt_server = &dev->alt_servers[i];
if (is_same_server(&alt_server->host, &new_addr)) {
- alt_server->rtts[0] = alt_server->rtts[1]
- = alt_server->rtts[2] = alt_server->rtts[3] = 4;
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ alt_server->rtts[j] = 1;
alt_server->best_count = 100;
} else {
- alt_server->rtts[0] <<= 2;
- alt_server->rtts[2] <<= 2;
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ if (alt_server->rtts[j] < 5000)
+ alt_server->rtts[j] = 5000;
alt_server->best_count = 0;
}
}
@@ -318,12 +280,11 @@ static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
break;
}
- if (locked)
- atomic_set(&dev->connection_lock, 0);
-
cleanup_return:
kfree(msg);
kfree(imgname);
+ if (locked)
+ dnbd3_flag_reset(dev->connection_lock);
return result;
}
@@ -332,7 +293,18 @@ static const struct block_device_operations dnbd3_blk_ops = {
.ioctl = dnbd3_blk_ioctl,
};
-#ifdef DNBD3_BLK_MQ
+static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add_tail(&rq->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+}
+
/*
* Linux kernel blk-mq driver function (entry point) to handle block IO requests
*/
@@ -340,110 +312,108 @@ static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_
{
struct request *rq = bd->rq;
dnbd3_device_t *dev = rq->q->queuedata;
- unsigned long irqflags;
+ struct dnbd3_cmd *cmd;
- if (dev->imgname == NULL)
+ if (dev->imgname == NULL || !device_active(dev))
return BLK_STS_IOERR;
- if (!(dnbd3_req_fs(rq)))
+ if (req_op(rq) != REQ_OP_READ)
return BLK_STS_IOERR;
if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
return BLK_STS_TIMEOUT;
- if (!(dnbd3_req_read(rq)))
+ if (rq_data_dir(rq) != READ)
return BLK_STS_NOTSUPP;
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
blk_mq_start_request(rq);
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_add_tail(&rq->queuelist, &dev->request_queue_send);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- wake_up(&dev->process_queue_send);
+ dnbd3_add_queue(dev, rq);
return BLK_STS_OK;
}
-static const struct blk_mq_ops dnbd3_mq_ops = {
- .queue_rq = dnbd3_queue_rq,
-};
-
-#else /* DNBD3_BLK_MQ */
-/*
- * Linux kernel blk driver function (entry point) to handle block IO requests
- */
-static void dnbd3_blk_request(struct request_queue *q)
+static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req, bool reserved)
{
- struct request *rq;
- dnbd3_device_t *dev;
-
- while ((rq = blk_fetch_request(q)) != NULL) {
- dev = rq->rq_disk->private_data;
-
- if (dev->imgname == NULL) {
- __blk_end_request_all(rq, -EIO);
- continue;
- }
-
- if (!(dnbd3_req_fs(rq))) {
- __blk_end_request_all(rq, 0);
- continue;
- }
-
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT) {
- __blk_end_request_all(rq, -EIO);
- continue;
+ unsigned long irqflags;
+ struct request *rq_iter;
+ bool found = false;
+ dnbd3_device_t *dev = req->q->queuedata;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ break;
}
-
- if (!(dnbd3_req_read(rq))) {
- __blk_end_request_all(rq, -EACCES);
- continue;
+ }
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // If still in send queue, do nothing
+ if (found)
+ return BLK_EH_RESET_TIMER;
+
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ list_del_init(&req->queuelist);
+ break;
}
-
- list_add_tail(&rq->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
}
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (!found) {
+ dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
+ // Assume it was finished concurrently
+ return BLK_EH_DONE;
+ }
+ // Add to send queue again and trigger work, reset timeout
+ dnbd3_add_queue(dev, req);
+ return BLK_EH_RESET_TIMER;
}
-#endif /* DNBD3_BLK_MQ */
+
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+const
+#endif
+struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .timeout = dnbd3_rq_timeout,
+};
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
{
int ret;
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
-
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- dev->better_sock = NULL;
+ memset(dev, 0, sizeof(*dev));
+ dev->index = minor;
+ // lock for imgname, cur_server etc.
+ spin_lock_init(&dev->blk_lock);
+ spin_lock_init(&dev->send_queue_lock);
+ spin_lock_init(&dev->recv_queue_lock);
+ INIT_LIST_HEAD(&dev->send_queue);
+ INIT_LIST_HEAD(&dev->recv_queue);
+ dnbd3_flag_reset(dev->connection_lock);
+ dnbd3_flag_reset(dev->discover_running);
+ mutex_init(&dev->alt_servers_lock);
+ dnbd3_net_work_init(dev);
+ // memset has done this already but I like initial values to be explicit
dev->imgname = NULL;
dev->rid = 0;
- dev->update_available = 0;
- mutex_init(&dev->alt_servers_lock);
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0]) * NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- atomic_set(&dev->connection_lock, 0);
- dev->panic = 0;
+ dev->update_available = false;
+ dev->panic = false;
dev->panic_count = 0;
dev->reported_size = 0;
- // set up spin lock for request queues for send and receive
- spin_lock_init(&dev->blk_lock);
-
-#ifdef DNBD3_BLK_MQ
// set up tag_set for blk-mq
dev->tag_set.ops = &dnbd3_mq_ops;
dev->tag_set.nr_hw_queues = 1;
dev->tag_set.queue_depth = 128;
dev->tag_set.numa_node = NUMA_NO_NODE;
- dev->tag_set.cmd_size = 0;
+ dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tag_set.driver_data = dev;
+ dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;
ret = blk_mq_alloc_tag_set(&dev->tag_set);
if (ret) {
@@ -470,16 +440,6 @@ int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
}
dev->queue->queuedata = dev;
#endif
-#else
- // set up blk
- dev->queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock);
- if (!dev->queue) {
- ret = -ENOMEM;
- dev_err(dnbd3_device_to_dev(dev), "blk_init_queue failed\n");
- goto out;
- }
- dev->queue->queuedata = dev;
-#endif /* DNBD3_BLK_MQ */
blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
@@ -527,90 +487,88 @@ int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
out_cleanup_queue:
blk_cleanup_queue(dev->queue);
#endif
-#ifdef DNBD3_BLK_MQ
out_cleanup_tags:
blk_mq_free_tag_set(&dev->tag_set);
-#endif
out:
+ mutex_destroy(&dev->alt_servers_lock);
return ret;
}
int dnbd3_blk_del_device(dnbd3_device_t *dev)
{
- while (atomic_cmpxchg(&dev->connection_lock, 0, 1) != 0)
+ while (!dnbd3_flag_get(dev->connection_lock))
schedule();
dnbd3_close_device(dev);
dnbd3_sysfs_exit(dev);
del_gendisk(dev->disk);
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
blk_cleanup_queue(dev->queue);
-#endif
-#ifdef DNBD3_BLK_MQ
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+#else
blk_cleanup_disk(dev->disk);
#endif
blk_mq_free_tag_set(&dev->tag_set);
-#endif
mutex_destroy(&dev->alt_servers_lock);
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
put_disk(dev->disk);
#endif
return 0;
}
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
+{
+ struct request *blk_request;
+ unsigned long flags;
+ struct list_head local_copy;
+ int count = 0;
+
+ INIT_LIST_HEAD(&local_copy);
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ dnbd3_add_queue(dev, blk_request);
+ }
+}
+
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
- struct request *blk_request, *tmp_request;
- struct request *blk_request2, *tmp_request2;
+ struct request *blk_request;
unsigned long flags;
struct list_head local_copy;
- int dup;
+ int count = 0;
INIT_LIST_HEAD(&local_copy);
- spin_lock_irqsave(&dev->blk_lock, flags);
- while (!list_empty(&dev->request_queue_receive)) {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist) {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist) {
- if (blk_request == blk_request2) {
- dev_warn(dnbd3_device_to_dev(dev),
- "same request is in request_queue_receive multiple times\n");
- BUG();
- dup = 1;
- break;
- }
- }
- if (!dup)
- list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- while (!list_empty(&dev->request_queue_send)) {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist) {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist) {
- if (blk_request == blk_request2) {
- dev_warn(dnbd3_device_to_dev(dev), "request is in both lists\n");
- BUG();
- dup = 1;
- break;
- }
- }
- if (!dup)
- list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ while (!list_empty(&dev->send_queue)) {
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist) {
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
list_del_init(&blk_request->queuelist);
- if (dnbd3_req_fs(blk_request))
-#ifdef DNBD3_BLK_MQ
- blk_mq_end_request(blk_request, BLK_STS_IOERR);
-#else
- blk_end_request_all(blk_request, -EIO);
-#endif
- else if (dnbd3_req_special(blk_request))
- kfree(blk_request);
+ blk_mq_end_request(blk_request, BLK_STS_IOERR);
}
}