summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author    Manuel Bentele  2020-08-27 11:02:30 +0200
committer Manuel Bentele  2020-08-27 11:02:30 +0200
commit    47c8cb8c78fbace1e5edb2107fc1c4dcbe7058be (patch)
tree      55ce28fa6fc8c36b09ac62cec87587c1d987966e
parent    cmake: Fixed missing include paths to build the Linux kernel module (diff)
download  dnbd3-47c8cb8c78fbace1e5edb2107fc1c4dcbe7058be.tar.gz
          dnbd3-47c8cb8c78fbace1e5edb2107fc1c4dcbe7058be.tar.xz
          dnbd3-47c8cb8c78fbace1e5edb2107fc1c4dcbe7058be.zip
[KERNEL] convert to blk-mq and ktime
This converts the dnbd3 kernel module driver to use the blk-mq infrastructure, which allows the dnbd3 kernel module driver to be compatible with Linux kernels in version 5.x or later. The conversion of the implementation uses one hardware queue to preserve the existing send/receive and load-balancing logic, but can be scaled up in the future. In addition to that, time measurements in the implementation are converted to ktime based accessors to replace the use of deprecated time interfaces.
-rw-r--r--  CMakeLists.txt        6
-rw-r--r--  src/kernel/blk.c    276
-rw-r--r--  src/kernel/blk.h      6
-rw-r--r--  src/kernel/dnbd3.h    3
-rw-r--r--  src/kernel/net.c     36
5 files changed, 183 insertions, 144 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18ff147..21865e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -231,8 +231,14 @@ IF(BUILD_KERNEL_MODULE)
SET(KERNEL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build")
ENDIF()
+ SET(KERNEL_C_FLAGS "")
+ IF(KERNEL_C_FLAGS MATCHES Debug)
+ SET(KERNEL_C_FLAGS "-g -DDEBUG")
+ ENDIF()
+
SET(KBUILD_COMMAND ${CMAKE_MAKE_PROGRAM} -C ${KERNEL_DIR}
M=${CMAKE_BINARY_DIR} modules
+ EXTRA_CFLAGS=${KERNEL_C_FLAGS}
)
CONFIGURE_FILE(Kbuild.in ${CMAKE_BINARY_DIR}/Kbuild)
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index 889b988..dde8dea 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -41,92 +41,7 @@
req->cmd_type == REQ_TYPE_SPECIAL
#endif
-int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
-{
- struct gendisk *disk;
- struct request_queue *blk_queue;
-
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_receive);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
-
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- memset(&dev->initial_server, 0, sizeof(dev->initial_server));
- dev->better_sock = NULL;
-
- dev->imgname = NULL;
- dev->rid = 0;
- dev->update_available = 0;
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- dev->disconnecting = 0;
- dev->panic = 0;
- dev->panic_count = 0;
- dev->reported_size = 0;
-
- if (!(disk = alloc_disk(1)))
- {
- printk("ERROR: dnbd3 alloc_disk failed.\n");
- return -EIO;
- }
-
- disk->major = major;
- disk->first_minor = minor;
- sprintf(disk->disk_name, "dnbd%d", minor);
- set_capacity(disk, 0);
- set_disk_ro(disk, 1);
- disk->fops = &dnbd3_blk_ops;
-
- spin_lock_init(&dev->blk_lock);
- if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL)
- {
- printk("ERROR: dnbd3 blk_init_queue failed.\n");
- return -EIO;
- }
-
- blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
- blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
-
- disk->queue = blk_queue;
- disk->private_data = dev;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-#else
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-#endif
-#define ONE_MEG (1048576)
- blk_queue_max_segment_size(disk->queue, ONE_MEG);
- blk_queue_max_segments(disk->queue, 0xffff);
- blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
- disk->queue->limits.max_sectors = 256;
- dev->disk = disk;
-#undef ONE_MEG
-
- add_disk(disk);
- dnbd3_sysfs_init(dev);
- return 0;
-}
-
-int dnbd3_blk_del_device(dnbd3_device_t *dev)
-{
- dnbd3_sysfs_exit(dev);
- dnbd3_net_disconnect(dev);
- del_gendisk(dev->disk);
- put_disk(dev->disk);
- blk_cleanup_queue(dev->disk->queue);
- return 0;
-}
-
-struct block_device_operations dnbd3_blk_ops =
- { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, };
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
int result = -100;
dnbd3_device_t *dev = bdev->bd_disk->private_data;
@@ -225,7 +140,9 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
dnbd3_blk_fail_all_requests(dev);
result = dnbd3_net_disconnect(dev);
dnbd3_blk_fail_all_requests(dev);
+ blk_mq_freeze_queue(dev->queue);
set_capacity(dev->disk, 0);
+ blk_mq_unfreeze_queue(dev->queue);
if (dev->imgname)
{
kfree(dev->imgname);
@@ -275,48 +192,167 @@ cleanup_return:
return result;
}
-/**
- * dev->blk_lock and q->queue_lock are being held
- * when this is called!
- */
-void dnbd3_blk_request(struct request_queue *q)
+static const struct block_device_operations dnbd3_blk_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = dnbd3_blk_ioctl,
+};
+
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
- struct request *req;
- dnbd3_device_t *dev;
+ struct request *rq = bd->rq;
+ dnbd3_device_t *dev = rq->q->queuedata;
+ unsigned long irqflags;
+
+ blk_mq_start_request(rq);
- while ((req = blk_fetch_request(q)) != NULL)
+ if (dev->imgname == NULL)
{
- dev = req->rq_disk->private_data;
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ goto out;
+ }
- if (dev->imgname == NULL)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (!(dnbd3_req_fs(rq)))
+ {
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ goto out;
+ }
- if (!(dnbd3_req_fs(req)))
- {
- __blk_end_request_all(req, 0);
- continue;
- }
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
+ {
+ blk_mq_end_request(rq, BLK_STS_TIMEOUT);
+ goto out;
+ }
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (!(dnbd3_req_read(rq)))
+ {
+ blk_mq_end_request(rq, BLK_STS_NOTSUPP);
+ goto out;
+ }
- if (!(dnbd3_req_read(req)))
- {
- __blk_end_request_all(req, -EACCES);
- continue;
- }
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ list_add_tail(&rq->queuelist, &dev->request_queue_send);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ wake_up(&dev->process_queue_send);
+
+out:
+ return BLK_STS_OK;
+}
+
+static const struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+};
+
+int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+ int ret;
+
+ init_waitqueue_head(&dev->process_queue_send);
+ init_waitqueue_head(&dev->process_queue_receive);
+ init_waitqueue_head(&dev->process_queue_discover);
+ INIT_LIST_HEAD(&dev->request_queue_send);
+ INIT_LIST_HEAD(&dev->request_queue_receive);
+
+ memset(&dev->cur_server, 0, sizeof(dev->cur_server));
+ memset(&dev->initial_server, 0, sizeof(dev->initial_server));
+ dev->better_sock = NULL;
+
+ dev->imgname = NULL;
+ dev->rid = 0;
+ dev->update_available = 0;
+ memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
+ dev->thread_send = NULL;
+ dev->thread_receive = NULL;
+ dev->thread_discover = NULL;
+ dev->discover = 0;
+ dev->disconnecting = 0;
+ dev->panic = 0;
+ dev->panic_count = 0;
+ dev->reported_size = 0;
+
+ // set up spin lock for request queues for send and receive
+ spin_lock_init(&dev->blk_lock);
+
+ // set up tag_set for blk-mq
+ dev->tag_set.ops = &dnbd3_mq_ops;
+ dev->tag_set.nr_hw_queues = 1;
+ dev->tag_set.queue_depth = 128;
+ dev->tag_set.numa_node = NUMA_NO_NODE;
+ dev->tag_set.cmd_size = 0;
+ dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tag_set.driver_data = dev;
+
+ ret = blk_mq_alloc_tag_set(&dev->tag_set);
+ if (ret)
+ {
+ printk(KERN_ERR "ERROR: dnbd3 blk_mq_alloc_tag_set failed.\n");
+ goto out;
+ }
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
+ // set up blk-mq
+ dev->queue = blk_mq_init_queue(&dev->tag_set);
+ if (IS_ERR(dev->queue)) {
+ ret = PTR_ERR(dev->queue);
+ goto out_cleanup_tags;
}
+ dev->queue->queuedata = dev;
+
+ blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+ blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
+#else
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
+#endif
+#define ONE_MEG (1048576)
+ blk_queue_max_segment_size(dev->queue, ONE_MEG);
+ blk_queue_max_segments(dev->queue, 0xffff);
+ blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
+ dev->queue->limits.max_sectors = 256;
+#undef ONE_MEG
+
+ // set up disk
+ if (!(dev->disk = alloc_disk(1)))
+ {
+ printk(KERN_ERR "ERROR: dnbd3 alloc_disk failed.\n");
+ ret = -ENOMEM;
+ goto out_cleanup_queue;
+ }
+
+ dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
+ dev->disk->major = major;
+ dev->disk->first_minor = minor;
+ dev->disk->fops = &dnbd3_blk_ops;
+ dev->disk->private_data = dev;
+ dev->disk->queue = dev->queue;
+ sprintf(dev->disk->disk_name, "dnbd%d", minor);
+ set_capacity(dev->disk, 0);
+ set_disk_ro(dev->disk, 1);
+ add_disk(dev->disk);
+
+ // set up sysfs
+ dnbd3_sysfs_init(dev);
+
+ return 0;
+
+out_cleanup_queue:
+ blk_cleanup_queue(dev->queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&dev->tag_set);
+out:
+ return ret;
+}
+
+int dnbd3_blk_del_device(dnbd3_device_t *dev)
+{
+ dnbd3_sysfs_exit(dev);
+ dnbd3_net_disconnect(dev);
+ del_gendisk(dev->disk);
+ blk_cleanup_queue(dev->queue);
+ blk_mq_free_tag_set(&dev->tag_set);
+ put_disk(dev->disk);
+ return 0;
}
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
@@ -371,7 +407,7 @@ void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
if (dnbd3_req_fs(blk_request))
{
spin_lock_irqsave(&dev->blk_lock, flags);
- __blk_end_request_all(blk_request, -EIO);
+ blk_mq_end_request(blk_request, BLK_STS_IOERR);
spin_unlock_irqrestore(&dev->blk_lock, flags);
}
else if (dnbd3_req_special(blk_request))
diff --git a/src/kernel/blk.h b/src/kernel/blk.h
index 5091d19..0afce2e 100644
--- a/src/kernel/blk.h
+++ b/src/kernel/blk.h
@@ -27,12 +27,6 @@
#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV
#endif
-extern struct block_device_operations dnbd3_blk_ops;
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
-
-void dnbd3_blk_request(struct request_queue *q);
-
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor);
int dnbd3_blk_del_device(dnbd3_device_t *dev);
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
index f8af69f..aceb853 100644
--- a/src/kernel/dnbd3.h
+++ b/src/kernel/dnbd3.h
@@ -25,6 +25,7 @@
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <net/sock.h>
#define KERNEL_MODULE
@@ -46,6 +47,8 @@ typedef struct
{
// block
struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct request_queue *queue;
spinlock_t blk_lock;
// sysfs
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 9e48b86..337cfd7 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -25,13 +25,15 @@
#include "serialize.h"
-#include <linux/time.h>
+#include <linux/ktime.h>
#include <linux/signal.h>
#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC)
+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
#else
@@ -203,8 +205,8 @@ int dnbd3_net_connect(dnbd3_device_t *dev)
if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0)
error_dev("ERROR: Couldn't create socket (v6).");
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO_NEW, (char *)&timeout, sizeof(timeout));
dev->sock->sk->sk_allocation = GFP_NOIO;
if (dev->cur_server.host.type == HOST_IP4)
{
@@ -289,8 +291,8 @@ int dnbd3_net_connect(dnbd3_device_t *dev)
debug_dev("INFO: On-the-fly server change.");
dev->sock = dev->better_sock;
dev->better_sock = NULL;
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO_NEW, (char *)&timeout, sizeof(timeout));
}
dev->panic = 0;
@@ -459,7 +461,7 @@ int dnbd3_net_discover(void *data)
uint64_t filesize;
uint16_t rid;
- struct timeval start, end;
+ ktime_t start, end;
unsigned long rtt, best_rtt = 0;
unsigned long irqflags;
int i, j, isize, best_server, current_server;
@@ -565,7 +567,7 @@ int dnbd3_net_discover(void *data)
}
if (NUMBER_SERVERS > isize) {
for (i = 0; i < isize; ++i) {
- j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS;
+ j = ((ktime_to_s(start) >> i) ^ (ktime_to_us(start) >> j)) % NUMBER_SERVERS;
if (j != i) {
mlen = check_order[i];
check_order[i] = check_order[j];
@@ -579,7 +581,7 @@ int dnbd3_net_discover(void *data)
i = check_order[j];
if (dev->alt_servers[i].host.type == 0) // Empty slot
continue;
- if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times
+ if (!dev->panic && dev->alt_servers[i].failures > 50 && (ktime_to_us(start) & 7) != 0) // If not in panic mode, skip server if it failed too many times
continue;
if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i]))
continue;
@@ -591,8 +593,8 @@ int dnbd3_net_discover(void *data)
sock = NULL;
continue;
}
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&timeout, sizeof(timeout));
+ kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO_NEW, (char *)&timeout, sizeof(timeout));
sock->sk->sk_allocation = GFP_NOIO;
if (dev->alt_servers[i].host.type == HOST_IP4)
{
@@ -693,7 +695,7 @@ int dnbd3_net_discover(void *data)
iov[0].iov_len = sizeof(dnbd3_request);
// start rtt measurement
- do_gettimeofday(&start);
+ start = ktime_get_real();
if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0)
error_alt("ERROR: Requesting test block failed (discover).");
@@ -715,10 +717,9 @@ int dnbd3_net_discover(void *data)
if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE)
error_alt("ERROR: Receiving test block payload failed (discover).");
- do_gettimeofday(&end); // end rtt measurement
+ end = ktime_get_real(); // end rtt measurement
- dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull
- + (end.tv_usec - start.tv_usec));
+ dev->alt_servers[i].rtts[turn] = (unsigned long) ktime_us_delta(end, start);
rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2]
+ dev->alt_servers[i].rtts[3]) / 4;
@@ -781,7 +782,7 @@ int dnbd3_net_discover(void *data)
continue;
}
- do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0
+ do_change = ready && best_server != current_server && (ktime_to_us(start) & 3) != 0
&& RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500;
if (ready && !do_change) {
@@ -823,7 +824,7 @@ int dnbd3_net_discover(void *data)
best_sock = NULL;
}
- if (!ready || (start.tv_usec & 15) != 0)
+ if (!ready || (ktime_to_us(start) & 15) != 0)
turn = (turn + 1) % 4;
if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing
ready = 1;
@@ -1032,7 +1033,7 @@ int dnbd3_net_receive(void *data)
}
spin_lock_irqsave(&dev->blk_lock, irqflags);
list_del_init(&blk_request->queuelist);
- __blk_end_request_all(blk_request, 0);
+ blk_mq_end_request(blk_request, BLK_STS_OK);
spin_unlock_irqrestore(&dev->blk_lock, irqflags);
continue;
@@ -1120,4 +1121,3 @@ int dnbd3_net_receive(void *data)
dev->thread_receive = NULL;
return -1;
}
-