summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLars Müller2008-03-01 19:30:38 +0100
committerLars Müller2008-03-01 19:30:38 +0100
commit93b9103f7383d400616d222606c294e07b16e1aa (patch)
tree611a39f7bc1d1dd5a4335157ef95c101d64dddc8 /kernel
downloaddnbd2-93b9103f7383d400616d222606c294e07b16e1aa.tar.gz
dnbd2-93b9103f7383d400616d222606c294e07b16e1aa.tar.xz
dnbd2-93b9103f7383d400616d222606c294e07b16e1aa.zip
Import dnbd* from the former openslx-contrib repo as of revision 92.
openslx-contrib is currently read only and will get removed in some days. git-svn-id: http://svn.openslx.org/svn/openslx/contrib/dnbd2/trunk@1592 95ad53e4-c205-0410-b2fa-d234c58c8868
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile18
-rw-r--r--kernel/core.c510
-rw-r--r--kernel/core.h102
-rw-r--r--kernel/devices.c220
-rw-r--r--kernel/devices.h14
-rw-r--r--kernel/dnbd2.h118
-rw-r--r--kernel/fops.c43
-rw-r--r--kernel/fops.h9
-rw-r--r--kernel/misc.c84
-rw-r--r--kernel/misc.h23
-rw-r--r--kernel/scp1
-rw-r--r--kernel/servers.c355
-rw-r--r--kernel/servers.h84
-rw-r--r--kernel/sysfs.c460
-rw-r--r--kernel/sysfs.h24
15 files changed, 2065 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..8c01ed3
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,18 @@
+#
+# kernel/Makefile
+#
+
+KDIR := /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+obj-m := dnbd2.o
+dnbd2-objs := fops.o sysfs.o servers.o devices.o misc.o core.o
+
+all:
+ $(MAKE) -C $(KDIR) M=$(PWD) modules
+
+install:
+ $(MAKE) -C $(KDIR) M=$(PWD) modules_install
+
+clean:
+ rm -rf *.o *.ko *~ *.symvers dnbd2.mod.c .*o.cmd .tmp_versions
diff --git a/kernel/core.c b/kernel/core.c
new file mode 100644
index 0000000..6bee380
--- /dev/null
+++ b/kernel/core.c
@@ -0,0 +1,510 @@
+/*
+ * kernel/core.c - The principal component of the driver.
+ * See core.h for comments on each function.
+ */
+
+
+#include "dnbd2.h"
+#include "core.h"
+#include "devices.h"
+#include "misc.h"
+#include "fops.h"
+#include "servers.h"
+
+
+int dnbd2_major;
+static dnbd2_device_t dev[DNBD2_DEVICES];
+
+
+void dnbd2_request(request_queue_t *q)
+{
+ int i;
+ struct request *req;
+ struct req_info *info;
+ dnbd2_device_t *dev;
+
+ while ((req = elv_next_request(q)) != NULL) {
+ /* Discard invalid block requests. */
+ if (!blk_fs_request(req)) {
+ end_request(req, 0);
+ continue;
+ }
+ if (rq_data_dir(req) != READ) {
+ end_request(req, 0);
+ continue;
+ }
+
+ /* Prepare request. */
+ dev = req->rq_disk->private_data;
+ for (i=0 ; i<POOL_SIZE ; i++) {
+ if (dev->info_pool[i].cnt == -1)
+ break;
+ }
+ if (i == POOL_SIZE)
+ continue;
+ dev->pending_reqs++;
+ info = &dev->info_pool[i];
+ info->time = jiffies;
+ info->cnt = 0;
+ info->cmd = CMD_GET_BLOCK;
+ info->dst = dev->active_server;
+ req->special = info;
+ blkdev_dequeue_request(req);
+
+ /* Enqueue the request for sending. */
+ spin_lock_bh(&dev->send_queue_lock);
+ list_add_tail(&req->queuelist, &dev->send_queue);
+ spin_unlock_bh(&dev->send_queue_lock);
+
+ /* Wakeup sender function. */
+ wake_up_interruptible(&dev->sender_wq);
+ }
+}
+
+
+int dnbd2_tx_loop(void *data)
+{
+ struct request *req;
+ dnbd2_device_t *dev = data;
+ struct req_info *info;
+
+ daemonize("dnbd2_tx_loop");
+ allow_signal(SIGKILL);
+ complete(&dev->tx_start);
+
+ while (1) {
+ wait_event_interruptible(dev->sender_wq,
+ !list_empty(&dev->send_queue));
+
+ /*
+ * If wait_event_interruptible is interrupted while
+ * the queue is not empty it retunrs 0, not
+ * -ERESTARTSYS. Therefore we need another flag.
+ */
+ if (dev->tx_signal)
+ break;
+
+ /* Dequeue request from the send-queue. */
+ spin_lock_bh(&dev->send_queue_lock);
+ req = blkdev_entry_to_request(dev->send_queue.next);
+ list_del_init(&req->queuelist);
+ spin_unlock_bh(&dev->send_queue_lock);
+
+ /*
+ * This is a good place to do some sanity checks
+ * because the request is neither in the send- nor in
+ * the pending-queue.
+ */
+ down(&dev->servers_mutex);
+ info = req->special;
+ switch (info->cmd) {
+ case CMD_GET_BLOCK:
+ /*
+ * If dnbd2_request is called w/o first
+ * opening the device or with the device's
+ * capacity set to zero we should land here.
+ */
+ if (!dev->running) {
+ dnbd2_end_request(req, 0);
+ up(&dev->servers_mutex);
+ continue;
+ }
+ /*
+ * Send block requests to dev->active_server,
+ * which is always set while running.
+ */
+ info->last_dst = info->dst;
+ info->dst = dev->active_server;
+ break;
+
+ case CMD_GET_SERVERS:
+ case CMD_GET_SIZE:
+ /*
+ * Discard the request if it's destination is
+ * no longer available.
+ */
+ if (!info->dst->sock) {
+ dnbd2_end_request(req, 1);
+ up(&dev->servers_mutex);
+ continue;
+ }
+ break;
+ }
+
+ /* Enqueue request in the pending-queue. */
+ spin_lock_bh(&dev->pending_queue_lock);
+ list_add_tail(&req->queuelist, &dev->pending_queue);
+ spin_unlock_bh(&dev->pending_queue_lock);
+
+ /* Send request. */
+ dnbd2_send_request(req, dev);
+ up(&dev->servers_mutex);
+ }
+
+ complete_and_exit(&dev->tx_stop, 0);
+}
+
+
+void dnbd2_send_request(struct request *req, dnbd2_device_t *dev) {
+ dnbd2_data_request_t dnbd2_req;
+ struct req_info *info = req->special;
+
+ dnbd2_req.cmd = cpu_to_be16(info->cmd);
+ dnbd2_req.time = cpu_to_be16(info->time);
+ dnbd2_req.vid = cpu_to_be16(dev->vid);
+ dnbd2_req.rid = cpu_to_be16(dev->rid);
+ dnbd2_req.num = cpu_to_be64(req->sector * SECTOR_SIZE);
+
+ /*
+ * If sock_xmit fails the request eventually gets requeued.
+ * That's why we don't check its return value.
+ */
+ sock_xmit(info->dst->sock, SEND, &dnbd2_req, sizeof(dnbd2_req));
+ info->cnt++;
+ if (info->dst != info->last_dst) {
+ info->cnt = 1;
+ } else if (info->cnt > 1) {
+ info->dst->retries++;
+ }
+}
+
+
+int dnbd2_rx_loop(void *data)
+{
+ struct srv_info *srv_info = data;
+ dnbd2_device_t *dev = srv_info->dev;
+ dnbd2_data_reply_t dnbd2_rep;
+ struct request *req;
+ uint64_t num;
+ uint16_t cmd, time;
+ char *buffer;
+ int ret, i;
+
+ daemonize("dnbd2_rx_loop");
+ allow_signal(SIGKILL);
+ complete(&srv_info->rx_start);
+
+ while (1) {
+ ret = sock_xmit(srv_info->sock, RECV,
+ &dnbd2_rep, sizeof(dnbd2_rep));
+ if (ret == -EINTR)
+ break;
+ if (ret != sizeof(dnbd2_rep) ||
+ be16_to_cpu(dnbd2_rep.vid) != dev->vid ||
+ be16_to_cpu(dnbd2_rep.rid) != dev->rid)
+ continue;
+
+ cmd = be16_to_cpu(dnbd2_rep.cmd);
+ num = be64_to_cpu(dnbd2_rep.num);
+ time = be16_to_cpu(dnbd2_rep.time);
+
+ /* Find a matching request in the pending-queue. */
+ req = dnbd2_find_request(num, cmd, srv_info);
+ if (!req)
+ continue;
+ srv_info->last_reply = jiffies;
+ update_rtt(diff(time), srv_info, cmd);
+
+ switch (cmd) {
+ case CMD_GET_BLOCK:
+ spin_lock(&dev->kmap_lock);
+ buffer = __bio_kmap_atomic(req->bio, 0, KM_USER0);
+ memcpy(buffer,
+ dnbd2_rep.payload.data,
+ DNBD2_BLOCK_SIZE);
+ __bio_kunmap_atomic(req->bio, KM_USER0);
+ spin_unlock(&dev->kmap_lock);
+ break;
+
+ case CMD_GET_SIZE:
+ if (!srv_info->capacity) {
+ srv_info->capacity = num / SECTOR_SIZE;
+ wake_up_all(&srv_info->wq);
+ }
+ break;
+
+ case CMD_GET_SERVERS:
+ down(&dev->servers_mutex);
+
+ if (dev->emergency)
+ stop_emergency(srv_info);
+
+ /* Recreate the emergency list. */
+ dev->emerg_list[0].ip = srv_info->ip;
+ dev->emerg_list[0].port = srv_info->port;
+ for (i=1 ; i<ALT_SERVERS_MAX; i++) {
+ if (num) {
+ memcpy(&dev->emerg_list[i],
+ &dnbd2_rep.payload.server[i-1],
+ sizeof(dnbd2_server_t));
+ try_add_server(dev->emerg_list[i], dev);
+ num--;
+ } else {
+ dev->emerg_list[i].ip = 0;
+ dev->emerg_list[i].port = 0;
+ }
+ }
+ up(&dev->servers_mutex);
+ break;
+ }
+ dnbd2_end_request(req, 1);
+ }
+
+ complete_and_exit(&srv_info->rx_stop, 0);
+}
+
+
+struct request *dnbd2_find_request(uint64_t num,
+ uint16_t cmd,
+ struct srv_info *dst)
+{
+ dnbd2_device_t *dev = dst->dev;
+ struct list_head *cur, *next;
+ struct req_info *info;
+ struct request *req;
+ sector_t sector;
+
+ switch (cmd) {
+ case CMD_GET_BLOCK:
+ sector = num / SECTOR_SIZE;
+ spin_lock_bh(&dev->pending_queue_lock);
+ list_for_each_safe(cur, next, &dev->pending_queue) {
+ req = blkdev_entry_to_request(cur);
+ info = req->special;
+
+ if (req->sector == sector && info->cmd == cmd) {
+ list_del_init(&req->queuelist);
+ spin_unlock_bh(&dev->pending_queue_lock);
+ return req;
+ }
+ }
+ break;
+
+ case CMD_GET_SIZE:
+ case CMD_GET_SERVERS:
+ spin_lock_bh(&dev->pending_queue_lock);
+ list_for_each_safe(cur, next, &dev->pending_queue) {
+ req = blkdev_entry_to_request(cur);
+ info = req->special;
+
+ if (info->cmd == cmd && info->dst == dst) {
+ list_del_init(&req->queuelist);
+ spin_unlock_bh(&dev->pending_queue_lock);
+ return req;
+ }
+ }
+ }
+
+ spin_unlock_bh(&dev->pending_queue_lock);
+ return NULL;
+}
+
+
+void dnbd2_end_request(struct request *req, int success)
+{
+ unsigned long flags;
+ struct req_info *info = req->special;
+ dnbd2_device_t *dev;
+
+ switch (info->cmd) {
+ case CMD_GET_BLOCK:
+ dev = req->rq_disk->private_data;
+ spin_lock_irqsave(&dev->blk_lock, flags);
+ list_del_init(&req->queuelist);
+ if (!end_that_request_first(req, success, req->nr_sectors)) {
+ end_that_request_last(req, success);
+ }
+ dev->pending_reqs--;
+ spin_unlock_irqrestore(&dev->blk_lock, flags);
+ info->cnt = -1;
+ break;
+
+ case CMD_GET_SIZE:
+ case CMD_GET_SERVERS:
+ kfree(info);
+ kfree(req);
+ break;
+ }
+}
+
+
+void dnbd2_requeue_timer(unsigned long arg)
+{
+ dnbd2_device_t *dev = (dnbd2_device_t *)arg;
+ struct list_head *cur, *next;
+ struct request *req;
+ struct req_info *info;
+ unsigned long too_long;
+ int requeue;
+
+ spin_lock(&dev->pending_queue_lock);
+ list_for_each_safe(cur, next, &dev->pending_queue) {
+ requeue = 0;
+ req = blkdev_entry_to_request(cur);
+ info = req->special;
+ if (!info->cnt)
+ continue;
+
+ /* Each request type has a specific requeue policy. */
+ switch (info->cmd) {
+ case CMD_GET_BLOCK:
+ too_long = 2 * (info->dst->srtt >> SRTT_SHIFT);
+ too_long *= 4 << info->cnt;
+ if (too_long > HZ)
+ too_long = HZ;
+ if (diff(info->time) >= too_long)
+ requeue = 1;
+ break;
+
+ case CMD_GET_SERVERS:
+ case CMD_GET_SIZE:
+ if (info->cnt == 4) {
+ list_del_init(&req->queuelist);
+ dnbd2_end_request(req, 0);
+ schedule_del_server(info->dst);
+ break;
+ }
+ if (diff(info->time) >= HZ)
+ requeue = 1;
+ break;
+ }
+
+ if (requeue) {
+ list_del_init(&req->queuelist);
+ spin_lock(&dev->send_queue_lock);
+ list_add_tail(&req->queuelist, &dev->send_queue);
+ spin_unlock(&dev->send_queue_lock);
+ }
+ }
+ spin_unlock(&dev->pending_queue_lock);
+
+ wake_up_interruptible(&dev->sender_wq);
+ dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL;
+ add_timer(&dev->requeue_timer);
+}
+
+
+void dnbd2_hb_timer(unsigned long arg)
+{
+ dnbd2_device_t *dev = (dnbd2_device_t *)arg;
+ int i;
+
+ if (dev->running)
+ for_each_server(i)
+ enqueue_hb(&dev->servers[i]);
+
+ wake_up_interruptible(&dev->sender_wq);
+ dev->hb_timer.expires = jiffies + dev->hb_interval;
+ add_timer(&dev->hb_timer);
+}
+
+
+void dnbd2_to_timer(unsigned long arg)
+{
+ dnbd2_device_t *dev = (dnbd2_device_t *)arg;
+
+ if (dev->running)
+ schedule_activate_fastest(dev);
+
+ dev->to_timer.expires = jiffies + TO_INTERVAL;
+ add_timer(&dev->to_timer);
+}
+
+
+void start_emergency(dnbd2_device_t *dev)
+{
+ int i;
+
+ if (dev->emergency)
+ return;
+
+ p("No servers reachable, starting emergency mode!\n");
+ dev->emergency = 1;
+
+ del_timer(&dev->requeue_timer);
+ del_timer(&dev->to_timer);
+ del_timer(&dev->hb_timer);
+
+ /* Activate the emergency list. */
+ for_each_server(i)
+ try_add_server(dev->emerg_list[i], dev);
+
+ /* Increase frequency of heartbeats. */
+ dev->hb_interval = HB_EMERG_INTERVAL;
+ dev->hb_timer.expires = jiffies + HB_EMERG_INTERVAL;
+ add_timer(&dev->hb_timer);
+}
+
+
+void stop_emergency(struct srv_info *srv_info)
+{
+ dnbd2_device_t *dev = srv_info->dev;
+
+ if (!dev->emergency)
+ return;
+
+ p("Stopping emergency mode.\n");
+ del_timer(&dev->hb_timer);
+
+ dev->active_server = srv_info;
+
+ /* Decrease frequency of heartbeats. */
+ dev->hb_interval = HB_NORMAL_INTERVAL;
+ dev->hb_timer.expires = jiffies + HB_NORMAL_INTERVAL;
+ add_timer(&dev->hb_timer);
+
+ dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL;
+ add_timer(&dev->requeue_timer);
+ dev->to_timer.expires = jiffies + TO_INTERVAL;
+ add_timer(&dev->to_timer);
+
+ dev->emergency = 0;
+}
+
+
+static int __init dnbd2_init(void)
+{
+ int i;
+
+ /* We are platform dependant. */
+ if (DNBD2_BLOCK_SIZE != PAGE_SIZE) {
+ printk(LOG "DNBD2_BLOCK_SIZE (%d) != PAGE_SIZE (%d)\n",
+ (int) DNBD2_BLOCK_SIZE, (int) PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ dnbd2_major = register_blkdev(0, "dnbd2");
+ if (dnbd2_major <= 0) {
+ p("Could not get major number.\n");
+ return -EBUSY;
+ }
+
+ for (i=0 ; i<DNBD2_DEVICES ; i++)
+ if (add_device(&dev[i], i))
+ goto out;
+
+ p("DNBD2 loaded.\n");
+ return 0;
+
+ out:
+ while (i--)
+ del_device(&dev[i]);
+ return -ENOMEM;
+}
+
+
+static void __exit dnbd2_exit(void)
+{
+ int i;
+ for (i=0 ; i<DNBD2_DEVICES ; i++)
+ del_device(&dev[i]);
+ unregister_blkdev(dnbd2_major, "dnbd2");
+ p("DNBD2 unloaded.\n");
+}
+
+
+module_init(dnbd2_init);
+module_exit(dnbd2_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device v2");
+MODULE_LICENSE("GPL");
diff --git a/kernel/core.h b/kernel/core.h
new file mode 100644
index 0000000..3c75ade
--- /dev/null
+++ b/kernel/core.h
@@ -0,0 +1,102 @@
+/*
+ * kernel/core.h
+ */
+
+
+/* interval to send heartbeats */
+#define HB_NORMAL_INTERVAL 30*HZ
+
+/* interval to send heartbeats in emergency-mode */
+#define HB_EMERG_INTERVAL 5*HZ
+
+/* interval to check the pending-queue */
+#define REQUEUE_INTERVAL HZ/20
+
+/* interval to look for faster servers */
+#define TO_INTERVAL HZ
+
+/* dnbd0, dnbd1, etc */
+#define DNBD2_DEVICES 3
+
+
+/* Given to us by the kernel at load-time. */
+extern int dnbd2_major;
+
+
+/*
+ * Called by the block layer to make the driver process some
+ * requests. For each request it does the following:
+ * 1. Prepares the request by attaching information to it.
+ * 2. Enqueues the request into the send-queue.
+ * 3. Wakes up dnbd2_tx_loop.
+ */
+void dnbd2_request(request_queue_t *q);
+
+/*
+ * This thread sleeps until there are requests in the send-queue.
+ * Each request is dequeued from the send-queue, enqueued into the
+ * pending-queue and finally given to dnbd2_send_request.
+ */
+int dnbd2_tx_loop(void *data);
+
+/*
+ * Send a request and update the request's and the destination
+ * server's counters.
+ */
+void dnbd2_send_request(struct request *req, dnbd2_device_t *dev);
+
+/*
+ * This thread sleeps until a reply arrives and then processes it.
+ */
+int dnbd2_rx_loop(void *data);
+
+/*
+ * When a reply arrives through the network it is caught by
+ * dnbd2_rx_loop, which calls this function to find a matching request
+ * in the pending queue.
+ */
+struct request *dnbd2_find_request(uint64_t num, uint16_t cmd,
+ struct srv_info *dst);
+
+/*
+ * The driver calls this function when it is done processing a request
+ * and has no further use for it. Resources are freed and the block
+ * layer informed if necessary.
+ */
+void dnbd2_end_request(struct request *req, int success);
+
+/*
+ * If a request remains too long unanswered, it is moved by this
+ * function from the pending-queue into the send-queue. This function
+ * is fired regularly by a timer.
+ */
+void dnbd2_requeue_timer(unsigned long arg);
+
+/*
+ * Periodically enqueu, for each server, a heartbeat request
+ * (CMD_GET_SERVERS) into the send-queue.
+ */
+void dnbd2_hb_timer(unsigned long arg);
+
+/*
+ * Check every TO_INTERVAL jiffies if there is a faster server and
+ * switch to it. TO stands for "takeover".
+ */
+void dnbd2_to_timer(unsigned long arg);
+
+/*
+ * If the driver looses contact to all servers it starts the emergency
+ * mode: First dnbd2_requeue_timer and dnbd2_to_timer are stopped.
+ * Then the emergency list (the last list of servers acquired with
+ * CMD_GET_SERVERS) is activated and the rate of heartbeats increased
+ * to HB_EMERG_INTERVAL.
+ */
+void start_emergency(dnbd2_device_t *dev);
+
+/*
+ * This function is called if a server answers to a heartbeat during
+ * emergency mode. This server (@srv_info) is made the active server,
+ * dnbd2_requeue_timer and dnbd2_to_timer started and the rate of
+ * heartbeats decreased to HB_NORMAL_INTERVAL.
+ */
+void stop_emergency(struct srv_info *srv_info);
diff --git a/kernel/devices.c b/kernel/devices.c
new file mode 100644
index 0000000..ccc6db1
--- /dev/null
+++ b/kernel/devices.c
@@ -0,0 +1,220 @@
+/*
+ * kernel/devices.c
+ */
+
+
+#include "dnbd2.h"
+#include "core.h"
+#include "fops.h"
+#include "sysfs.h"
+#include "devices.h"
+#include "servers.h"
+
+
+#define TO_PERCENT 50
+#define TO_JIFFIES 10
+
+
+/*
+ * Activate the request-processing mechanism for @dev: dnbd2_requeue
+ * (timer) and dnbd2_tx_loop (thread). dnbd2_rx_loop is started
+ * afterwards by add_server on a per-server basis.
+ */
+int start_device(dnbd2_device_t *dev)
+{
+ /* Start requeue timer. */
+ init_timer(&dev->requeue_timer);
+ dev->requeue_timer.data = (unsigned long)dev;
+ dev->requeue_timer.function = dnbd2_requeue_timer;
+ dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL;
+ add_timer(&dev->requeue_timer);
+
+ /* Start heartbeat timer. */
+ init_timer(&dev->hb_timer);
+ dev->hb_timer.data = (unsigned long)dev;
+ dev->hb_timer.function = dnbd2_hb_timer;
+ dev->hb_timer.expires = jiffies + HB_NORMAL_INTERVAL;
+ add_timer(&dev->hb_timer);
+
+ /* Start takeover timer. */
+ init_timer(&dev->to_timer);
+ dev->to_timer.data = (unsigned long)dev;
+ dev->to_timer.function = dnbd2_to_timer;
+ dev->to_timer.expires = jiffies + TO_INTERVAL;
+ add_timer(&dev->to_timer);
+
+ /* Start sending thread. */
+ dev->tx_signal = 0;
+ dev->tx_id = kernel_thread(dnbd2_tx_loop, dev, CLONE_KERNEL);
+ if (dev->tx_id < 0) {
+ del_timer(&dev->hb_timer);
+ del_timer(&dev->to_timer);
+ del_timer(&dev->requeue_timer);
+ return -1;
+ }
+ wait_for_completion(&dev->tx_start);
+
+ return 0;
+}
+
+
+/*
+ * Deactivate the request-processing mechanism for @dev. All
+ * dnbd2_rx_loop threads must be stopped beforehand by del_server.
+ */
+void stop_device(dnbd2_device_t *dev)
+{
+ struct list_head *cur, *next;
+ struct request *req;
+
+ /* Stop heartbeat timer. */
+ del_timer(&dev->hb_timer);
+
+ /* Stop takeover timer. */
+ del_timer(&dev->to_timer);
+
+ /* Stop request processing. */
+ del_timer(&dev->requeue_timer);
+ dev->tx_signal = 1;
+ kill_proc(dev->tx_id, SIGKILL, 1);
+ wait_for_completion(&dev->tx_stop);
+
+ /* Empty pending-queue. */
+ list_for_each_safe(cur, next, &dev->pending_queue) {
+ req = blkdev_entry_to_request(cur);
+ list_del_init(&req->queuelist);
+ dnbd2_end_request(req, 0);
+ }
+
+ /* Empty send-queue. */
+ list_for_each_safe(cur, next, &dev->send_queue) {
+ req = blkdev_entry_to_request(cur);
+ list_del_init(&req->queuelist);
+ dnbd2_end_request(req, 0);
+ }
+}
+
+
+int add_device(dnbd2_device_t *dev, int minor)
+{
+ struct request_queue *queue;
+ struct srv_info *srv_info;
+ struct gendisk *disk;
+ int i;
+
+ /*
+ * Prepare dnbd2_device_t. Please use the
+ * same order as in dnbd2.h.
+ */
+ INIT_WORK(&dev->work, NULL); //, NULL);
+ /* Change in /<linuxheaders>/include/linux/workqueue.h */
+ spin_lock_init(&dev->kmap_lock);
+ for (i=0 ; i<POOL_SIZE ; i++)
+ dev->info_pool[i].cnt = -1;
+
+ dev->emergency = 0;
+ dev->running = 0;
+
+ dev->vid = 0;
+ dev->rid = 0;
+
+ atomic_set(&dev->refcnt, 0);
+ init_MUTEX(&dev->config_mutex);
+
+ spin_lock_init(&dev->blk_lock);
+
+ init_waitqueue_head(&dev->sender_wq);
+ spin_lock_init(&dev->send_queue_lock);
+ INIT_LIST_HEAD(&dev->send_queue);
+
+ dev->pending_reqs = 0;
+ spin_lock_init(&dev->pending_queue_lock);
+ INIT_LIST_HEAD(&dev->pending_queue);
+
+ dev->hb_interval = HB_NORMAL_INTERVAL;
+
+ init_completion(&dev->tx_start);
+ init_completion(&dev->tx_stop);
+
+ init_MUTEX(&dev->servers_mutex);
+ for_each_server(i) {
+ dev->emerg_list[i].ip = 0;
+ dev->emerg_list[i].port = 0;
+ srv_info = &dev->servers[i];
+ memset(srv_info, 0, sizeof(struct srv_info));
+ init_completion(&srv_info->rx_start);
+ init_completion(&srv_info->rx_stop);
+ init_waitqueue_head(&srv_info->wq);
+ INIT_WORK(&srv_info->work, NULL); //, NULL);
+ /* Change in /<linuxheaders>/include/linux/workqueue.h */
+ srv_info->dev = dev;
+ }
+ dev->active_server = NULL;
+
+ dev->to_percent = TO_PERCENT;
+ dev->to_jiffies = TO_JIFFIES;
+
+ /* Prepare struct gendisk. */
+ disk = alloc_disk(1);
+ if (!disk) {
+ p("Could not alloc gendisk.\n");
+ goto out_nodisk;
+ }
+ dev->disk = disk;
+ disk->private_data = dev;
+ disk->major = dnbd2_major;
+ disk->first_minor = minor;
+ disk->fops = &dnbd2_fops;
+ sprintf(disk->disk_name, "vnbd%d", minor);
+ set_capacity(disk, 0);
+ set_disk_ro(disk, 1);
+
+ /* Prepare struct request_queue. */
+ queue = blk_init_queue(dnbd2_request, &dev->blk_lock);
+ if (!queue) {
+ p("Could not alloc request queue.\n");
+ goto out_noqueue;
+ }
+ /*
+ * Tell the block layer to give us only requests consisting of
+ * one segment of DNBD2_BLOCK_SIZE bytes.
+ */
+ blk_queue_max_sectors(queue, DNBD2_BLOCK_SIZE/SECTOR_SIZE);
+ blk_queue_max_segment_size(queue, DNBD2_BLOCK_SIZE);
+ blk_queue_hardsect_size(queue, DNBD2_BLOCK_SIZE);
+ blk_queue_max_phys_segments(queue, 1);
+ blk_queue_max_hw_segments(queue, 1);
+ disk->queue = queue;
+ add_disk(disk);
+
+ if (start_sysfs(dev))
+ goto out_nosysfs;
+
+ if (start_device(dev))
+ goto out_nostart;
+
+ return 0;
+
+ out_nostart:
+ stop_sysfs(dev);
+ out_nosysfs:
+ blk_cleanup_queue(queue);
+ out_noqueue:
+ del_gendisk(disk);
+ put_disk(disk);
+ out_nodisk:
+ return -ENOMEM;
+}
+
+
+void del_device(dnbd2_device_t *dev)
+{
+ int i;
+ for_each_server(i)
+ del_server(&dev->servers[i]);
+ stop_device(dev);
+ stop_sysfs(dev);
+ blk_cleanup_queue(dev->disk->queue);
+ del_gendisk(dev->disk);
+ put_disk(dev->disk);
+}
diff --git a/kernel/devices.h b/kernel/devices.h
new file mode 100644
index 0000000..d9cc46f
--- /dev/null
+++ b/kernel/devices.h
@@ -0,0 +1,14 @@
+/*
+ * kernel/devices.h
+ */
+
+
+/*
+ * Initialize @dev and inform the kernel.
+ */
+int add_device(dnbd2_device_t *dev, int minor);
+
+/*
+ * Destroy @dev and inform the kernel.
+ */
+void del_device(dnbd2_device_t *dev);
diff --git a/kernel/dnbd2.h b/kernel/dnbd2.h
new file mode 100644
index 0000000..cf693d9
--- /dev/null
+++ b/kernel/dnbd2.h
@@ -0,0 +1,118 @@
+/*
+ * kernel/dnbd2.h
+ */
+
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/blkdev.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <asm/semaphore.h>
+#include <net/sock.h>
+#include "../include/dnbd2.h"
+
+
+#define LOG KERN_NOTICE "dnbd2: "
+#define p(msg) printk(LOG msg);
+
+#define SECTOR_SIZE 512
+
+#define for_each_server(i) for (i=0 ; i<ALT_SERVERS_MAX ; i++)
+
+/* max number of requests the driver can handle at once. */
+#define POOL_SIZE 128
+
+/* precision of sector_t for pretty-printing */
+#ifdef CONFIG_LBD
+# define SECT_PRECISION "%llu"
+#else
+# define SECT_PRECISION "%lu"
+#endif
+
+
+typedef struct dnbd2_device dnbd2_device_t;
+
+/*
+ * We stick this structure to each request before enqueing it into the
+ * send-queue. It gives information on how to treat the request.
+ */
+struct req_info {
+ uint16_t time; /* enqueue time */
+ uint16_t cmd; /* command (CMD_XXX) */
+ int cnt; /* send count */
+ struct srv_info *dst; /* destination server */
+ struct srv_info *last_dst; /* last destination server */
+};
+
+/*
+ * Information about a server with which we communicate.
+ */
+struct srv_info {
+ dnbd2_device_t *dev;
+ struct kobject kobj;
+ struct work_struct work;
+
+ struct socket *sock; /* NULL iff server not configured */
+ uint32_t ip; /* network byte order */
+ uint32_t port; /* network byte order */
+ uint16_t min, max; /* min and max RTTs */
+ unsigned long srtt; /* SRTT (SRTT_SHIFT left-shifted) */
+
+ sector_t capacity; /* used when a CMD_GET_SIZE reply arrives */
+ wait_queue_head_t wq; /* used when a CMD_GET_SIZE reply arrives */
+
+ long rx_id; /* pid of dnbd2_rx_loop */
+ struct completion rx_start; /* dnbd2_rx_loop has started */
+ struct completion rx_stop; /* dnbd2_rx_loop has stopped */
+
+ unsigned long retries; /* number of requests retried */
+ unsigned long last_reply; /* time of last reply */
+};
+
+struct dnbd2_device {
+ struct work_struct work;
+ struct kobject kobj;
+ spinlock_t kmap_lock;
+ struct req_info info_pool[POOL_SIZE];
+
+ int running; /* device is running and capacity != 0 */
+ int emergency; /* device has lost contact to all servers */
+
+ uint16_t vid, rid; /* Volume-ID and Release-ID */
+
+ atomic_t refcnt; /* for open/release, see fops.c */
+ struct semaphore config_mutex; /* for open/release and sysfs-fops */
+
+ struct gendisk *disk; /* interface to the block layer */
+ spinlock_t blk_lock; /* queue-lock, shared with the block layer */
+
+ wait_queue_head_t sender_wq; /* wait-queue to notify dnbd2_tx_loop */
+ spinlock_t send_queue_lock;
+ struct list_head send_queue;
+
+ unsigned long pending_reqs; /* number of block requests pending */
+ spinlock_t pending_queue_lock;
+ struct list_head pending_queue;
+
+ struct timer_list requeue_timer; /* requeue timer */
+ struct timer_list to_timer; /* takeover timer */
+ struct timer_list hb_timer; /* heartbeat timer */
+ unsigned long hb_interval; /* HB_NORMAL_ or HB_EMERG_INTERVAL */
+
+ int tx_id; /* pid of dnbd2_tx_loop */
+ struct completion tx_start; /* dnbd2_tx_loop has started */
+ struct completion tx_stop; /* dnbd2_tx_loop has stopped */
+ int tx_signal; /* tells dnbd2_tx_loop to stop */
+
+ struct semaphore servers_mutex;
+ struct srv_info servers[ALT_SERVERS_MAX];
+ dnbd2_server_t emerg_list[ALT_SERVERS_MAX]; /* last servers known */
+ struct srv_info *active_server;
+
+ int to_percent; /* percent threshold for takeover (relativ) */
+ uint16_t to_jiffies; /* jiffies threshold for takeover (absolute) */
+};
diff --git a/kernel/fops.c b/kernel/fops.c
new file mode 100644
index 0000000..c098e13
--- /dev/null
+++ b/kernel/fops.c
@@ -0,0 +1,43 @@
+/*
+ * kernel/fops.c
+ */
+
+
+#include "dnbd2.h"
+#include "fops.h"
+
+
+struct block_device_operations dnbd2_fops = {
+ .owner = THIS_MODULE,
+ .open = dnbd2_open,
+ .release = dnbd2_release,
+};
+
+
+int dnbd2_open(struct inode *inode, struct file *file)
+{
+ dnbd2_device_t *dev = inode->i_bdev->bd_disk->private_data;
+ if (down_interruptible(&dev->config_mutex))
+ return -EBUSY;
+
+ /* FIXME: How do we put this add/start_device? */
+ if (set_blocksize(inode->i_bdev, DNBD2_BLOCK_SIZE)) {
+ up(&dev->config_mutex);
+ return -EBUSY;
+ }
+
+ atomic_inc(&dev->refcnt);
+ up(&dev->config_mutex);
+ return 0;
+}
+
+
+int dnbd2_release(struct inode *inode, struct file *file)
+{
+ dnbd2_device_t *dev = inode->i_bdev->bd_disk->private_data;
+ if (down_interruptible(&dev->config_mutex))
+ return -EBUSY;
+ atomic_dec(&dev->refcnt);
+ up(&dev->config_mutex);
+ return 0;
+}
diff --git a/kernel/fops.h b/kernel/fops.h
new file mode 100644
index 0000000..f5222d4
--- /dev/null
+++ b/kernel/fops.h
@@ -0,0 +1,9 @@
+/*
+ * kernel/fops.h
+ */
+
+
+extern struct block_device_operations dnbd2_fops;
+
+int dnbd2_open(struct inode *inode, struct file *file);
+int dnbd2_release(struct inode *inode, struct file *file);
diff --git a/kernel/misc.c b/kernel/misc.c
new file mode 100644
index 0000000..2756bef
--- /dev/null
+++ b/kernel/misc.c
@@ -0,0 +1,84 @@
+/*
+ * kernel/misc.c
+ */
+
+
+#include "dnbd2.h"
+#include "misc.h"
+
+
+uint16_t diff(uint16_t then)
+{
+ int diff = jiffies & 0xffff;
+ diff -= then;
+ if (diff < 0)
+ diff += 1 << 16;
+ return diff;
+}
+
+
+int sock_xmit(struct socket *sock, int send, void *buf, int size)
+{
+ int result;
+ struct msghdr msg;
+ struct kvec iov;
+ unsigned long flags;
+ sigset_t oldset;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ oldset = current->blocked;
+ sigfillset(&current->blocked);
+ sigdelsetmask(&current->blocked, sigmask(SIGKILL));
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+ do {
+ sock->sk->sk_allocation = GFP_NOIO;
+ iov.iov_base = buf;
+ iov.iov_len = size;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_NOSIGNAL;
+
+ if (send)
+ result = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ else
+ result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0);
+
+ if (signal_pending(current)) {
+ siginfo_t info;
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ dequeue_signal(current, &current->blocked, &info);
+ spin_unlock_irqrestore(&current->sighand->siglock,
+ flags);
+ result = -EINTR;
+ break;
+ }
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE;
+ break;
+ }
+ size -= result;
+ buf += result;
+ } while (size > 0);
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->blocked = oldset;
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+ return result;
+}
+
+
+char *inet_ntoa(uint32_t ip)
+{
+ static char buf[sizeof "aaa.bbb.ccc.ddd"];
+ unsigned char *nums = (unsigned char *)&ip;
+ sprintf(buf, "%u.%u.%u.%u", nums[0], nums[1], nums[2], nums[3]);
+ return buf;
+}
diff --git a/kernel/misc.h b/kernel/misc.h
new file mode 100644
index 0000000..8f7f511
--- /dev/null
+++ b/kernel/misc.h
@@ -0,0 +1,23 @@
+/*
+ * kernel/misc.h - Usefull stuff that doesn't fit anywhere else.
+ */
+
+
+#define SEND 1
+#define RECV 0
+
+
+/*
+ * Jiffies between now and @then.
+ */
+uint16_t diff(uint16_t then);
+
+/*
+ * Send or receive packet (from nbd.c)
+ */
+int sock_xmit(struct socket *sock, int send, void *buf, int size);
+
+/*
+ * Pretty printing of IPs.
+ */
+char *inet_ntoa(uint32_t ip);
diff --git a/kernel/scp b/kernel/scp
new file mode 100644
index 0000000..c14b937
--- /dev/null
+++ b/kernel/scp
@@ -0,0 +1 @@
+scp /root/fs/ba-vito/tags/abgabe/kernel/dnbd2.ko 132.230.4.71:/var/opt/openslx/stage1/suse-10.2/lib/modules/2.6.18.8-0.3-default/kernel/drivers/block/
diff --git a/kernel/servers.c b/kernel/servers.c
new file mode 100644
index 0000000..7d5c229
--- /dev/null
+++ b/kernel/servers.c
@@ -0,0 +1,355 @@
+/*
+ * kernel/servers.c
+ */
+
+
+#include "dnbd2.h"
+#include "servers.h"
+#include "core.h"
+#include "misc.h"
+
+
+/* Only use RTTs smaller than RTT_MAX for statistics. */
+#define RTT_MAX HZ/4
+
+
+/*
+ * Find the next configured server which is not active.
+ */
+struct srv_info *next_server(dnbd2_device_t *dev)
+{
+ int i;
+ struct srv_info *srv_info, *next_srv = NULL;
+
+ for_each_server(i) {
+ srv_info = &dev->servers[i];
+ if (srv_info->sock && srv_info != dev->active_server) {
+ next_srv = srv_info;
+ break;
+ }
+ }
+ return next_srv;
+}
+
+
+/*
+ * Find the server with smallest SRTT.
+ */
+struct srv_info *fastest_server(uint16_t *srtt, dnbd2_device_t *dev)
+{
+ int i;
+ struct srv_info *srv_info, *alt_srv = NULL;
+ unsigned long min = RTT_MAX << SRTT_SHIFT;
+
+ for_each_server(i) {
+ srv_info = &dev->servers[i];
+ if (srv_info->sock &&
+ srv_info->srtt < min &&
+ srv_info->min < RTT_MAX) {
+ min = srv_info->srtt;
+ alt_srv = srv_info;
+ }
+ }
+
+ *srtt = min >> SRTT_SHIFT;
+ return alt_srv;
+}
+
+
+/*
+ * This function can be enqueued in a workqueue. It removes srv_info
+ * from the list of servers.
+ */
+void del_server_work(struct work_struct *work)
+{
+ struct srv_info *srv_info = container_of(work, struct srv_info, work);
+ dnbd2_device_t *dev = srv_info->dev;
+
+ down(&dev->servers_mutex);
+ if (!srv_info->sock || dev->active_server == srv_info) {
+ up(&dev->servers_mutex);
+ return;
+ }
+ del_server(srv_info);
+ up(&dev->servers_mutex);
+ return;
+}
+
+
+/*
+ * This function can be enqueued in a workqueue. Read comment on
+ * schedule_activate_fastest in servers.h
+ */
+void activate_fastest_work(struct work_struct *work)
+{
+ dnbd2_device_t *dev = container_of(work, dnbd2_device_t, work);
+ struct srv_info *alt_srv, *next_srv;
+ uint16_t min, srtt;
+ int newsrv = 0;
+ long unsigned delta;
+
+ down(&dev->servers_mutex);
+ if (!dev->active_server)
+ goto out;
+
+ /* Detect if dev->active_server is stalled. */
+ delta = (long)jiffies - (long)dev->active_server->last_reply;
+ if (dev->pending_reqs && delta > TIMEOUT_STALLED) {
+ next_srv = next_server(dev);
+ if (!next_srv) {
+ start_emergency(dev);
+ goto out;
+ }
+ alt_srv = dev->active_server;
+ dev->active_server = next_srv;
+ del_server(alt_srv);
+ }
+
+ /* Switch to another server if requirements met. */
+ srtt = dev->active_server->srtt >> SRTT_SHIFT;
+ alt_srv = fastest_server(&min, dev);
+ if (!alt_srv || alt_srv == dev->active_server)
+ goto out;
+ if (dev->to_percent) {
+ if (100 * (srtt - min) < dev->to_percent * srtt)
+ goto out;
+ newsrv = 1;
+ }
+ if (dev->to_jiffies) {
+ if (min + dev->to_jiffies > srtt)
+ goto out;
+ newsrv = 1;
+ }
+ if (newsrv)
+ dev->active_server = alt_srv;
+
+ out:
+ up(&dev->servers_mutex);
+}
+
+
+int start_rx_loop(struct srv_info *srv_info)
+{
+ srv_info->rx_id = kernel_thread(dnbd2_rx_loop, srv_info, CLONE_KERNEL);
+ if (srv_info->rx_id < 0) {
+ srv_info->rx_id = 0;
+ return -1;
+ }
+ wait_for_completion(&srv_info->rx_start);
+ return 0;
+}
+
+
+void stop_rx_loop(struct srv_info *srv_info)
+{
+ if (!srv_info->rx_id)
+ return;
+ kill_proc(srv_info->rx_id, SIGKILL, 1);
+ wait_for_completion(&srv_info->rx_stop);
+ srv_info->rx_id = 0;
+}
+
+
+/******************************************************/
+/* For the next functions see servers.h for comments. */
+/******************************************************/
+
+
+int add_server(dnbd2_server_t server, struct srv_info *srv_info)
+{
+ struct sockaddr_in addr;
+ struct socket *sock;
+
+ if (!server.ip || !server.port)
+ return -1;
+
+ if (sock_create(PF_INET,SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+ p("Could not create socket.\n");
+ return -1;
+ }
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = server.ip;
+ addr.sin_port = server.port;
+ if (sock->ops->connect(sock,
+ (struct sockaddr *)&addr,
+ sizeof(addr), 0)) {
+ p("Could not connect to socket.\n");
+ goto out;
+ }
+
+ srv_info->sock = sock;
+ srv_info->ip = server.ip;
+ srv_info->port = server.port;
+ srv_info->min = RTT_MAX;
+
+ if (start_rx_loop(srv_info)) {
+ p("Could not start rx_loop\n");
+ goto out;
+ }
+
+ return 0;
+ out:
+ del_server(srv_info);
+ return -1;
+}
+
+
+void del_server(struct srv_info *srv_info)
+{
+ stop_rx_loop(srv_info);
+ if (srv_info->sock)
+ sock_release(srv_info->sock);
+ srv_info->sock = NULL;
+ srv_info->ip = 0;
+ srv_info->port = 0;
+ srv_info->srtt = 0;
+ srv_info->min = 0;
+ srv_info->max = 0;
+ srv_info->srtt = 0;
+ srv_info->retries = 0;
+ srv_info->last_reply = 0;
+}
+
+
+void try_add_server(dnbd2_server_t server, dnbd2_device_t *dev)
+{
+ int i;
+
+ for_each_server(i)
+ if (dev->servers[i].sock &&
+ dev->servers[i].ip == server.ip &&
+ dev->servers[i].port == server.port)
+ return;
+
+ for_each_server(i)
+ if (!dev->servers[i].sock)
+ break;
+
+ if (i == ALT_SERVERS_MAX)
+ return;
+
+ add_server(server, &dev->servers[i]);
+}
+
+
+sector_t srv_get_capacity(struct srv_info *srv_info)
+{
+ struct request *req;
+ struct req_info *info;
+ dnbd2_device_t *dev = srv_info->dev;
+ sector_t capacity;
+ int ret;
+
+ info = kmalloc(sizeof(struct req_info), GFP_KERNEL);
+ if (!info)
+ return 0;
+ req = kmalloc(sizeof(struct request), GFP_KERNEL);
+ if (!req) {
+ kfree(info);
+ return 0;
+ }
+ info->cmd = CMD_GET_SIZE;
+ info->cnt = 0;
+ info->time = jiffies;
+ info->dst = srv_info;
+ info->last_dst = srv_info;
+ req->special = info;
+ req->sector = 0;
+ INIT_LIST_HEAD(&req->queuelist);
+
+ /* Enqueue the request for sending. */
+ spin_lock_bh(&dev->send_queue_lock);
+ list_add_tail(&req->queuelist, &dev->send_queue);
+ spin_unlock_bh(&dev->send_queue_lock);
+
+ /* Wake up sender function. */
+ wake_up_interruptible(&dev->sender_wq);
+
+ /* If we don't get an answer in 4 seconds we give up. */
+ ret = wait_event_timeout(srv_info->wq, srv_info->capacity, 4*HZ);
+ capacity = srv_info->capacity;
+ srv_info->capacity = 0;
+
+ return ret ? capacity : 0;
+}
+
+
+void update_rtt(uint16_t rtt, struct srv_info *srv_info, uint16_t cmd)
+{
+ if (rtt == 0)
+ rtt = 1;
+ if (rtt > RTT_MAX)
+ return;
+
+ if (rtt < srv_info->min)
+ srv_info->min = rtt;
+ if (rtt > srv_info->max)
+ srv_info->max = rtt;
+
+ if (!srv_info->srtt) {
+ srv_info->srtt = rtt << SRTT_SHIFT;
+ return;
+ }
+
+ switch (cmd) {
+ case CMD_GET_BLOCK:
+ srv_info->srtt = SRTT_BETA * srv_info->srtt;
+ srv_info->srtt += (SRTT_BETA_COMP * ((int)rtt)) << SRTT_SHIFT;
+ srv_info->srtt /= SRTT_BETA_BASE;
+ break;
+
+ case CMD_GET_SIZE:
+ case CMD_GET_SERVERS:
+ srv_info->srtt = rtt << SRTT_SHIFT;
+ break;
+ }
+}
+
+
+void enqueue_hb(struct srv_info *srv_info)
+{
+ struct request *req;
+ struct req_info *info;
+ dnbd2_device_t *dev = srv_info->dev;
+
+ if (!srv_info->sock)
+ return;
+ info = kmalloc(sizeof(struct req_info), GFP_ATOMIC);
+ if (!info)
+ return;
+ req = kmalloc(sizeof(struct request), GFP_ATOMIC);
+ if (!req) {
+ kfree(info);
+ return;
+ }
+ info->cmd = CMD_GET_SERVERS;
+ info->cnt = 0;
+ info->time = jiffies;
+ info->dst = srv_info;
+ info->dst = srv_info;
+ info->last_dst = srv_info;
+ req->special = info;
+ req->sector = 0;
+ INIT_LIST_HEAD(&req->queuelist);
+
+ /* Enqueue the request for sending. */
+ spin_lock_bh(&dev->send_queue_lock);
+ list_add_tail(&req->queuelist, &dev->send_queue);
+ spin_unlock_bh(&dev->send_queue_lock);
+}
+
+
+void schedule_del_server(struct srv_info *srv_info)
+{
+ PREPARE_WORK(&srv_info->work, del_server_work); //, srv_info);
+ /* Change in /<linuxheaders>/include/linux/workqueue.h */
+ schedule_work(&srv_info->work);
+}
+
+
+void schedule_activate_fastest(dnbd2_device_t *dev)
+{
+ PREPARE_WORK(&dev->work, activate_fastest_work); //, dev);
+ /* Change in /<linuxheaders>/include/linux/workqueue.h */
+ schedule_work(&dev->work);
+}
diff --git a/kernel/servers.h b/kernel/servers.h
new file mode 100644
index 0000000..83d9b46
--- /dev/null
+++ b/kernel/servers.h
@@ -0,0 +1,84 @@
+/*
+ * kernel/servers.h
+ */
+
+
+#define SRTT_BETA 990
+#define SRTT_BETA_COMP 10
+#define SRTT_BETA_BASE 1000
+
+/*
+ * Though presented as a 16-bit number in sysfs, the SRTT is stored in
+ * an unsigned long, SRTT_SHIFT bits left-shifted. This helps
+ * preserve the precision we need tou update the SRTTs. If we stored
+ * SRTT in an integer, it would only be changed by RTTS *very* far
+ * away from it.
+ */
+#define SRTT_SHIFT 10
+
+/*
+ * After this interval without answering
+ * requests a server is considered down.
+ */
+#define TIMEOUT_STALLED 3*HZ
+
+
+/*
+ * Configure @srv_info based on @server: Create a socket, connect it
+ * and start a dnbd2_rx_loop for it.
+ */
+int add_server(dnbd2_server_t server, struct srv_info *srv_info);
+
+/*
+ * Reset @srv_info: Stop dnbd2_rx_loop, close the socket and zero all
+ * variables.
+ */
+void del_server(struct srv_info *srv_info);
+
+/*
+ * Look for an unused server-slot in @dev and configure it according
+ * to @server (using add_server). Nothing happens if the list of
+ * servers if full or if there is already a server in the list with
+ * the same IP and port as @server.
+ */
+void try_add_server(dnbd2_server_t server, dnbd2_device_t *dev);
+
+/*
+ * This function enqueues into the send-queue a request for the device
+ * capacity (CMD_GET_SIZE). This type of request is retransmitted
+ * every second. If an answer doesn't arrive within 4 seconds it
+ * assumes that the server is down.
+ */
+sector_t srv_get_capacity(struct srv_info *srv_info);
+
+/*
+ * Update min, max and srtt in @srv_info.
+ */
+void update_rtt(uint16_t rtt, struct srv_info *srv_info, uint16_t cmd);
+
+/*
+ * Enqueue a heartbeat request (CMD_GET_SERVERS) for @srv_info into
+ * the send-queue.
+ */
+void enqueue_hb(struct srv_info *srv_info);
+
+/*
+ * Schedule the removal of @srv_info to be processed by the global
+ * kernel workqueue as soon as possible. This function allows to
+ * trigger the removal of servers from within timers, in which
+ * sleeping is not allowed.
+ */
+void schedule_del_server(struct srv_info *srv_info);
+
+/*
+ * Schedule the search for a faster server:
+ * 1. Detect if the active server is stalled (down). In this case
+ * switch to another server, if possible, and remove the old one
+ * from the list. If no servers are available to switch to
+ * start the emergency mode.
+ * 2. If the emergency mode was not started go through the list of
+ * servers and pick the one with the smallest SRTT. If this
+ * server meets our requirements (to_jiffies and to_percent)
+ * switch to it.
+ */
+void schedule_activate_fastest(dnbd2_device_t *dev);
diff --git a/kernel/sysfs.c b/kernel/sysfs.c
new file mode 100644
index 0000000..802e9d2
--- /dev/null
+++ b/kernel/sysfs.c
@@ -0,0 +1,460 @@
+/*
+ * kernel/sysfs.c
+ */
+
+
+#include "dnbd2.h"
+#include "misc.h"
+#include "sysfs.h"
+#include "devices.h"
+#include "servers.h"
+
+
+#define RW 0644
+#define RO 0444
+
+#define kobj_to_dev(kp) container_of(kp, dnbd2_device_t, kobj)
+#define kobj_to_srv(kp) container_of(kp, struct srv_info, kobj)
+
+#define attr_to_srvattr(ap) container_of(ap, struct server_attr, attr)
+#define attr_to_devattr(ap) container_of(ap, struct device_attr, attr)
+
+#define DEV_ATTR_RW(_name) \
+static struct device_attr _name = \
+__ATTR(_name, RW, show_##_name, store_##_name)
+
+#define SRV_ATTR_RW(_name) \
+static struct server_attr _name = \
+__ATTR(_name, RW, show_##_name, store_##_name)
+
+#define DEV_ATTR_RO(_name) \
+static struct device_attr _name = \
+__ATTR(_name, RO, show_##_name, NULL)
+
+#define SRV_ATTR_RO(_name) \
+static struct server_attr _name = \
+__ATTR(_name, RO, show_##_name, NULL)
+
+
+struct device_attr {
+ struct attribute attr;
+ ssize_t (*show)(char *, dnbd2_device_t *);
+ ssize_t (*store)(const char *, size_t, dnbd2_device_t *);
+};
+
+struct server_attr {
+ struct attribute attr;
+ ssize_t (*show)(char *, struct srv_info *);
+ ssize_t (*store)(const char *, size_t, struct srv_info *);
+};
+
+
+void release(struct kobject *kobj) {}
+
+ssize_t show_running(char *, dnbd2_device_t *);
+ssize_t store_running(const char *, size_t, dnbd2_device_t *);
+ssize_t show_to_percent(char *, dnbd2_device_t *);
+ssize_t store_to_percent(const char *, size_t, dnbd2_device_t *);
+ssize_t show_to_jiffies(char *, dnbd2_device_t *);
+ssize_t store_to_jiffies(const char *, size_t, dnbd2_device_t *);
+ssize_t show_vid(char *, dnbd2_device_t *);
+ssize_t store_vid(const char *, size_t, dnbd2_device_t *);
+ssize_t show_rid(char *, dnbd2_device_t *);
+ssize_t store_rid(const char *, size_t, dnbd2_device_t *);
+ssize_t show_pending_reqs(char *, dnbd2_device_t *);
+ssize_t show_emergency(char *, dnbd2_device_t *);
+
+ssize_t show_sock(char *, struct srv_info *);
+ssize_t store_sock(const char *, size_t, struct srv_info *);
+ssize_t show_active(char *, struct srv_info *);
+ssize_t store_active(const char *, size_t, struct srv_info *);
+ssize_t show_rtt(char *, struct srv_info *);
+ssize_t show_retries(char *, struct srv_info *);
+ssize_t show_last_reply(char *, struct srv_info *);
+
+
+/* device attributes */
+DEV_ATTR_RW(running);
+DEV_ATTR_RW(to_percent);
+DEV_ATTR_RW(to_jiffies);
+DEV_ATTR_RW(vid);
+DEV_ATTR_RW(rid);
+DEV_ATTR_RO(pending_reqs);
+DEV_ATTR_RO(emergency);
+
+static struct attribute *device_attrs[] = {
+ &running.attr,
+ &to_percent.attr,
+ &to_jiffies.attr,
+ &vid.attr,
+ &rid.attr,
+ &pending_reqs.attr,
+ &emergency.attr,
+ NULL,
+};
+
+/* server attributes */
+SRV_ATTR_RW(sock);
+SRV_ATTR_RW(active);
+SRV_ATTR_RO(rtt);
+SRV_ATTR_RO(retries);
+SRV_ATTR_RO(last_reply);
+
+struct attribute *server_attrs[] = {
+ &sock.attr,
+ &active.attr,
+ &rtt.attr,
+ &retries.attr,
+ &last_reply.attr,
+ NULL,
+};
+
+
+/*
+ * Wrapper functions for show/store, one pair for device attributes
+ * and one pair for server attributes. Each attribute has its own
+ * specific function(s).
+ */
+ssize_t device_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct device_attr *device_attr = attr_to_devattr(attr);
+ dnbd2_device_t *dev = kobj_to_dev(kobj);
+ return device_attr->show(buf, dev);
+}
+
+ssize_t device_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+ struct device_attr *device_attr = attr_to_devattr(attr);
+ dnbd2_device_t *dev = kobj_to_dev(kobj);
+ down(&dev->config_mutex);
+ ret = device_attr->store(buf, count, dev);
+ up(&dev->config_mutex);
+ return ret;
+}
+
+ssize_t server_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct server_attr *server_attr = attr_to_srvattr(attr);
+ struct srv_info *srv_info = kobj_to_srv(kobj);
+ return server_attr->show(buf, srv_info);
+}
+
+ssize_t server_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+ struct server_attr *server_attr = attr_to_srvattr(attr);
+ struct srv_info *srv_info = kobj_to_srv(kobj);
+ down(&srv_info->dev->config_mutex);
+ ret = server_attr->store(buf, count, srv_info);
+ up(&srv_info->dev->config_mutex);
+ return ret;
+}
+
+
+struct sysfs_ops device_ops = {
+ .show = device_show,
+ .store = device_store,
+};
+
+struct sysfs_ops server_ops = {
+ .show = server_show,
+ .store = server_store,
+};
+
+struct kobj_type device_ktype = {
+ .default_attrs = device_attrs,
+ .sysfs_ops = &device_ops,
+ .release = release,
+};
+
+struct kobj_type server_ktype = {
+ .default_attrs = server_attrs,
+ .sysfs_ops = &server_ops,
+ .release = release,
+};
+
+
+/*
+ * RW device attribute functions.
+ */
+ssize_t show_running(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%d\n", dev->running);
+}
+
+ssize_t store_running(const char *buf, size_t count, dnbd2_device_t *dev)
+{
+ sector_t capacity = 0;
+ struct srv_info *srv_info;
+ int i, running, ret = sscanf(buf, "%d", &running);
+
+ if (ret != 1)
+ return -EINVAL;
+
+ if (!dev->running) {
+ if (running != 1 || !dev->vid || !dev->rid)
+ return -EINVAL;
+ if (atomic_read(&dev->refcnt) > 0)
+ return -EBUSY;
+
+ for_each_server(i) {
+ srv_info = &dev->servers[i];
+ if (!srv_info->sock)
+ continue;
+ capacity = srv_get_capacity(srv_info);
+ if (capacity) {
+ set_capacity(dev->disk, capacity);
+ dev->active_server = srv_info;
+ break;
+ }
+ }
+ if (!capacity) {
+ p("Could not contact any servers.\n");
+ return -EHOSTUNREACH;
+ }
+ for_each_server(i) {
+ dev->emerg_list[i].ip = dev->servers[i].ip;
+ dev->emerg_list[i].port = dev->servers[i].port;
+ }
+ printk(LOG "Device capacity = " SECT_PRECISION " KB\n",
+ capacity * SECTOR_SIZE / 1024);
+ dev->running = 1;
+ __module_get(THIS_MODULE);
+ } else {
+ if (running != 0)
+ return -EINVAL;
+ if (atomic_read(&dev->refcnt) > 0)
+ return -EBUSY;
+
+ /* Stop device. */
+ dev->running = 0;
+ set_capacity(dev->disk, 0);
+ dev->active_server = NULL;
+ module_put(THIS_MODULE);
+ }
+
+ return count;
+}
+
+ssize_t show_vid(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%hu\n", dev->vid);
+}
+
+ssize_t store_vid(const char *buf, size_t count, dnbd2_device_t *dev)
+{
+ uint16_t vid;
+
+ if (dev->running)
+ return -EBUSY;
+ if (sscanf(buf, "%hu", &vid) != 1)
+ return -EINVAL;
+
+ dev->vid = vid;
+ return count;
+}
+
+ssize_t show_rid(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%hu\n", dev->rid);
+}
+
+ssize_t store_rid(const char *buf, size_t count, dnbd2_device_t *dev)
+{
+ uint16_t rid;
+
+ if (dev->running)
+ return -EBUSY;
+ if (sscanf(buf, "%hu", &rid) != 1)
+ return -EINVAL;
+
+ dev->rid = rid;
+ return count;
+}
+
+ssize_t show_to_percent(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%d\n", dev->to_percent);
+}
+
+ssize_t store_to_percent(const char *buf, size_t count, dnbd2_device_t *dev)
+{
+ if (sscanf(buf, "%d", &dev->to_percent) == 1)
+ return count;
+ return -EINVAL;
+}
+
+ssize_t show_to_jiffies(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%hu\n", dev->to_jiffies);
+}
+
+ssize_t store_to_jiffies(const char *buf, size_t count, dnbd2_device_t *dev)
+{
+ if (sscanf(buf, "%hu", &dev->to_jiffies) == 1)
+ return count;
+ return -EINVAL;
+}
+
+
+/*
+ * RO device attribute functions.
+ */
+ssize_t show_pending_reqs(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%lu\n", dev->pending_reqs);
+}
+
+ssize_t show_emergency(char *buf, dnbd2_device_t *dev)
+{
+ return sprintf(buf, "%d\n", dev->emergency);
+}
+
+
+/*
+ * RW server attribute functions.
+ */
+ssize_t show_sock(char *buf, struct srv_info *srv_info)
+{
+ return sprintf(buf, "%s %hu\n",
+ inet_ntoa(srv_info->ip),
+ ntohs(srv_info->port));
+}
+
+ssize_t store_sock(const char *buf, size_t count, struct srv_info *srv_info)
+{
+ char ip[sizeof "aaa.bbb.ccc.ddd 12345"];
+ uint16_t port;
+ dnbd2_server_t server;
+ dnbd2_device_t *dev = srv_info->dev;
+
+ if (count > sizeof "aaa.bbb.ccc.ddd 12345")
+ return -EINVAL;
+ if (sscanf(buf, "%s %hu", ip, &port) != 2)
+ return -EINVAL;
+ server.ip = in_aton(ip);
+ server.port = htons(port);
+
+ down(&dev->servers_mutex);
+ if (dev->running && dev->active_server == srv_info) {
+ up(&dev->servers_mutex);
+ return -EBUSY;
+ }
+ if (srv_info->sock)
+ del_server(srv_info);
+ if (server.ip && server.port && add_server(server, srv_info)) {
+ up(&dev->servers_mutex);
+ return -EINVAL;
+ }
+ up(&dev->servers_mutex);
+
+ return count;
+}
+
+ssize_t show_active(char *buf, struct srv_info *srv_info)
+{
+ if (!srv_info->sock)
+ return sprintf(buf, "0\n");
+ if (srv_info->dev->active_server == srv_info)
+ return sprintf(buf, "1\n");
+ return sprintf(buf, "0\n");
+}
+
+ssize_t store_active(const char *buf, size_t count, struct srv_info *srv_info)
+{
+ dnbd2_device_t *dev = srv_info->dev;
+ int active;
+
+ if (sscanf(buf, "%d", &active) != 1 || active != 1)
+ return -EINVAL;
+
+ down(&dev->servers_mutex);
+ if (!dev->running) {
+ up(&dev->servers_mutex);
+ return -EINVAL;
+ }
+ if (!srv_info->sock || dev->active_server == srv_info) {
+ up(&dev->servers_mutex);
+ return -EINVAL;
+ }
+ dev->active_server = srv_info;
+ up(&dev->servers_mutex);
+
+ return count;
+}
+
+
+/*
+ * RO server attribute functions.
+ */
+ssize_t show_rtt(char *buf, struct srv_info *srv_info)
+{
+ return sprintf(buf, "%hu %lu %hu\n",
+ srv_info->min,
+ srv_info->srtt >> SRTT_SHIFT,
+ srv_info->max);
+}
+
+ssize_t show_retries(char *buf, struct srv_info *srv_info)
+{
+ return sprintf(buf, "%lu\n", srv_info->retries);
+}
+
+ssize_t show_last_reply(char *buf, struct srv_info *srv_info)
+{
+ return sprintf(buf, "%lu\n", srv_info->last_reply);
+}
+
+
+/* Helper for start_sysfs. */
+int setup_kobj(struct kobject *kobj, char *name, struct kobject *parent,
+ struct kobj_type *ktype)
+{
+ memset(kobj, 0, sizeof(struct kobject));
+ kobj->parent = parent;
+ kobj->ktype = ktype;
+ if (kobject_set_name(kobj, name))
+ return -1;
+ if (kobject_register(kobj))
+ return -1;
+ return 0;
+}
+
+
+/*
+ * Exported functions - see sysfs.h
+ */
+int start_sysfs(dnbd2_device_t *dev)
+{
+ int i;
+ char name[] = "server99";
+
+ if (setup_kobj(&dev->kobj, "config", &dev->disk->kobj, &device_ktype))
+ return -1;
+
+ for_each_server(i) {
+ sprintf(name, "server%d", i);
+ if (setup_kobj(&dev->servers[i].kobj, name,
+ &dev->disk->kobj, &server_ktype))
+ goto out;
+ }
+ return 0;
+
+ out:
+ while (i--)
+ kobject_unregister(&dev->servers[i].kobj);
+ return -1;
+}
+
+void stop_sysfs(dnbd2_device_t *dev)
+{
+ int i;
+ for_each_server(i)
+ kobject_unregister(&dev->servers[i].kobj);
+ kobject_unregister(&dev->kobj);
+}
diff --git a/kernel/sysfs.h b/kernel/sysfs.h
new file mode 100644
index 0000000..a18338e
--- /dev/null
+++ b/kernel/sysfs.h
@@ -0,0 +1,24 @@
+/*
+ * kernel/sysfs.h
+ */
+
+
+/*
+ * Setup the sysfs-interface for @dev:
+ *
+ * M = minor number
+ * N = ALT_SERVERS_MAX - 1
+ *
+ * /sys/block/vnbdM/config (from @dev->kobj)
+ * /sys/block/vnbdM/server0 (from @dev->servers[0].kobj)
+ * .
+ * .
+ * /sys/block/vnbdM/serverN (from @dev->servers[N].kobj)
+ *
+ */
+int start_sysfs(dnbd2_device_t *dev);
+
+/*
+ * Destroy the sysfs-interface for @dev.
+ */
+void stop_sysfs(dnbd2_device_t *dev);