diff options
author | Lars Müller | 2008-03-01 19:30:38 +0100 |
---|---|---|
committer | Lars Müller | 2008-03-01 19:30:38 +0100 |
commit | 93b9103f7383d400616d222606c294e07b16e1aa (patch) | |
tree | 611a39f7bc1d1dd5a4335157ef95c101d64dddc8 /kernel | |
download | dnbd2-93b9103f7383d400616d222606c294e07b16e1aa.tar.gz dnbd2-93b9103f7383d400616d222606c294e07b16e1aa.tar.xz dnbd2-93b9103f7383d400616d222606c294e07b16e1aa.zip |
Import dnbd* from the former openslx-contrib repo as of revision 92.
openslx-contrib is currently read only and will get removed in some
days.
git-svn-id: http://svn.openslx.org/svn/openslx/contrib/dnbd2/trunk@1592 95ad53e4-c205-0410-b2fa-d234c58c8868
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 18 | ||||
-rw-r--r-- | kernel/core.c | 510 | ||||
-rw-r--r-- | kernel/core.h | 102 | ||||
-rw-r--r-- | kernel/devices.c | 220 | ||||
-rw-r--r-- | kernel/devices.h | 14 | ||||
-rw-r--r-- | kernel/dnbd2.h | 118 | ||||
-rw-r--r-- | kernel/fops.c | 43 | ||||
-rw-r--r-- | kernel/fops.h | 9 | ||||
-rw-r--r-- | kernel/misc.c | 84 | ||||
-rw-r--r-- | kernel/misc.h | 23 | ||||
-rw-r--r-- | kernel/scp | 1 | ||||
-rw-r--r-- | kernel/servers.c | 355 | ||||
-rw-r--r-- | kernel/servers.h | 84 | ||||
-rw-r--r-- | kernel/sysfs.c | 460 | ||||
-rw-r--r-- | kernel/sysfs.h | 24 |
15 files changed, 2065 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 0000000..8c01ed3 --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,18 @@ +# +# kernel/Makefile +# + +KDIR := /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +obj-m := dnbd2.o +dnbd2-objs := fops.o sysfs.o servers.o devices.o misc.o core.o + +all: + $(MAKE) -C $(KDIR) M=$(PWD) modules + +install: + $(MAKE) -C $(KDIR) M=$(PWD) modules_install + +clean: + rm -rf *.o *.ko *~ *.symvers dnbd2.mod.c .*o.cmd .tmp_versions diff --git a/kernel/core.c b/kernel/core.c new file mode 100644 index 0000000..6bee380 --- /dev/null +++ b/kernel/core.c @@ -0,0 +1,510 @@ +/* + * kernel/core.c - The principal component of the driver. + * See core.h for comments on each function. + */ + + +#include "dnbd2.h" +#include "core.h" +#include "devices.h" +#include "misc.h" +#include "fops.h" +#include "servers.h" + + +int dnbd2_major; +static dnbd2_device_t dev[DNBD2_DEVICES]; + + +void dnbd2_request(request_queue_t *q) +{ + int i; + struct request *req; + struct req_info *info; + dnbd2_device_t *dev; + + while ((req = elv_next_request(q)) != NULL) { + /* Discard invalid block requests. */ + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + if (rq_data_dir(req) != READ) { + end_request(req, 0); + continue; + } + + /* Prepare request. */ + dev = req->rq_disk->private_data; + for (i=0 ; i<POOL_SIZE ; i++) { + if (dev->info_pool[i].cnt == -1) + break; + } + if (i == POOL_SIZE) + continue; + dev->pending_reqs++; + info = &dev->info_pool[i]; + info->time = jiffies; + info->cnt = 0; + info->cmd = CMD_GET_BLOCK; + info->dst = dev->active_server; + req->special = info; + blkdev_dequeue_request(req); + + /* Enqueue the request for sending. */ + spin_lock_bh(&dev->send_queue_lock); + list_add_tail(&req->queuelist, &dev->send_queue); + spin_unlock_bh(&dev->send_queue_lock); + + /* Wakeup sender function. */ + wake_up_interruptible(&dev->sender_wq); + } +} + + +int dnbd2_tx_loop(void *data) +{ + struct request *req; + dnbd2_device_t *dev = data; + struct req_info *info; + + daemonize("dnbd2_tx_loop"); + allow_signal(SIGKILL); + complete(&dev->tx_start); + + while (1) { + wait_event_interruptible(dev->sender_wq, + !list_empty(&dev->send_queue)); + + /* + * If wait_event_interruptible is interrupted while + * the queue is not empty it retunrs 0, not + * -ERESTARTSYS. Therefore we need another flag. + */ + if (dev->tx_signal) + break; + + /* Dequeue request from the send-queue. */ + spin_lock_bh(&dev->send_queue_lock); + req = blkdev_entry_to_request(dev->send_queue.next); + list_del_init(&req->queuelist); + spin_unlock_bh(&dev->send_queue_lock); + + /* + * This is a good place to do some sanity checks + * because the request is neither in the send- nor in + * the pending-queue. + */ + down(&dev->servers_mutex); + info = req->special; + switch (info->cmd) { + case CMD_GET_BLOCK: + /* + * If dnbd2_request is called w/o first + * opening the device or with the device's + * capacity set to zero we should land here. + */ + if (!dev->running) { + dnbd2_end_request(req, 0); + up(&dev->servers_mutex); + continue; + } + /* + * Send block requests to dev->active_server, + * which is always set while running. + */ + info->last_dst = info->dst; + info->dst = dev->active_server; + break; + + case CMD_GET_SERVERS: + case CMD_GET_SIZE: + /* + * Discard the request if it's destination is + * no longer available. + */ + if (!info->dst->sock) { + dnbd2_end_request(req, 1); + up(&dev->servers_mutex); + continue; + } + break; + } + + /* Enqueue request in the pending-queue. */ + spin_lock_bh(&dev->pending_queue_lock); + list_add_tail(&req->queuelist, &dev->pending_queue); + spin_unlock_bh(&dev->pending_queue_lock); + + /* Send request. */ + dnbd2_send_request(req, dev); + up(&dev->servers_mutex); + } + + complete_and_exit(&dev->tx_stop, 0); +} + + +void dnbd2_send_request(struct request *req, dnbd2_device_t *dev) { + dnbd2_data_request_t dnbd2_req; + struct req_info *info = req->special; + + dnbd2_req.cmd = cpu_to_be16(info->cmd); + dnbd2_req.time = cpu_to_be16(info->time); + dnbd2_req.vid = cpu_to_be16(dev->vid); + dnbd2_req.rid = cpu_to_be16(dev->rid); + dnbd2_req.num = cpu_to_be64(req->sector * SECTOR_SIZE); + + /* + * If sock_xmit fails the request eventually gets requeued. + * That's why we don't check its return value. + */ + sock_xmit(info->dst->sock, SEND, &dnbd2_req, sizeof(dnbd2_req)); + info->cnt++; + if (info->dst != info->last_dst) { + info->cnt = 1; + } else if (info->cnt > 1) { + info->dst->retries++; + } +} + + +int dnbd2_rx_loop(void *data) +{ + struct srv_info *srv_info = data; + dnbd2_device_t *dev = srv_info->dev; + dnbd2_data_reply_t dnbd2_rep; + struct request *req; + uint64_t num; + uint16_t cmd, time; + char *buffer; + int ret, i; + + daemonize("dnbd2_rx_loop"); + allow_signal(SIGKILL); + complete(&srv_info->rx_start); + + while (1) { + ret = sock_xmit(srv_info->sock, RECV, + &dnbd2_rep, sizeof(dnbd2_rep)); + if (ret == -EINTR) + break; + if (ret != sizeof(dnbd2_rep) || + be16_to_cpu(dnbd2_rep.vid) != dev->vid || + be16_to_cpu(dnbd2_rep.rid) != dev->rid) + continue; + + cmd = be16_to_cpu(dnbd2_rep.cmd); + num = be64_to_cpu(dnbd2_rep.num); + time = be16_to_cpu(dnbd2_rep.time); + + /* Find a matching request in the pending-queue. */ + req = dnbd2_find_request(num, cmd, srv_info); + if (!req) + continue; + srv_info->last_reply = jiffies; + update_rtt(diff(time), srv_info, cmd); + + switch (cmd) { + case CMD_GET_BLOCK: + spin_lock(&dev->kmap_lock); + buffer = __bio_kmap_atomic(req->bio, 0, KM_USER0); + memcpy(buffer, + dnbd2_rep.payload.data, + DNBD2_BLOCK_SIZE); + __bio_kunmap_atomic(req->bio, KM_USER0); + spin_unlock(&dev->kmap_lock); + break; + + case CMD_GET_SIZE: + if (!srv_info->capacity) { + srv_info->capacity = num / SECTOR_SIZE; + wake_up_all(&srv_info->wq); + } + break; + + case CMD_GET_SERVERS: + down(&dev->servers_mutex); + + if (dev->emergency) + stop_emergency(srv_info); + + /* Recreate the emergency list. */ + dev->emerg_list[0].ip = srv_info->ip; + dev->emerg_list[0].port = srv_info->port; + for (i=1 ; i<ALT_SERVERS_MAX; i++) { + if (num) { + memcpy(&dev->emerg_list[i], + &dnbd2_rep.payload.server[i-1], + sizeof(dnbd2_server_t)); + try_add_server(dev->emerg_list[i], dev); + num--; + } else { + dev->emerg_list[i].ip = 0; + dev->emerg_list[i].port = 0; + } + } + up(&dev->servers_mutex); + break; + } + dnbd2_end_request(req, 1); + } + + complete_and_exit(&srv_info->rx_stop, 0); +} + + +struct request *dnbd2_find_request(uint64_t num, + uint16_t cmd, + struct srv_info *dst) +{ + dnbd2_device_t *dev = dst->dev; + struct list_head *cur, *next; + struct req_info *info; + struct request *req; + sector_t sector; + + switch (cmd) { + case CMD_GET_BLOCK: + sector = num / SECTOR_SIZE; + spin_lock_bh(&dev->pending_queue_lock); + list_for_each_safe(cur, next, &dev->pending_queue) { + req = blkdev_entry_to_request(cur); + info = req->special; + + if (req->sector == sector && info->cmd == cmd) { + list_del_init(&req->queuelist); + spin_unlock_bh(&dev->pending_queue_lock); + return req; + } + } + break; + + case CMD_GET_SIZE: + case CMD_GET_SERVERS: + spin_lock_bh(&dev->pending_queue_lock); + list_for_each_safe(cur, next, &dev->pending_queue) { + req = blkdev_entry_to_request(cur); + info = req->special; + + if (info->cmd == cmd && info->dst == dst) { + list_del_init(&req->queuelist); + spin_unlock_bh(&dev->pending_queue_lock); + return req; + } + } + } + + spin_unlock_bh(&dev->pending_queue_lock); + return NULL; +} + + +void dnbd2_end_request(struct request *req, int success) +{ + unsigned long flags; + struct req_info *info = req->special; + dnbd2_device_t *dev; + + switch (info->cmd) { + case CMD_GET_BLOCK: + dev = req->rq_disk->private_data; + spin_lock_irqsave(&dev->blk_lock, flags); + list_del_init(&req->queuelist); + if (!end_that_request_first(req, success, req->nr_sectors)) { + end_that_request_last(req, success); + } + dev->pending_reqs--; + spin_unlock_irqrestore(&dev->blk_lock, flags); + info->cnt = -1; + break; + + case CMD_GET_SIZE: + case CMD_GET_SERVERS: + kfree(info); + kfree(req); + break; + } +} + + +void dnbd2_requeue_timer(unsigned long arg) +{ + dnbd2_device_t *dev = (dnbd2_device_t *)arg; + struct list_head *cur, *next; + struct request *req; + struct req_info *info; + unsigned long too_long; + int requeue; + + spin_lock(&dev->pending_queue_lock); + list_for_each_safe(cur, next, &dev->pending_queue) { + requeue = 0; + req = blkdev_entry_to_request(cur); + info = req->special; + if (!info->cnt) + continue; + + /* Each request type has a specific requeue policy. */ + switch (info->cmd) { + case CMD_GET_BLOCK: + too_long = 2 * (info->dst->srtt >> SRTT_SHIFT); + too_long *= 4 << info->cnt; + if (too_long > HZ) + too_long = HZ; + if (diff(info->time) >= too_long) + requeue = 1; + break; + + case CMD_GET_SERVERS: + case CMD_GET_SIZE: + if (info->cnt == 4) { + list_del_init(&req->queuelist); + dnbd2_end_request(req, 0); + schedule_del_server(info->dst); + break; + } + if (diff(info->time) >= HZ) + requeue = 1; + break; + } + + if (requeue) { + list_del_init(&req->queuelist); + spin_lock(&dev->send_queue_lock); + list_add_tail(&req->queuelist, &dev->send_queue); + spin_unlock(&dev->send_queue_lock); + } + } + spin_unlock(&dev->pending_queue_lock); + + wake_up_interruptible(&dev->sender_wq); + dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL; + add_timer(&dev->requeue_timer); +} + + +void dnbd2_hb_timer(unsigned long arg) +{ + dnbd2_device_t *dev = (dnbd2_device_t *)arg; + int i; + + if (dev->running) + for_each_server(i) + enqueue_hb(&dev->servers[i]); + + wake_up_interruptible(&dev->sender_wq); + dev->hb_timer.expires = jiffies + dev->hb_interval; + add_timer(&dev->hb_timer); +} + + +void dnbd2_to_timer(unsigned long arg) +{ + dnbd2_device_t *dev = (dnbd2_device_t *)arg; + + if (dev->running) + schedule_activate_fastest(dev); + + dev->to_timer.expires = jiffies + TO_INTERVAL; + add_timer(&dev->to_timer); +} + + +void start_emergency(dnbd2_device_t *dev) +{ + int i; + + if (dev->emergency) + return; + + p("No servers reachable, starting emergency mode!\n"); + dev->emergency = 1; + + del_timer(&dev->requeue_timer); + del_timer(&dev->to_timer); + del_timer(&dev->hb_timer); + + /* Activate the emergency list. */ + for_each_server(i) + try_add_server(dev->emerg_list[i], dev); + + /* Increase frequency of heartbeats. */ + dev->hb_interval = HB_EMERG_INTERVAL; + dev->hb_timer.expires = jiffies + HB_EMERG_INTERVAL; + add_timer(&dev->hb_timer); +} + + +void stop_emergency(struct srv_info *srv_info) +{ + dnbd2_device_t *dev = srv_info->dev; + + if (!dev->emergency) + return; + + p("Stopping emergency mode.\n"); + del_timer(&dev->hb_timer); + + dev->active_server = srv_info; + + /* Decrease frequency of heartbeats. */ + dev->hb_interval = HB_NORMAL_INTERVAL; + dev->hb_timer.expires = jiffies + HB_NORMAL_INTERVAL; + add_timer(&dev->hb_timer); + + dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL; + add_timer(&dev->requeue_timer); + dev->to_timer.expires = jiffies + TO_INTERVAL; + add_timer(&dev->to_timer); + + dev->emergency = 0; +} + + +static int __init dnbd2_init(void) +{ + int i; + + /* We are platform dependant. */ + if (DNBD2_BLOCK_SIZE != PAGE_SIZE) { + printk(LOG "DNBD2_BLOCK_SIZE (%d) != PAGE_SIZE (%d)\n", + (int) DNBD2_BLOCK_SIZE, (int) PAGE_SIZE); + return -EINVAL; + } + + dnbd2_major = register_blkdev(0, "dnbd2"); + if (dnbd2_major <= 0) { + p("Could not get major number.\n"); + return -EBUSY; + } + + for (i=0 ; i<DNBD2_DEVICES ; i++) + if (add_device(&dev[i], i)) + goto out; + + p("DNBD2 loaded.\n"); + return 0; + + out: + while (i--) + del_device(&dev[i]); + return -ENOMEM; +} + + +static void __exit dnbd2_exit(void) +{ + int i; + for (i=0 ; i<DNBD2_DEVICES ; i++) + del_device(&dev[i]); + unregister_blkdev(dnbd2_major, "dnbd2"); + p("DNBD2 unloaded.\n"); +} + + +module_init(dnbd2_init); +module_exit(dnbd2_exit); + +MODULE_DESCRIPTION("Distributed Network Block Device v2"); +MODULE_LICENSE("GPL"); diff --git a/kernel/core.h b/kernel/core.h new file mode 100644 index 0000000..3c75ade --- /dev/null +++ b/kernel/core.h @@ -0,0 +1,102 @@ +/* + * kernel/core.h + */ + + +/* interval to send heartbeats */ +#define HB_NORMAL_INTERVAL 30*HZ + +/* interval to send heartbeats in emergency-mode */ +#define HB_EMERG_INTERVAL 5*HZ + +/* interval to check the pending-queue */ +#define REQUEUE_INTERVAL HZ/20 + +/* interval to look for faster servers */ +#define TO_INTERVAL HZ + +/* dnbd0, dnbd1, etc */ +#define DNBD2_DEVICES 3 + + +/* Given to us by the kernel at load-time. */ +extern int dnbd2_major; + + +/* + * Called by the block layer to make the driver process some + * requests. For each request it does the following: + * 1. Prepares the request by attaching information to it. + * 2. Enqueues the request into the send-queue. + * 3. Wakes up dnbd2_tx_loop. + */ +void dnbd2_request(request_queue_t *q); + +/* + * This thread sleeps until there are requests in the send-queue. + * Each request is dequeued from the send-queue, enqueued into the + * pending-queue and finally given to dnbd2_send_request. + */ +int dnbd2_tx_loop(void *data); + +/* + * Send a request and update the request's and the destination + * server's counters. + */ +void dnbd2_send_request(struct request *req, dnbd2_device_t *dev); + +/* + * This thread sleeps until a reply arrives and then processes it. + */ +int dnbd2_rx_loop(void *data); + +/* + * When a reply arrives through the network it is caught by + * dnbd2_rx_loop, which calls this function to find a matching request + * in the pending queue. + */ +struct request *dnbd2_find_request(uint64_t num, uint16_t cmd, + struct srv_info *dst); + +/* + * The driver calls this function when it is done processing a request + * and has no further use for it. Resources are freed and the block + * layer informed if necessary. + */ +void dnbd2_end_request(struct request *req, int success); + +/* + * If a request remains too long unanswered, it is moved by this + * function from the pending-queue into the send-queue. This function + * is fired regularly by a timer. + */ +void dnbd2_requeue_timer(unsigned long arg); + +/* + * Periodically enqueu, for each server, a heartbeat request + * (CMD_GET_SERVERS) into the send-queue. + */ +void dnbd2_hb_timer(unsigned long arg); + +/* + * Check every TO_INTERVAL jiffies if there is a faster server and + * switch to it. TO stands for "takeover". + */ +void dnbd2_to_timer(unsigned long arg); + +/* + * If the driver looses contact to all servers it starts the emergency + * mode: First dnbd2_requeue_timer and dnbd2_to_timer are stopped. + * Then the emergency list (the last list of servers acquired with + * CMD_GET_SERVERS) is activated and the rate of heartbeats increased + * to HB_EMERG_INTERVAL. + */ +void start_emergency(dnbd2_device_t *dev); + +/* + * This function is called if a server answers to a heartbeat during + * emergency mode. This server (@srv_info) is made the active server, + * dnbd2_requeue_timer and dnbd2_to_timer started and the rate of + * heartbeats decreased to HB_NORMAL_INTERVAL. + */ +void stop_emergency(struct srv_info *srv_info); diff --git a/kernel/devices.c b/kernel/devices.c new file mode 100644 index 0000000..ccc6db1 --- /dev/null +++ b/kernel/devices.c @@ -0,0 +1,220 @@ +/* + * kernel/devices.c + */ + + +#include "dnbd2.h" +#include "core.h" +#include "fops.h" +#include "sysfs.h" +#include "devices.h" +#include "servers.h" + + +#define TO_PERCENT 50 +#define TO_JIFFIES 10 + + +/* + * Activate the request-processing mechanism for @dev: dnbd2_requeue + * (timer) and dnbd2_tx_loop (thread). dnbd2_rx_loop is started + * afterwards by add_server on a per-server basis. + */ +int start_device(dnbd2_device_t *dev) +{ + /* Start requeue timer. */ + init_timer(&dev->requeue_timer); + dev->requeue_timer.data = (unsigned long)dev; + dev->requeue_timer.function = dnbd2_requeue_timer; + dev->requeue_timer.expires = jiffies + REQUEUE_INTERVAL; + add_timer(&dev->requeue_timer); + + /* Start heartbeat timer. */ + init_timer(&dev->hb_timer); + dev->hb_timer.data = (unsigned long)dev; + dev->hb_timer.function = dnbd2_hb_timer; + dev->hb_timer.expires = jiffies + HB_NORMAL_INTERVAL; + add_timer(&dev->hb_timer); + + /* Start takeover timer. */ + init_timer(&dev->to_timer); + dev->to_timer.data = (unsigned long)dev; + dev->to_timer.function = dnbd2_to_timer; + dev->to_timer.expires = jiffies + TO_INTERVAL; + add_timer(&dev->to_timer); + + /* Start sending thread. */ + dev->tx_signal = 0; + dev->tx_id = kernel_thread(dnbd2_tx_loop, dev, CLONE_KERNEL); + if (dev->tx_id < 0) { + del_timer(&dev->hb_timer); + del_timer(&dev->to_timer); + del_timer(&dev->requeue_timer); + return -1; + } + wait_for_completion(&dev->tx_start); + + return 0; +} + + +/* + * Deactivate the request-processing mechanism for @dev. All + * dnbd2_rx_loop threads must be stopped beforehand by del_server. + */ +void stop_device(dnbd2_device_t *dev) +{ + struct list_head *cur, *next; + struct request *req; + + /* Stop heartbeat timer. */ + del_timer(&dev->hb_timer); + + /* Stop takeover timer. */ + del_timer(&dev->to_timer); + + /* Stop request processing. */ + del_timer(&dev->requeue_timer); + dev->tx_signal = 1; + kill_proc(dev->tx_id, SIGKILL, 1); + wait_for_completion(&dev->tx_stop); + + /* Empty pending-queue. */ + list_for_each_safe(cur, next, &dev->pending_queue) { + req = blkdev_entry_to_request(cur); + list_del_init(&req->queuelist); + dnbd2_end_request(req, 0); + } + + /* Empty send-queue. */ + list_for_each_safe(cur, next, &dev->send_queue) { + req = blkdev_entry_to_request(cur); + list_del_init(&req->queuelist); + dnbd2_end_request(req, 0); + } +} + + +int add_device(dnbd2_device_t *dev, int minor) +{ + struct request_queue *queue; + struct srv_info *srv_info; + struct gendisk *disk; + int i; + + /* + * Prepare dnbd2_device_t. Please use the + * same order as in dnbd2.h. + */ + INIT_WORK(&dev->work, NULL); //, NULL); + /* Change in /<linuxheaders>/include/linux/workqueue.h */ + spin_lock_init(&dev->kmap_lock); + for (i=0 ; i<POOL_SIZE ; i++) + dev->info_pool[i].cnt = -1; + + dev->emergency = 0; + dev->running = 0; + + dev->vid = 0; + dev->rid = 0; + + atomic_set(&dev->refcnt, 0); + init_MUTEX(&dev->config_mutex); + + spin_lock_init(&dev->blk_lock); + + init_waitqueue_head(&dev->sender_wq); + spin_lock_init(&dev->send_queue_lock); + INIT_LIST_HEAD(&dev->send_queue); + + dev->pending_reqs = 0; + spin_lock_init(&dev->pending_queue_lock); + INIT_LIST_HEAD(&dev->pending_queue); + + dev->hb_interval = HB_NORMAL_INTERVAL; + + init_completion(&dev->tx_start); + init_completion(&dev->tx_stop); + + init_MUTEX(&dev->servers_mutex); + for_each_server(i) { + dev->emerg_list[i].ip = 0; + dev->emerg_list[i].port = 0; + srv_info = &dev->servers[i]; + memset(srv_info, 0, sizeof(struct srv_info)); + init_completion(&srv_info->rx_start); + init_completion(&srv_info->rx_stop); + init_waitqueue_head(&srv_info->wq); + INIT_WORK(&srv_info->work, NULL); //, NULL); + /* Change in /<linuxheaders>/include/linux/workqueue.h */ + srv_info->dev = dev; + } + dev->active_server = NULL; + + dev->to_percent = TO_PERCENT; + dev->to_jiffies = TO_JIFFIES; + + /* Prepare struct gendisk. */ + disk = alloc_disk(1); + if (!disk) { + p("Could not alloc gendisk.\n"); + goto out_nodisk; + } + dev->disk = disk; + disk->private_data = dev; + disk->major = dnbd2_major; + disk->first_minor = minor; + disk->fops = &dnbd2_fops; + sprintf(disk->disk_name, "vnbd%d", minor); + set_capacity(disk, 0); + set_disk_ro(disk, 1); + + /* Prepare struct request_queue. */ + queue = blk_init_queue(dnbd2_request, &dev->blk_lock); + if (!queue) { + p("Could not alloc request queue.\n"); + goto out_noqueue; + } + /* + * Tell the block layer to give us only requests consisting of + * one segment of DNBD2_BLOCK_SIZE bytes. + */ + blk_queue_max_sectors(queue, DNBD2_BLOCK_SIZE/SECTOR_SIZE); + blk_queue_max_segment_size(queue, DNBD2_BLOCK_SIZE); + blk_queue_hardsect_size(queue, DNBD2_BLOCK_SIZE); + blk_queue_max_phys_segments(queue, 1); + blk_queue_max_hw_segments(queue, 1); + disk->queue = queue; + add_disk(disk); + + if (start_sysfs(dev)) + goto out_nosysfs; + + if (start_device(dev)) + goto out_nostart; + + return 0; + + out_nostart: + stop_sysfs(dev); + out_nosysfs: + blk_cleanup_queue(queue); + out_noqueue: + del_gendisk(disk); + put_disk(disk); + out_nodisk: + return -ENOMEM; +} + + +void del_device(dnbd2_device_t *dev) +{ + int i; + for_each_server(i) + del_server(&dev->servers[i]); + stop_device(dev); + stop_sysfs(dev); + blk_cleanup_queue(dev->disk->queue); + del_gendisk(dev->disk); + put_disk(dev->disk); +} diff --git a/kernel/devices.h b/kernel/devices.h new file mode 100644 index 0000000..d9cc46f --- /dev/null +++ b/kernel/devices.h @@ -0,0 +1,14 @@ +/* + * kernel/devices.h + */ + + +/* + * Initialize @dev and inform the kernel. + */ +int add_device(dnbd2_device_t *dev, int minor); + +/* + * Destroy @dev and inform the kernel. + */ +void del_device(dnbd2_device_t *dev); diff --git a/kernel/dnbd2.h b/kernel/dnbd2.h new file mode 100644 index 0000000..cf693d9 --- /dev/null +++ b/kernel/dnbd2.h @@ -0,0 +1,118 @@ +/* + * kernel/dnbd2.h + */ + + +#include <linux/version.h> +#include <linux/workqueue.h> +#include <linux/completion.h> +#include <linux/blkdev.h> +#include <linux/types.h> +#include <linux/wait.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <asm/semaphore.h> +#include <net/sock.h> +#include "../include/dnbd2.h" + + +#define LOG KERN_NOTICE "dnbd2: " +#define p(msg) printk(LOG msg); + +#define SECTOR_SIZE 512 + +#define for_each_server(i) for (i=0 ; i<ALT_SERVERS_MAX ; i++) + +/* max number of requests the driver can handle at once. */ +#define POOL_SIZE 128 + +/* precision of sector_t for pretty-printing */ +#ifdef CONFIG_LBD +# define SECT_PRECISION "%llu" +#else +# define SECT_PRECISION "%lu" +#endif + + +typedef struct dnbd2_device dnbd2_device_t; + +/* + * We stick this structure to each request before enqueing it into the + * send-queue. It gives information on how to treat the request. + */ +struct req_info { + uint16_t time; /* enqueue time */ + uint16_t cmd; /* command (CMD_XXX) */ + int cnt; /* send count */ + struct srv_info *dst; /* destination server */ + struct srv_info *last_dst; /* last destination server */ +}; + +/* + * Information about a server with which we communicate. + */ +struct srv_info { + dnbd2_device_t *dev; + struct kobject kobj; + struct work_struct work; + + struct socket *sock; /* NULL iff server not configured */ + uint32_t ip; /* network byte order */ + uint32_t port; /* network byte order */ + uint16_t min, max; /* min and max RTTs */ + unsigned long srtt; /* SRTT (SRTT_SHIFT left-shifted) */ + + sector_t capacity; /* used when a CMD_GET_SIZE reply arrives */ + wait_queue_head_t wq; /* used when a CMD_GET_SIZE reply arrives */ + + long rx_id; /* pid of dnbd2_rx_loop */ + struct completion rx_start; /* dnbd2_rx_loop has started */ + struct completion rx_stop; /* dnbd2_rx_loop has stopped */ + + unsigned long retries; /* number of requests retried */ + unsigned long last_reply; /* time of last reply */ +}; + +struct dnbd2_device { + struct work_struct work; + struct kobject kobj; + spinlock_t kmap_lock; + struct req_info info_pool[POOL_SIZE]; + + int running; /* device is running and capacity != 0 */ + int emergency; /* device has lost contact to all servers */ + + uint16_t vid, rid; /* Volume-ID and Release-ID */ + + atomic_t refcnt; /* for open/release, see fops.c */ + struct semaphore config_mutex; /* for open/release and sysfs-fops */ + + struct gendisk *disk; /* interface to the block layer */ + spinlock_t blk_lock; /* queue-lock, shared with the block layer */ + + wait_queue_head_t sender_wq; /* wait-queue to notify dnbd2_tx_loop */ + spinlock_t send_queue_lock; + struct list_head send_queue; + + unsigned long pending_reqs; /* number of block requests pending */ + spinlock_t pending_queue_lock; + struct list_head pending_queue; + + struct timer_list requeue_timer; /* requeue timer */ + struct timer_list to_timer; /* takeover timer */ + struct timer_list hb_timer; /* heartbeat timer */ + unsigned long hb_interval; /* HB_NORMAL_ or HB_EMERG_INTERVAL */ + + int tx_id; /* pid of dnbd2_tx_loop */ + struct completion tx_start; /* dnbd2_tx_loop has started */ + struct completion tx_stop; /* dnbd2_tx_loop has stopped */ + int tx_signal; /* tells dnbd2_tx_loop to stop */ + + struct semaphore servers_mutex; + struct srv_info servers[ALT_SERVERS_MAX]; + dnbd2_server_t emerg_list[ALT_SERVERS_MAX]; /* last servers known */ + struct srv_info *active_server; + + int to_percent; /* percent threshold for takeover (relativ) */ + uint16_t to_jiffies; /* jiffies threshold for takeover (absolute) */ +}; diff --git a/kernel/fops.c b/kernel/fops.c new file mode 100644 index 0000000..c098e13 --- /dev/null +++ b/kernel/fops.c @@ -0,0 +1,43 @@ +/* + * kernel/fops.c + */ + + +#include "dnbd2.h" +#include "fops.h" + + +struct block_device_operations dnbd2_fops = { + .owner = THIS_MODULE, + .open = dnbd2_open, + .release = dnbd2_release, +}; + + +int dnbd2_open(struct inode *inode, struct file *file) +{ + dnbd2_device_t *dev = inode->i_bdev->bd_disk->private_data; + if (down_interruptible(&dev->config_mutex)) + return -EBUSY; + + /* FIXME: How do we put this add/start_device? */ + if (set_blocksize(inode->i_bdev, DNBD2_BLOCK_SIZE)) { + up(&dev->config_mutex); + return -EBUSY; + } + + atomic_inc(&dev->refcnt); + up(&dev->config_mutex); + return 0; +} + + +int dnbd2_release(struct inode *inode, struct file *file) +{ + dnbd2_device_t *dev = inode->i_bdev->bd_disk->private_data; + if (down_interruptible(&dev->config_mutex)) + return -EBUSY; + atomic_dec(&dev->refcnt); + up(&dev->config_mutex); + return 0; +} diff --git a/kernel/fops.h b/kernel/fops.h new file mode 100644 index 0000000..f5222d4 --- /dev/null +++ b/kernel/fops.h @@ -0,0 +1,9 @@ +/* + * kernel/fops.h + */ + + +extern struct block_device_operations dnbd2_fops; + +int dnbd2_open(struct inode *inode, struct file *file); +int dnbd2_release(struct inode *inode, struct file *file); diff --git a/kernel/misc.c b/kernel/misc.c new file mode 100644 index 0000000..2756bef --- /dev/null +++ b/kernel/misc.c @@ -0,0 +1,84 @@ +/* + * kernel/misc.c + */ + + +#include "dnbd2.h" +#include "misc.h" + + +uint16_t diff(uint16_t then) +{ + int diff = jiffies & 0xffff; + diff -= then; + if (diff < 0) + diff += 1 << 16; + return diff; +} + + +int sock_xmit(struct socket *sock, int send, void *buf, int size) +{ + int result; + struct msghdr msg; + struct kvec iov; + unsigned long flags; + sigset_t oldset; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + oldset = current->blocked; + sigfillset(¤t->blocked); + sigdelsetmask(¤t->blocked, sigmask(SIGKILL)); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + do { + sock->sk->sk_allocation = GFP_NOIO; + iov.iov_base = buf; + iov.iov_len = size; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_NOSIGNAL; + + if (send) + result = kernel_sendmsg(sock, &msg, &iov, 1, size); + else + result = kernel_recvmsg(sock, &msg, &iov, 1, size, 0); + + if (signal_pending(current)) { + siginfo_t info; + spin_lock_irqsave(¤t->sighand->siglock, flags); + dequeue_signal(current, ¤t->blocked, &info); + spin_unlock_irqrestore(¤t->sighand->siglock, + flags); + result = -EINTR; + break; + } + + if (result <= 0) { + if (result == 0) + result = -EPIPE; + break; + } + size -= result; + buf += result; + } while (size > 0); + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->blocked = oldset; + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return result; +} + + +char *inet_ntoa(uint32_t ip) +{ + static char buf[sizeof "aaa.bbb.ccc.ddd"]; + unsigned char *nums = (unsigned char *)&ip; + sprintf(buf, "%u.%u.%u.%u", nums[0], nums[1], nums[2], nums[3]); + return buf; +} diff --git a/kernel/misc.h b/kernel/misc.h new file mode 100644 index 0000000..8f7f511 --- /dev/null +++ b/kernel/misc.h @@ -0,0 +1,23 @@ +/* + * kernel/misc.h - Usefull stuff that doesn't fit anywhere else. + */ + + +#define SEND 1 +#define RECV 0 + + +/* + * Jiffies between now and @then. + */ +uint16_t diff(uint16_t then); + +/* + * Send or receive packet (from nbd.c) + */ +int sock_xmit(struct socket *sock, int send, void *buf, int size); + +/* + * Pretty printing of IPs. + */ +char *inet_ntoa(uint32_t ip); diff --git a/kernel/scp b/kernel/scp new file mode 100644 index 0000000..c14b937 --- /dev/null +++ b/kernel/scp @@ -0,0 +1 @@ +scp /root/fs/ba-vito/tags/abgabe/kernel/dnbd2.ko 132.230.4.71:/var/opt/openslx/stage1/suse-10.2/lib/modules/2.6.18.8-0.3-default/kernel/drivers/block/ diff --git a/kernel/servers.c b/kernel/servers.c new file mode 100644 index 0000000..7d5c229 --- /dev/null +++ b/kernel/servers.c @@ -0,0 +1,355 @@ +/* + * kernel/servers.c + */ + + +#include "dnbd2.h" +#include "servers.h" +#include "core.h" +#include "misc.h" + + +/* Only use RTTs smaller than RTT_MAX for statistics. */ +#define RTT_MAX HZ/4 + + +/* + * Find the next configured server which is not active. + */ +struct srv_info *next_server(dnbd2_device_t *dev) +{ + int i; + struct srv_info *srv_info, *next_srv = NULL; + + for_each_server(i) { + srv_info = &dev->servers[i]; + if (srv_info->sock && srv_info != dev->active_server) { + next_srv = srv_info; + break; + } + } + return next_srv; +} + + +/* + * Find the server with smallest SRTT. + */ +struct srv_info *fastest_server(uint16_t *srtt, dnbd2_device_t *dev) +{ + int i; + struct srv_info *srv_info, *alt_srv = NULL; + unsigned long min = RTT_MAX << SRTT_SHIFT; + + for_each_server(i) { + srv_info = &dev->servers[i]; + if (srv_info->sock && + srv_info->srtt < min && + srv_info->min < RTT_MAX) { + min = srv_info->srtt; + alt_srv = srv_info; + } + } + + *srtt = min >> SRTT_SHIFT; + return alt_srv; +} + + +/* + * This function can be enqueued in a workqueue. It removes srv_info + * from the list of servers. + */ +void del_server_work(struct work_struct *work) +{ + struct srv_info *srv_info = container_of(work, struct srv_info, work); + dnbd2_device_t *dev = srv_info->dev; + + down(&dev->servers_mutex); + if (!srv_info->sock || dev->active_server == srv_info) { + up(&dev->servers_mutex); + return; + } + del_server(srv_info); + up(&dev->servers_mutex); + return; +} + + +/* + * This function can be enqueued in a workqueue. Read comment on + * schedule_activate_fastest in servers.h + */ +void activate_fastest_work(struct work_struct *work) +{ + dnbd2_device_t *dev = container_of(work, dnbd2_device_t, work); + struct srv_info *alt_srv, *next_srv; + uint16_t min, srtt; + int newsrv = 0; + long unsigned delta; + + down(&dev->servers_mutex); + if (!dev->active_server) + goto out; + + /* Detect if dev->active_server is stalled. */ + delta = (long)jiffies - (long)dev->active_server->last_reply; + if (dev->pending_reqs && delta > TIMEOUT_STALLED) { + next_srv = next_server(dev); + if (!next_srv) { + start_emergency(dev); + goto out; + } + alt_srv = dev->active_server; + dev->active_server = next_srv; + del_server(alt_srv); + } + + /* Switch to another server if requirements met. */ + srtt = dev->active_server->srtt >> SRTT_SHIFT; + alt_srv = fastest_server(&min, dev); + if (!alt_srv || alt_srv == dev->active_server) + goto out; + if (dev->to_percent) { + if (100 * (srtt - min) < dev->to_percent * srtt) + goto out; + newsrv = 1; + } + if (dev->to_jiffies) { + if (min + dev->to_jiffies > srtt) + goto out; + newsrv = 1; + } + if (newsrv) + dev->active_server = alt_srv; + + out: + up(&dev->servers_mutex); +} + + +int start_rx_loop(struct srv_info *srv_info) +{ + srv_info->rx_id = kernel_thread(dnbd2_rx_loop, srv_info, CLONE_KERNEL); + if (srv_info->rx_id < 0) { + srv_info->rx_id = 0; + return -1; + } + wait_for_completion(&srv_info->rx_start); + return 0; +} + + +void stop_rx_loop(struct srv_info *srv_info) +{ + if (!srv_info->rx_id) + return; + kill_proc(srv_info->rx_id, SIGKILL, 1); + wait_for_completion(&srv_info->rx_stop); + srv_info->rx_id = 0; +} + + +/******************************************************/ +/* For the next functions see servers.h for comments. */ +/******************************************************/ + + +int add_server(dnbd2_server_t server, struct srv_info *srv_info) +{ + struct sockaddr_in addr; + struct socket *sock; + + if (!server.ip || !server.port) + return -1; + + if (sock_create(PF_INET,SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + p("Could not create socket.\n"); + return -1; + } + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = server.ip; + addr.sin_port = server.port; + if (sock->ops->connect(sock, + (struct sockaddr *)&addr, + sizeof(addr), 0)) { + p("Could not connect to socket.\n"); + goto out; + } + + srv_info->sock = sock; + srv_info->ip = server.ip; + srv_info->port = server.port; + srv_info->min = RTT_MAX; + + if (start_rx_loop(srv_info)) { + p("Could not start rx_loop\n"); + goto out; + } + + return 0; + out: + del_server(srv_info); + return -1; +} + + +void del_server(struct srv_info *srv_info) +{ + stop_rx_loop(srv_info); + if (srv_info->sock) + sock_release(srv_info->sock); + srv_info->sock = NULL; + srv_info->ip = 0; + srv_info->port = 0; + srv_info->srtt = 0; + srv_info->min = 0; + srv_info->max = 0; + srv_info->srtt = 0; + srv_info->retries = 0; + srv_info->last_reply = 0; +} + + +void try_add_server(dnbd2_server_t server, dnbd2_device_t *dev) +{ + int i; + + for_each_server(i) + if (dev->servers[i].sock && + dev->servers[i].ip == server.ip && + dev->servers[i].port == server.port) + return; + + for_each_server(i) + if (!dev->servers[i].sock) + break; + + if (i == ALT_SERVERS_MAX) + return; + + add_server(server, &dev->servers[i]); +} + + +sector_t srv_get_capacity(struct srv_info *srv_info) +{ + struct request *req; + struct req_info *info; + dnbd2_device_t *dev = srv_info->dev; + sector_t capacity; + int ret; + + info = kmalloc(sizeof(struct req_info), GFP_KERNEL); + if (!info) + return 0; + req = kmalloc(sizeof(struct request), GFP_KERNEL); + if (!req) { + kfree(info); + return 0; + } + info->cmd = CMD_GET_SIZE; + info->cnt = 0; + info->time = jiffies; + info->dst = srv_info; + info->last_dst = srv_info; + req->special = info; + req->sector = 0; + INIT_LIST_HEAD(&req->queuelist); + + /* Enqueue the request for sending. */ + spin_lock_bh(&dev->send_queue_lock); + list_add_tail(&req->queuelist, &dev->send_queue); + spin_unlock_bh(&dev->send_queue_lock); + + /* Wake up sender function. */ + wake_up_interruptible(&dev->sender_wq); + + /* If we don't get an answer in 4 seconds we give up. */ + ret = wait_event_timeout(srv_info->wq, srv_info->capacity, 4*HZ); + capacity = srv_info->capacity; + srv_info->capacity = 0; + + return ret ? capacity : 0; +} + + +void update_rtt(uint16_t rtt, struct srv_info *srv_info, uint16_t cmd) +{ + if (rtt == 0) + rtt = 1; + if (rtt > RTT_MAX) + return; + + if (rtt < srv_info->min) + srv_info->min = rtt; + if (rtt > srv_info->max) + srv_info->max = rtt; + + if (!srv_info->srtt) { + srv_info->srtt = rtt << SRTT_SHIFT; + return; + } + + switch (cmd) { + case CMD_GET_BLOCK: + srv_info->srtt = SRTT_BETA * srv_info->srtt; + srv_info->srtt += (SRTT_BETA_COMP * ((int)rtt)) << SRTT_SHIFT; + srv_info->srtt /= SRTT_BETA_BASE; + break; + + case CMD_GET_SIZE: + case CMD_GET_SERVERS: + srv_info->srtt = rtt << SRTT_SHIFT; + break; + } +} + + +void enqueue_hb(struct srv_info *srv_info) +{ + struct request *req; + struct req_info *info; + dnbd2_device_t *dev = srv_info->dev; + + if (!srv_info->sock) + return; + info = kmalloc(sizeof(struct req_info), GFP_ATOMIC); + if (!info) + return; + req = kmalloc(sizeof(struct request), GFP_ATOMIC); + if (!req) { + kfree(info); + return; + } + info->cmd = CMD_GET_SERVERS; + info->cnt = 0; + info->time = jiffies; + info->dst = srv_info; + info->dst = srv_info; + info->last_dst = srv_info; + req->special = info; + req->sector = 0; + INIT_LIST_HEAD(&req->queuelist); + + /* Enqueue the request for sending. */ + spin_lock_bh(&dev->send_queue_lock); + list_add_tail(&req->queuelist, &dev->send_queue); + spin_unlock_bh(&dev->send_queue_lock); +} + + +void schedule_del_server(struct srv_info *srv_info) +{ + PREPARE_WORK(&srv_info->work, del_server_work); //, srv_info); + /* Change in /<linuxheaders>/include/linux/workqueue.h */ + schedule_work(&srv_info->work); +} + + +void schedule_activate_fastest(dnbd2_device_t *dev) +{ + PREPARE_WORK(&dev->work, activate_fastest_work); //, dev); + /* Change in /<linuxheaders>/include/linux/workqueue.h */ + schedule_work(&dev->work); +} diff --git a/kernel/servers.h b/kernel/servers.h new file mode 100644 index 0000000..83d9b46 --- /dev/null +++ b/kernel/servers.h @@ -0,0 +1,84 @@ +/* + * kernel/servers.h + */ + + +#define SRTT_BETA 990 +#define SRTT_BETA_COMP 10 +#define SRTT_BETA_BASE 1000 + +/* + * Though presented as a 16-bit number in sysfs, the SRTT is stored in + * an unsigned long, SRTT_SHIFT bits left-shifted. This helps + * preserve the precision we need tou update the SRTTs. If we stored + * SRTT in an integer, it would only be changed by RTTS *very* far + * away from it. + */ +#define SRTT_SHIFT 10 + +/* + * After this interval without answering + * requests a server is considered down. + */ +#define TIMEOUT_STALLED 3*HZ + + +/* + * Configure @srv_info based on @server: Create a socket, connect it + * and start a dnbd2_rx_loop for it. + */ +int add_server(dnbd2_server_t server, struct srv_info *srv_info); + +/* + * Reset @srv_info: Stop dnbd2_rx_loop, close the socket and zero all + * variables. + */ +void del_server(struct srv_info *srv_info); + +/* + * Look for an unused server-slot in @dev and configure it according + * to @server (using add_server). Nothing happens if the list of + * servers if full or if there is already a server in the list with + * the same IP and port as @server. + */ +void try_add_server(dnbd2_server_t server, dnbd2_device_t *dev); + +/* + * This function enqueues into the send-queue a request for the device + * capacity (CMD_GET_SIZE). This type of request is retransmitted + * every second. If an answer doesn't arrive within 4 seconds it + * assumes that the server is down. + */ +sector_t srv_get_capacity(struct srv_info *srv_info); + +/* + * Update min, max and srtt in @srv_info. + */ +void update_rtt(uint16_t rtt, struct srv_info *srv_info, uint16_t cmd); + +/* + * Enqueue a heartbeat request (CMD_GET_SERVERS) for @srv_info into + * the send-queue. + */ +void enqueue_hb(struct srv_info *srv_info); + +/* + * Schedule the removal of @srv_info to be processed by the global + * kernel workqueue as soon as possible. This function allows to + * trigger the removal of servers from within timers, in which + * sleeping is not allowed. + */ +void schedule_del_server(struct srv_info *srv_info); + +/* + * Schedule the search for a faster server: + * 1. Detect if the active server is stalled (down). In this case + * switch to another server, if possible, and remove the old one + * from the list. If no servers are available to switch to + * start the emergency mode. + * 2. If the emergency mode was not started go through the list of + * servers and pick the one with the smallest SRTT. If this + * server meets our requirements (to_jiffies and to_percent) + * switch to it. + */ +void schedule_activate_fastest(dnbd2_device_t *dev); diff --git a/kernel/sysfs.c b/kernel/sysfs.c new file mode 100644 index 0000000..802e9d2 --- /dev/null +++ b/kernel/sysfs.c @@ -0,0 +1,460 @@ +/* + * kernel/sysfs.c + */ + + +#include "dnbd2.h" +#include "misc.h" +#include "sysfs.h" +#include "devices.h" +#include "servers.h" + + +#define RW 0644 +#define RO 0444 + +#define kobj_to_dev(kp) container_of(kp, dnbd2_device_t, kobj) +#define kobj_to_srv(kp) container_of(kp, struct srv_info, kobj) + +#define attr_to_srvattr(ap) container_of(ap, struct server_attr, attr) +#define attr_to_devattr(ap) container_of(ap, struct device_attr, attr) + +#define DEV_ATTR_RW(_name) \ +static struct device_attr _name = \ +__ATTR(_name, RW, show_##_name, store_##_name) + +#define SRV_ATTR_RW(_name) \ +static struct server_attr _name = \ +__ATTR(_name, RW, show_##_name, store_##_name) + +#define DEV_ATTR_RO(_name) \ +static struct device_attr _name = \ +__ATTR(_name, RO, show_##_name, NULL) + +#define SRV_ATTR_RO(_name) \ +static struct server_attr _name = \ +__ATTR(_name, RO, show_##_name, NULL) + + +struct device_attr { + struct attribute attr; + ssize_t (*show)(char *, dnbd2_device_t *); + ssize_t (*store)(const char *, size_t, dnbd2_device_t *); +}; + +struct server_attr { + struct attribute attr; + ssize_t (*show)(char *, struct srv_info *); + ssize_t (*store)(const char *, size_t, struct srv_info *); +}; + + +void release(struct kobject *kobj) {} + +ssize_t show_running(char *, dnbd2_device_t *); +ssize_t store_running(const char *, size_t, dnbd2_device_t *); +ssize_t show_to_percent(char *, dnbd2_device_t *); +ssize_t store_to_percent(const char *, size_t, dnbd2_device_t *); +ssize_t show_to_jiffies(char *, dnbd2_device_t *); +ssize_t store_to_jiffies(const char *, size_t, dnbd2_device_t *); +ssize_t show_vid(char *, dnbd2_device_t *); +ssize_t store_vid(const char *, size_t, dnbd2_device_t *); +ssize_t show_rid(char *, dnbd2_device_t *); +ssize_t store_rid(const char *, size_t, dnbd2_device_t *); +ssize_t show_pending_reqs(char *, dnbd2_device_t *); +ssize_t show_emergency(char *, dnbd2_device_t *); + +ssize_t show_sock(char *, struct srv_info *); +ssize_t store_sock(const char *, size_t, struct srv_info *); +ssize_t show_active(char *, struct srv_info *); +ssize_t store_active(const char *, size_t, struct srv_info *); +ssize_t show_rtt(char *, struct srv_info *); +ssize_t show_retries(char *, struct srv_info *); +ssize_t show_last_reply(char *, struct srv_info *); + + +/* device attributes */ +DEV_ATTR_RW(running); +DEV_ATTR_RW(to_percent); +DEV_ATTR_RW(to_jiffies); +DEV_ATTR_RW(vid); +DEV_ATTR_RW(rid); +DEV_ATTR_RO(pending_reqs); +DEV_ATTR_RO(emergency); + +static struct attribute *device_attrs[] = { + &running.attr, + &to_percent.attr, + &to_jiffies.attr, + &vid.attr, + &rid.attr, + &pending_reqs.attr, + &emergency.attr, + NULL, +}; + +/* server attributes */ +SRV_ATTR_RW(sock); +SRV_ATTR_RW(active); +SRV_ATTR_RO(rtt); +SRV_ATTR_RO(retries); +SRV_ATTR_RO(last_reply); + +struct attribute *server_attrs[] = { + &sock.attr, + &active.attr, + &rtt.attr, + &retries.attr, + &last_reply.attr, + NULL, +}; + + +/* + * Wrapper functions for show/store, one pair for device attributes + * and one pair for server attributes. Each attribute has its own + * specific function(s). + */ +ssize_t device_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct device_attr *device_attr = attr_to_devattr(attr); + dnbd2_device_t *dev = kobj_to_dev(kobj); + return device_attr->show(buf, dev); +} + +ssize_t device_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + int ret; + struct device_attr *device_attr = attr_to_devattr(attr); + dnbd2_device_t *dev = kobj_to_dev(kobj); + down(&dev->config_mutex); + ret = device_attr->store(buf, count, dev); + up(&dev->config_mutex); + return ret; +} + +ssize_t server_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct server_attr *server_attr = attr_to_srvattr(attr); + struct srv_info *srv_info = kobj_to_srv(kobj); + return server_attr->show(buf, srv_info); +} + +ssize_t server_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + int ret; + struct server_attr *server_attr = attr_to_srvattr(attr); + struct srv_info *srv_info = kobj_to_srv(kobj); + down(&srv_info->dev->config_mutex); + ret = server_attr->store(buf, count, srv_info); + up(&srv_info->dev->config_mutex); + return ret; +} + + +struct sysfs_ops device_ops = { + .show = device_show, + .store = device_store, +}; + +struct sysfs_ops server_ops = { + .show = server_show, + .store = server_store, +}; + +struct kobj_type device_ktype = { + .default_attrs = device_attrs, + .sysfs_ops = &device_ops, + .release = release, +}; + +struct kobj_type server_ktype = { + .default_attrs = server_attrs, + .sysfs_ops = &server_ops, + .release = release, +}; + + +/* + * RW device attribute functions. + */ +ssize_t show_running(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%d\n", dev->running); +} + +ssize_t store_running(const char *buf, size_t count, dnbd2_device_t *dev) +{ + sector_t capacity = 0; + struct srv_info *srv_info; + int i, running, ret = sscanf(buf, "%d", &running); + + if (ret != 1) + return -EINVAL; + + if (!dev->running) { + if (running != 1 || !dev->vid || !dev->rid) + return -EINVAL; + if (atomic_read(&dev->refcnt) > 0) + return -EBUSY; + + for_each_server(i) { + srv_info = &dev->servers[i]; + if (!srv_info->sock) + continue; + capacity = srv_get_capacity(srv_info); + if (capacity) { + set_capacity(dev->disk, capacity); + dev->active_server = srv_info; + break; + } + } + if (!capacity) { + p("Could not contact any servers.\n"); + return -EHOSTUNREACH; + } + for_each_server(i) { + dev->emerg_list[i].ip = dev->servers[i].ip; + dev->emerg_list[i].port = dev->servers[i].port; + } + printk(LOG "Device capacity = " SECT_PRECISION " KB\n", + capacity * SECTOR_SIZE / 1024); + dev->running = 1; + __module_get(THIS_MODULE); + } else { + if (running != 0) + return -EINVAL; + if (atomic_read(&dev->refcnt) > 0) + return -EBUSY; + + /* Stop device. */ + dev->running = 0; + set_capacity(dev->disk, 0); + dev->active_server = NULL; + module_put(THIS_MODULE); + } + + return count; +} + +ssize_t show_vid(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%hu\n", dev->vid); +} + +ssize_t store_vid(const char *buf, size_t count, dnbd2_device_t *dev) +{ + uint16_t vid; + + if (dev->running) + return -EBUSY; + if (sscanf(buf, "%hu", &vid) != 1) + return -EINVAL; + + dev->vid = vid; + return count; +} + +ssize_t show_rid(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%hu\n", dev->rid); +} + +ssize_t store_rid(const char *buf, size_t count, dnbd2_device_t *dev) +{ + uint16_t rid; + + if (dev->running) + return -EBUSY; + if (sscanf(buf, "%hu", &rid) != 1) + return -EINVAL; + + dev->rid = rid; + return count; +} + +ssize_t show_to_percent(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%d\n", dev->to_percent); +} + +ssize_t store_to_percent(const char *buf, size_t count, dnbd2_device_t *dev) +{ + if (sscanf(buf, "%d", &dev->to_percent) == 1) + return count; + return -EINVAL; +} + +ssize_t show_to_jiffies(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%hu\n", dev->to_jiffies); +} + +ssize_t store_to_jiffies(const char *buf, size_t count, dnbd2_device_t *dev) +{ + if (sscanf(buf, "%hu", &dev->to_jiffies) == 1) + return count; + return -EINVAL; +} + + +/* + * RO device attribute functions. + */ +ssize_t show_pending_reqs(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%lu\n", dev->pending_reqs); +} + +ssize_t show_emergency(char *buf, dnbd2_device_t *dev) +{ + return sprintf(buf, "%d\n", dev->emergency); +} + + +/* + * RW server attribute functions. + */ +ssize_t show_sock(char *buf, struct srv_info *srv_info) +{ + return sprintf(buf, "%s %hu\n", + inet_ntoa(srv_info->ip), + ntohs(srv_info->port)); +} + +ssize_t store_sock(const char *buf, size_t count, struct srv_info *srv_info) +{ + char ip[sizeof "aaa.bbb.ccc.ddd 12345"]; + uint16_t port; + dnbd2_server_t server; + dnbd2_device_t *dev = srv_info->dev; + + if (count > sizeof "aaa.bbb.ccc.ddd 12345") + return -EINVAL; + if (sscanf(buf, "%s %hu", ip, &port) != 2) + return -EINVAL; + server.ip = in_aton(ip); + server.port = htons(port); + + down(&dev->servers_mutex); + if (dev->running && dev->active_server == srv_info) { + up(&dev->servers_mutex); + return -EBUSY; + } + if (srv_info->sock) + del_server(srv_info); + if (server.ip && server.port && add_server(server, srv_info)) { + up(&dev->servers_mutex); + return -EINVAL; + } + up(&dev->servers_mutex); + + return count; +} + +ssize_t show_active(char *buf, struct srv_info *srv_info) +{ + if (!srv_info->sock) + return sprintf(buf, "0\n"); + if (srv_info->dev->active_server == srv_info) + return sprintf(buf, "1\n"); + return sprintf(buf, "0\n"); +} + +ssize_t store_active(const char *buf, size_t count, struct srv_info *srv_info) +{ + dnbd2_device_t *dev = srv_info->dev; + int active; + + if (sscanf(buf, "%d", &active) != 1 || active != 1) + return -EINVAL; + + down(&dev->servers_mutex); + if (!dev->running) { + up(&dev->servers_mutex); + return -EINVAL; + } + if (!srv_info->sock || dev->active_server == srv_info) { + up(&dev->servers_mutex); + return -EINVAL; + } + dev->active_server = srv_info; + up(&dev->servers_mutex); + + return count; +} + + +/* + * RO server attribute functions. + */ +ssize_t show_rtt(char *buf, struct srv_info *srv_info) +{ + return sprintf(buf, "%hu %lu %hu\n", + srv_info->min, + srv_info->srtt >> SRTT_SHIFT, + srv_info->max); +} + +ssize_t show_retries(char *buf, struct srv_info *srv_info) +{ + return sprintf(buf, "%lu\n", srv_info->retries); +} + +ssize_t show_last_reply(char *buf, struct srv_info *srv_info) +{ + return sprintf(buf, "%lu\n", srv_info->last_reply); +} + + +/* Helper for start_sysfs. */ +int setup_kobj(struct kobject *kobj, char *name, struct kobject *parent, + struct kobj_type *ktype) +{ + memset(kobj, 0, sizeof(struct kobject)); + kobj->parent = parent; + kobj->ktype = ktype; + if (kobject_set_name(kobj, name)) + return -1; + if (kobject_register(kobj)) + return -1; + return 0; +} + + +/* + * Exported functions - see sysfs.h + */ +int start_sysfs(dnbd2_device_t *dev) +{ + int i; + char name[] = "server99"; + + if (setup_kobj(&dev->kobj, "config", &dev->disk->kobj, &device_ktype)) + return -1; + + for_each_server(i) { + sprintf(name, "server%d", i); + if (setup_kobj(&dev->servers[i].kobj, name, + &dev->disk->kobj, &server_ktype)) + goto out; + } + return 0; + + out: + while (i--) + kobject_unregister(&dev->servers[i].kobj); + return -1; +} + +void stop_sysfs(dnbd2_device_t *dev) +{ + int i; + for_each_server(i) + kobject_unregister(&dev->servers[i].kobj); + kobject_unregister(&dev->kobj); +} diff --git a/kernel/sysfs.h b/kernel/sysfs.h new file mode 100644 index 0000000..a18338e --- /dev/null +++ b/kernel/sysfs.h @@ -0,0 +1,24 @@ +/* + * kernel/sysfs.h + */ + + +/* + * Setup the sysfs-interface for @dev: + * + * M = minor number + * N = ALT_SERVERS_MAX - 1 + * + * /sys/block/vnbdM/config (from @dev->kobj) + * /sys/block/vnbdM/server0 (from @dev->servers[0].kobj) + * . + * . + * /sys/block/vnbdM/serverN (from @dev->servers[N].kobj) + * + */ +int start_sysfs(dnbd2_device_t *dev); + +/* + * Destroy the sysfs-interface for @dev. + */ +void stop_sysfs(dnbd2_device_t *dev); |