summaryrefslogtreecommitdiffstats
path: root/fs/nfs/objlayout/objlayout.c
diff options
context:
space:
mode:
authorBoaz Harrosh2011-05-22 18:52:19 +0200
committerBoaz Harrosh2011-05-29 19:54:15 +0200
commit04f83450388e87d86b387cf4a27b81eb7e69de7d (patch)
tree99c10d6a995ed1e7b872abb0127fc38d6bef9982 /fs/nfs/objlayout/objlayout.c
parentpnfs: support for non-rpc layout drivers (diff)
downloadkernel-qcow2-linux-04f83450388e87d86b387cf4a27b81eb7e69de7d.tar.gz
kernel-qcow2-linux-04f83450388e87d86b387cf4a27b81eb7e69de7d.tar.xz
kernel-qcow2-linux-04f83450388e87d86b387cf4a27b81eb7e69de7d.zip
pnfs-obj: osd raid engine read/write implementation
With the use of the in-kernel osd library. Implement read/write of data from/to osd-objects according to information specified in the objects-layout. Support for stripping over mirrors with a received stripe_unit. There are however a few constrains which are not supported: 1. Stripe Unit must be a multiple of PAGE_SIZE 2. stripe length (stripe_unit * number_of_stripes) can not be bigger then 32bit. Also support raid-groups and partial-layout. Partial-layout is when not all the groups are received on the line, addressing only a partial range of the file. TODO: Only raid0! raid 4/5/6 support will come at later stage A none supported layout will send IO through the MDS [Important fallout from the last rebase] Signed-off-by: Boaz Harrosh <bharrosh@panasas.com> [gfp_flags] Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Diffstat (limited to 'fs/nfs/objlayout/objlayout.c')
-rw-r--r--fs/nfs/objlayout/objlayout.c254
1 files changed, 254 insertions, 0 deletions
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index f14b4da34052..5157ef6d0041 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -129,6 +129,260 @@ objlayout_free_lseg(struct pnfs_layout_segment *lseg)
}
/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+ struct page **pages,
+ unsigned pgbase,
+ loff_t offset,
+ size_t count,
+ struct pnfs_layout_segment *lseg,
+ void *rpcdata,
+ gfp_t gfp_flags)
+{
+ struct objlayout_io_state *state;
+ u64 lseg_end_offset;
+
+ dprintk("%s: allocating io_state\n", __func__);
+ if (objio_alloc_io_state(lseg, &state, gfp_flags))
+ return NULL;
+
+ BUG_ON(offset < lseg->pls_range.offset);
+ lseg_end_offset = end_offset(lseg->pls_range.offset,
+ lseg->pls_range.length);
+ BUG_ON(offset >= lseg_end_offset);
+ if (offset + count > lseg_end_offset) {
+ count = lseg->pls_range.length -
+ (offset - lseg->pls_range.offset);
+ dprintk("%s: truncated count %Zd\n", __func__, count);
+ }
+
+ if (pgbase > PAGE_SIZE) {
+ pages += pgbase >> PAGE_SHIFT;
+ pgbase &= ~PAGE_MASK;
+ }
+
+ state->lseg = lseg;
+ state->rpcdata = rpcdata;
+ state->pages = pages;
+ state->pgbase = pgbase;
+ state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ state->offset = offset;
+ state->count = count;
+ state->sync = 0;
+
+ return state;
+}
+
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+ dprintk("%s: freeing io_state\n", __func__);
+ if (unlikely(!state))
+ return;
+
+ objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+ dprintk("%s: state %p status\n", __func__, state);
+
+ objlayout_free_io_state(state);
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+
+ pnfs_ld_read_done(rdata);
+}
+
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+ int eof = state->eof;
+ struct nfs_read_data *rdata;
+
+ state->status = status;
+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+ rdata = state->rpcdata;
+ rdata->task.tk_status = status;
+ if (status >= 0) {
+ rdata->res.count = status;
+ rdata->res.eof = eof;
+ }
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_read_done(rdata);
+ else {
+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&rdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+ loff_t offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct objlayout_io_state *state;
+ ssize_t status = 0;
+ loff_t eof;
+
+ dprintk("%s: Begin inode %p offset %llu count %d\n",
+ __func__, rdata->inode, offset, (int)count);
+
+ eof = i_size_read(rdata->inode);
+ if (unlikely(offset + count > eof)) {
+ if (offset >= eof) {
+ status = 0;
+ rdata->res.count = 0;
+ rdata->res.eof = 1;
+ goto out;
+ }
+ count = eof - offset;
+ }
+
+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->args.pages, rdata->args.pgbase,
+ offset, count,
+ rdata->lseg, rdata,
+ GFP_KERNEL);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->eof = state->offset + state->count >= eof;
+
+ status = objio_read_pagelist(state);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ rdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+
+ pnfs_ld_write_done(wdata);
+}
+
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+ bool sync)
+{
+ struct nfs_write_data *wdata;
+
+ dprintk("%s: Begin\n", __func__);
+ wdata = state->rpcdata;
+ state->status = status;
+ wdata->task.tk_status = status;
+ if (status >= 0) {
+ wdata->res.count = status;
+ wdata->verf.committed = state->committed;
+ dprintk("%s: Return status %d committed %d\n",
+ __func__, wdata->task.tk_status,
+ wdata->verf.committed);
+ } else
+ dprintk("%s: Return status %d\n",
+ __func__, wdata->task.tk_status);
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_ld_write_done(wdata);
+ else {
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+ int how)
+{
+ struct objlayout_io_state *state;
+ ssize_t status;
+
+ dprintk("%s: Begin inode %p offset %llu count %u\n",
+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->args.pages,
+ wdata->args.pgbase,
+ wdata->args.offset,
+ wdata->args.count,
+ wdata->lseg, wdata,
+ GFP_NOFS);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->sync = how & FLUSH_SYNC;
+
+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ wdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/*
* Get Device Info API for io engines
*/
struct objlayout_deviceinfo {