author    | Dmitry Torokhov | 2016-12-16 18:31:17 +0100
committer | Dmitry Torokhov | 2016-12-16 18:31:17 +0100
commit    | f26e8817b235d8764363bffcc9cbfc61867371f2 (patch)
tree      | 6546ea2cf91b78f1ada2161db61e21085c880740 /fs/nfs
parent    | Input: change KEY_DATA from 0x275 to 0x277 (diff)
parent    | Merge branch 'synaptics-rmi4' into next (diff)
Merge branch 'next' into for-linus
Prepare input updates for 4.10 merge window.
Diffstat (limited to 'fs/nfs')
49 files changed, 2367 insertions, 1285 deletions
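The most mechanical change running through the blocklayout hunks below is the block-layer API update that drops the read/write argument from submit_bio(): bl_submit_bio() loses its rw parameter, the direction is tagged onto the bio with bio_set_op_attrs() when it is built, and later code reads it back with bio_op(). The following is a minimal sketch of that calling convention, not code from the patch; the helper name submit_prepared_bio() is illustrative only.

```c
/*
 * Sketch of the post-4.8 calling convention used in the blocklayout.c
 * hunks below: the data direction travels inside the bio rather than
 * being passed as a separate argument to submit_bio().
 */
static void submit_prepared_bio(struct bio *bio, int rw)
{
	/* Tag the bio with its operation (READ or WRITE) once, at setup time. */
	bio_set_op_attrs(bio, rw, 0);

	/* Later code recovers the direction from the bio itself... */
	if (bio_op(bio) == READ)
		pr_debug("submitting read bio\n");

	/* ...and submit_bio() now takes only the bio. */
	submit_bio(bio);
}
```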
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 17a42e4eb872..217847679f0e 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p) } static struct bio * -bl_submit_bio(int rw, struct bio *bio) +bl_submit_bio(struct bio *bio) { if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", bio->bi_iter.bi_size, + bio_op(bio) == READ ? "read" : "write", + bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector); - submit_bio(rw, bio); + submit_bio(bio); } return NULL; } @@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, if (disk_addr < map->start || disk_addr >= map->start + map->len) { if (!dev->map(dev, disk_addr, map)) return ERR_PTR(-EIO); - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); } disk_addr += map->disk_offset; disk_addr -= map->start; @@ -174,9 +175,10 @@ retry: disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); + bio_set_op_attrs(bio, rw, 0); } if (bio_add_page(bio, page, *len, offset) < *len) { - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); goto retry; } return bio; @@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, false)) { @@ -273,7 +275,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } if (is_hole(&be)) { - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); @@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_submit_bio(READ, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; @@ -344,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work) PAGE_SIZE - 1) & (loff_t)PAGE_MASK; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT); + (end - start) >> SECTOR_SHIFT, end); } pnfs_ld_write_done(hdr); @@ -398,7 +400,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(WRITE, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; @@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) header->res.count = header->args.count; out: - bl_submit_bio(WRITE, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; diff --git 
a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 18e6fd0b9506..efc007f00742 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -141,6 +141,7 @@ struct pnfs_block_layout { struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ bool bl_scsi_layout; + u64 bl_lwb; }; static inline struct pnfs_block_layout * @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl, int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end); int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len); + sector_t len, u64 lwb); bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent *ret, bool rw); int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e5b89675263e..a69ef4e9c24c 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) if (!p) return -EIO; b->simple.nr_sigs = be32_to_cpup(p++); - if (!b->simple.nr_sigs) { - dprintk("no signature\n"); + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { + dprintk("Bad signature count: %d\n", b->simple.nr_sigs); return -EIO; } @@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) memcpy(&b->simple.sigs[i].sig, p, b->simple.sigs[i].sig_len); - b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + b->simple.len += 8 + 4 + \ + (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); } break; case PNFS_BLOCK_VOLUME_SLICE: @@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 4); if (!p) return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->concat.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); if (!p) @@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 8 + 4); if (!p) return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); b->stripe.volumes_count = be32_to_cpup(p++); + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->stripe.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); if (!p) @@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(d->bdev)) { + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); - return PTR_ERR(d->bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); + return PTR_ERR(bdev); } + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); @@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v) } } +/* + * Try to open the udev path for the WWN. At least on Debian the udev + * by-id path will always point to the dm-multipath device if one exists. 
+ */ +static struct block_device * +bl_open_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { + pr_warn("pNFS: failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev)); + } + + kfree(devname); + return bdev; +} + +/* + * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the + * wwn- links will only point to the first discovered SCSI device there. + */ +static struct block_device * +bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, + "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", + v->scsi.designator_type, + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + kfree(devname); + return bdev; +} + static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; const struct pr_ops *ops; - const char *devname; int error; if (!bl_validate_designator(v)) return -EINVAL; - switch (v->scsi.designator_len) { - case 8: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", - v->scsi.designator); - break; - case 12: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", - v->scsi.designator); - break; - case 16: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", - v->scsi.designator); - break; - default: - return -EINVAL; - } - - d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); - if (IS_ERR(d->bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(d->bdev)); - kfree(devname); - return PTR_ERR(d->bdev); - } - - kfree(devname); + bdev = bl_open_dm_mpath_udev_path(v); + if (IS_ERR(bdev)) + bdev = bl_open_udev_path(v); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); d->map = bl_map_simple; @@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ); + blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); return error; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 720b3ff55fa9..c85fbfd2d0d9 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) return be; } +static void __ext_put_deviceids(struct list_head *head) +{ + struct pnfs_block_extent *be, *tmp; + + list_for_each_entry_safe(be, tmp, head, be_list) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } +} + static void __ext_tree_insert(struct rb_root *root, struct pnfs_block_extent *new, bool merge_ok) @@ -163,7 +173,8 @@ free_new: } static int -__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +__ext_tree_remove(struct rb_root *root, + sector_t start, sector_t end, struct list_head *tmp) { struct pnfs_block_extent *be; sector_t len1 = 0, len2 = 0; @@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) struct pnfs_block_extent *next = ext_tree_next(be); 
rb_erase(&be->be_node, root); - nfs4_put_deviceid_node(be->be_device); - kfree(be); + list_add_tail(&be->be_list, tmp); be = next; } @@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end) { int err, err2; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (rw) { - err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); if (!err) err = err2; } spin_unlock(&bl->bl_ext_lock); + __ext_put_deviceids(&tmp); return err; } @@ -390,18 +402,19 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, - sector_t len) + sector_t len, u64 lwb) { struct rb_root *root = &bl->bl_ext_rw; sector_t end = start + len; struct pnfs_block_extent *be; int err = 0; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); /* * First remove all COW extents or holes from written to range. */ - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (err) goto out; @@ -458,7 +471,11 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } } out: + if (bl->bl_lwb < lwb) + bl->bl_lwb = lwb; spin_unlock(&bl->bl_ext_lock); + + __ext_put_deviceids(&tmp); return err; } @@ -503,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) } static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, - size_t buffer_size, size_t *count) + size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; int ret = 0; @@ -527,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; } + *lastbyte = bl->bl_lwb - 1; + bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; @@ -549,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) arg->layoutupdate_pages = &arg->layoutupdate_page; retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a7f2e6e33305..52a28311e2a4 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, err_socks: svc_rpcb_cleanup(serv, net); err_bind: + nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %p\n", ret, net); return ret; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 618ced381a14..f953ef6b2f2e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -119,27 +119,30 @@ out: * hashed by filehandle. 
*/ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct nfs_server *server; + struct nfs_inode *nfsi; struct inode *ino; struct pnfs_layout_hdr *lo; +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + nfsi = NFS_I(lo->plh_inode); + if (nfs_compare_fh(fh, &nfsi->fh)) continue; - if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + if (nfsi->layout != lo) continue; ino = igrab(lo->plh_inode); if (!ino) break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { + if (nfsi->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - break; + goto restart; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, } static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh, stateid); + lo = get_layout_by_fh_locked(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, /* * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ -static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, +static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new) { u32 oldseq, newseq; - oldseq = be32_to_cpu(lo->plh_stateid.seqid); + /* Is the stateid still not initialised? */ + if (!pnfs_layout_is_valid(lo)) + return NFS4ERR_DELAY; + + /* Mismatched stateid? */ + if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) + return NFS4ERR_BAD_STATEID; + newseq = be32_to_cpu(new->seqid); + /* Are we already in a layout recall situation? */ + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + lo->plh_return_seq != 0) { + if (newseq < lo->plh_return_seq) + return NFS4ERR_OLD_STATEID; + if (newseq > lo->plh_return_seq) + return NFS4ERR_DELAY; + goto out; + } + /* Check that the stateid matches what we think it should be. */ + oldseq = be32_to_cpu(lo->plh_stateid.seqid); if (newseq > oldseq + 1) - return false; - return true; + return NFS4ERR_DELAY; + /* Crazy server! 
*/ + if (newseq <= oldseq) + return NFS4ERR_OLD_STATEID; +out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, @@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + lo = get_layout_by_fh(clp, &args->cbl_fh); if (!lo) { trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, &args->cbl_stateid, -rv); @@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, } ino = lo->plh_inode; + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); - if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { - rv = NFS4ERR_DELAY; + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + if (rv != NFS_OK) goto unlock; - } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); - spin_unlock(&ino->i_lock); - pnfs_layoutcommit_inode(ino, false); - - spin_lock(&ino->i_lock); /* * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) */ @@ -217,16 +239,19 @@ static u32 initiate_file_draining(struct nfs_client *clp, } if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, - &args->cbl_range)) { + &args->cbl_range, + be32_to_cpu(args->cbl_stateid.seqid))) { rv = NFS4_OK; goto unlock; } + /* Embrace your forgetfulness! */ + rv = NFS4ERR_NOMATCHING_LAYOUT; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } - pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); @@ -429,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp, ((u32 *)&rclist->rcl_sessionid.data)[3], ref->rc_sequenceid, ref->rc_slotid); - spin_lock(&tbl->slot_tbl_lock); - status = (test_bit(ref->rc_slotid, tbl->used_slots) && - tbl->slots[ref->rc_slotid].seq_nr == - ref->rc_sequenceid); - spin_unlock(&tbl->slot_tbl_lock); + status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, + ref->rc_sequenceid, HZ >> 1) < 0; if (status) goto out; } @@ -462,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, goto out; tbl = &clp->cl_session->bc_slot_table; - slot = tbl->slots + args->csa_slotid; /* Set up res before grabbing the spinlock */ memcpy(&res->csr_sessionid, &args->csa_sessionid, @@ -500,8 +521,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, cps->slot = slot; /* The ca_maxresponsesize_cached is 0 with no DRC */ - if (args->csa_cachethis != 0) - return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + if (args->csa_cachethis != 0) { + status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + goto out_unlock; + } /* * Check for pending referring calls. 
If a match is found, a diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 976c90608e56..656f68f7fe53 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid, p, NFS4_STATEID_SIZE); + memcpy(stateid->data, p, NFS4_STATEID_SIZE); return 0; } +static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; @@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - status = decode_stateid(xdr, &args->stateid); + status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; p = read_buf(xdr, 4); @@ -227,6 +233,11 @@ out: } #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, @@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, } p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_stateid(xdr, &args->cbl_stateid); + status = decode_layout_stateid(xdr, &args->cbl_stateid); if (unlikely(status != 0)) goto out; } else if (args->cbl_recall_type == RETURN_FSID) { @@ -914,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) - return rpc_drop_reply; + goto out_invalidcred; } cps.minorversion = hdr_arg.minorversion; @@ -942,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; + +out_invalidcred: + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); + return rpc_autherr_badcred; } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0c96528db94a..1e106780a237 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, */ struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, - const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; @@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr); + return rpc_ops->init_client(new, cl_init); } spin_unlock(&nn->nfs_client_lock); @@ -428,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready); * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, - unsigned int timeo, unsigned int retrans) + int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; @@ -436,9 +434,9 @@ void nfs_init_timeout_values(struct 
rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (to->to_initval == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; @@ -451,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - if (to->to_retries == 0) + if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; - if (!to->to_initval) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; @@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, + const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; @@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, - .timeout = timeparms, + .timeout = cl_init->timeparms, .servername = clp->cl_hostname, + .nodename = cl_init->nodename, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, @@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: IP presentation address (not used) + * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. 
*/ struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { int error; @@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data, struct nfs_subversion *nfs_mod) { + struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, @@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, + .timeparms = &timeparms, }; - struct rpc_timeout timeparms; struct nfs_client *clp; int error; @@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); @@ -1102,7 +1100,6 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1123,7 +1120,6 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; /* diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5166adcfc0fb..322c2585bc34 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp) /** * nfs4_copy_delegation_stateid - Copy inode's state ID information - * @dst: stateid data structure to fill in * @inode: inode to check * @flags: delegation type requirement + * @dst: stateid data structure to fill in + * @cred: optional argument to retrieve credential * * Returns "true" and fills in "dst->data" * if inode had a delegation, * otherwise "false" is returned. 
*/ -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, - fmode_t flags) +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, + nfs4_stateid *dst, struct rpc_cred **cred) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; @@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); + if (cred) + *cred = get_rpccred(delegation->cred); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 333063e032f0..64724d252a79 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 33eb81738d03..177fefb26c18 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate = nfs_readdir, + .iterate_shared = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,6 +145,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { + atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -200,11 +201,20 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); + if (atomic_dec_and_test(&array->refcount)) + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); } +static bool grab_page(struct page *page) +{ + struct nfs_cache_array *array = kmap_atomic(page); + bool res = atomic_inc_not_zero(&array->refcount); + kunmap_atomic(array); + return res; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in @@ -222,7 +232,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le * in a page cache page which kmemleak does not scan. 
*/ kmemleak_not_leak(string->name); - string->hash = full_name_hash(name, len); + string->hash = full_name_hash(NULL, name, len); return 0; } @@ -414,12 +424,17 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; - nfsi = NFS_I(d_inode(dentry)); + inode = d_inode(dentry); + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + + nfsi = NFS_I(inode); if (entry->fattr->fileid == nfsi->fileid) return 1; if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) @@ -470,6 +485,7 @@ static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { struct qstr filename = QSTR_INIT(entry->name, entry->len); + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *dir = d_inode(parent); @@ -486,10 +502,16 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (filename.len == 2 && filename.name[1] == '.') return; } - filename.hash = full_name_hash(filename.name, filename.len); + filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = d_lookup(parent, &filename); - if (dentry != NULL) { +again: + if (!dentry) { + dentry = d_alloc_parallel(parent, &filename, &wq); + if (IS_ERR(dentry)) + return; + } + if (!d_in_lookup(dentry)) { /* Is there a mountpoint here? If so, just exit */ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, &entry->fattr->fsid)) @@ -503,26 +525,21 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) } else { d_invalidate(dentry); dput(dentry); + dentry = NULL; + goto again; } } - dentry = d_alloc(parent, &filename); - if (dentry == NULL) - return; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); - if (IS_ERR(inode)) - goto out; - alias = d_splice_alias(inode, dentry); - if (IS_ERR(alias)) - goto out; - else if (alias) { - nfs_set_verifier(alias, nfs_save_change_attribute(dir)); - dput(alias); - } else - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - + d_lookup_done(dentry); + if (alias) { + if (IS_ERR(alias)) + goto out; + dput(dentry); + dentry = alias; + } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out: dput(dentry); } @@ -643,6 +660,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out_label_free; } memset(array, 0, sizeof(struct nfs_cache_array)); + atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -705,8 +723,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - if (!desc->page->mapping) - nfs_readdir_clear_array(desc->page); + nfs_readdir_clear_array(desc->page); put_page(desc->page); desc->page = NULL; } @@ -714,8 +731,16 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - return read_cache_page(file_inode(desc->file)->i_mapping, + struct page *page; + + for (;;) { + page = read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); + if (IS_ERR(page) || grab_page(page)) + break; + put_page(page); + } + return page; } /* @@ -889,7 +914,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx) ? 
1 : 0; - nfs_block_sillyrename(dentry); if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -925,7 +949,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: - nfs_unblock_sillyrename(dentry); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); @@ -934,13 +957,11 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { - struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -948,16 +969,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - offset = -EINVAL; - goto out; + return -EINVAL; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } -out: - inode_unlock(inode); return offset; } @@ -1350,7 +1368,6 @@ EXPORT_SYMBOL_GPL(nfs_dentry_operations); struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; - struct dentry *parent; struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; @@ -1380,21 +1397,18 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; - parent = dentry->d_parent; - /* Protect against concurrent sillydeletes */ trace_nfs_lookup_enter(dir, dentry, flags); - nfs_block_sillyrename(parent); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); - goto out_unblock_sillyrename; + goto out_label; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); res = ERR_CAST(inode); if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; /* Success: notify readdir to use READDIRPLUS */ nfs_advise_use_readdirplus(dir); @@ -1403,12 +1417,11 @@ no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; dentry = res; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -out_unblock_sillyrename: - nfs_unblock_sillyrename(parent); +out_label: trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); out: @@ -1471,11 +1484,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode, int *opened) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + bool switched = false; int err; /* Expect a negative dentry */ @@ -1490,7 +1505,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { - if (!d_unhashed(dentry)) { + if (!d_in_lookup(dentry)) { /* * Hashed negative dentry with O_DIRECTORY: dentry was * revalidated and is fine, no need to perform lookup @@ -1514,22 +1529,31 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, attr.ia_size = 0; } + if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) { + d_drop(dentry); + switched = true; + dentry = d_alloc_parallel(dentry->d_parent, + &dentry->d_name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if 
(unlikely(!d_in_lookup(dentry))) + return finish_no_open(file, dentry); + } + ctx = create_nfs_open_context(dentry, open_flags); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); - nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); - nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); + d_drop(dentry); switch (err) { case -ENOENT: - d_drop(dentry); d_add(dentry, NULL); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; @@ -1551,14 +1575,23 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: + if (unlikely(switched)) { + d_lookup_done(dentry); + dput(dentry); + } return err; no_open: res = nfs_lookup(dir, dentry, lookup_flags); - err = PTR_ERR(res); + if (switched) { + d_lookup_done(dentry); + if (!res) + res = dentry; + else + dput(dentry); + } if (IS_ERR(res)) - goto out; - + return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); @@ -1766,7 +1799,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) trace_nfs_rmdir_enter(dir, dentry); if (d_really_is_positive(dentry)) { - nfs_wait_on_sillyrename(dentry); + down_write(&NFS_I(d_inode(dentry))->rmdir_sem); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { @@ -1776,6 +1809,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) case -ENOENT: nfs_dentry_handle_enoent(dentry); } + up_write(&NFS_I(d_inode(dentry))->rmdir_sem); } else error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); trace_nfs_rmdir_exit(dir, dentry, error); @@ -2218,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; - int err = -ENOENT; + bool retry = true; + int err; spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) - goto out_zap; - cache = nfs_access_search_rbtree(inode, cred); - if (cache == NULL) - goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) - goto out_stale; + for(;;) { + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + err = -ENOENT; + if (cache == NULL) + goto out; + /* Found an entry, is our attribute cache valid? 
*/ + if (!nfs_attribute_cache_expired(inode) && + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + break; + err = -ECHILD; + if (!may_block) + goto out; + if (!retry) + goto out_zap; + spin_unlock(&inode->i_lock); + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (err) + return err; + spin_lock(&inode->i_lock); + retry = false; + } res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; @@ -2241,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str out: spin_unlock(&inode->i_lock); return err; -out_stale: - rb_erase(&cache->rb_node, &nfsi->access_cache); - list_del(&cache->lru); - spin_unlock(&inode->i_lock); - nfs_access_free_entry(cache); - return -ENOENT; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); @@ -2273,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); + if (err) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; - err = 0; out: rcu_read_unlock(); return err; @@ -2368,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; + bool may_block = (mask & MAY_NOT_BLOCK) == 0; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached_rcu(inode, cred, &cache); if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; status = -ECHILD; - if (mask & MAY_NOT_BLOCK) + if (!may_block) goto out; /* Be clever: ask server to check for all possible rights */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c93826e4a8c6..72b7d13ee3c6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -87,6 +87,7 @@ struct nfs_direct_req { int mirror_count; ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ io_start, /* start of IO */ error; /* any reported error */ @@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; + WARN_ON_ONCE(dreq->count >= dreq->max_count); + if (dreq->mirror_count == 1) { dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; dreq->count += hdr->good_bytes; @@ -193,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); } +static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, + const struct nfs_writeverf *v2) +{ + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); +} + /* * nfs_direct_cmp_hdr_verf - compare verifier for pgio header * @dreq - direct request possibly spanning multiple servers @@ -212,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, nfs_direct_set_hdr_verf(dreq, hdr); return 0; } - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &hdr->verf); } /* @@ -235,22 +244,20 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, if (verfp->committed < 0) return 1; - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &data->verf); } /** * nfs_direct_IO - NFS 
address space operation for direct I/O * @iocb: target I/O control block - * @iov: array of vectors that define I/O buffer - * @pos: offset in file to begin the operation - * @nr_segs: size of iovec array + * @iter: I/O buffer * * The presence of this routine in the address space ops vector means * the NFS client supports direct I/O. However, for most direct IO, we * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -261,7 +268,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_read(iocb, iter); return nfs_file_direct_write(iocb, iter); } @@ -275,7 +282,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->inode->i_lock; + cinfo->inode = dreq->inode; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; @@ -350,10 +357,12 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) result = wait_for_completion_killable(&dreq->completion); + if (!result) { + result = dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } if (!result) result = dreq->error; - if (!result) - result = dreq->count; out: return (ssize_t) result; @@ -363,28 +372,18 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. 
*/ -static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; - if (dreq->iocb && write) { - loff_t pos = dreq->iocb->ki_pos + dreq->count; - - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - - if (write) - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_end(inode); if (dreq->iocb) { long res = (long) dreq->error; - if (!res) + if (dreq->count != 0) { res = (long) dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } dreq->iocb->ki_complete(dreq->iocb, res, 0); } @@ -396,7 +395,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_readpage_release(struct nfs_page *req) { dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - d_inode(req->wb_context->dentry)->i_sb->s_id, + req->wb_context->dentry->d_sb->s_id, (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); @@ -431,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); hdr->release(hdr); } @@ -537,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); return 0; } @@ -545,7 +544,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data - * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -561,8 +559,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. 
*/ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -574,27 +571,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); + file, count, (long long) iocb->ki_pos); result = 0; if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; - dreq->bytes_left = count; - dreq->io_start = pos; + dreq->bytes_left = dreq->max_count = count; + dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -605,24 +597,21 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) - iocb->ki_pos = pos + result; + iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -632,13 +621,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) @@ -654,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) dreq->mirrors[i].count = 0; get_dreq(dreq); @@ -673,13 +664,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); nfs_list_add_request(req, &failed); - spin_lock(cinfo.lock); + spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) dreq->error = desc.pg_error; else dreq->error = -EIO; - spin_unlock(cinfo.lock); + spin_unlock(&cinfo.inode->i_lock); } nfs_release_request(req); } @@ -772,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_direct_complete(dreq, true); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); + nfs_direct_complete(dreq); } } @@ -969,7 +961,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of 
user buffers from which to write data - * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -989,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; + size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; @@ -999,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, - iov_iter_count(iter)); + result = generic_write_checks(iocb, iter); + if (result <= 0) + return result; + count = result; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - - task_io_account_write(iov_iter_count(iter)); + task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; - dreq->bytes_left = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -1038,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1045,28 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { - struct inode *inode = mapping->host; - iocb->ki_pos = pos + result; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); - generic_write_sync(file, pos, result); + /* XXX: should check the generic_write_sync retval */ + generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index be01095b97ae..ca699ddc11c1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -164,18 +164,20 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to, iocb->ki_pos); + return nfs_file_direct_read(iocb, to); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ 
-191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); - inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - /* - * Wait for O_DIRECT to complete - */ - inode_dio_wait(mapping->host); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx)) { + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { status = nfs_wb_all(mapping->host); if (status < 0) return status; @@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, */ static int nfs_release_page(struct page *page, gfp_t gfp) { - struct address_space *mapping = page->mapping; - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Always try to initiate a 'commit' if relevant, but only - * wait for it if the caller allows blocking. Even then, - * only wait 1 second and only if the 'bdi' is not congested. - * Waiting indefinitely can cause deadlocks when the NFS - * server is on this machine, when a new TCP connection is - * needed and in other rare cases. There is no particular - * need to wait extensively here. A short wait has the - * benefit that someone else can worry about the freezer. 
- */ - if (mapping) { - struct nfs_server *nfss = NFS_SERVER(mapping->host); - nfs_commit_inode(mapping->host, 0); - if (gfpflags_allow_blocking(gfp) && - !bdi_write_congested(&nfss->backing_dev_info)) { - wait_on_page_bit_killable_timeout(page, PG_private, - HZ); - if (PagePrivate(page)) - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } - } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; @@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); + sb_start_pagefault(inode->i_sb); + /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); @@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: unlock_page(page); out: + sb_end_pagefault(inode->i_sb); return ret; } @@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx)) + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } @@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_iter_count(from); result = nfs_key_timeout_notify(file, inode); if (result) return result; - if (iocb->ki_flags & IOCB_DIRECT) { - result = generic_write_checks(iocb, from); - if (result <= 0) - return result; + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_write(iocb, from); - } dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) iocb->ki_pos); + file, iov_iter_count(from), (long long) iocb->ki_pos); - result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; /* @@ -684,28 +646,36 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - result = count; - if (!count) + nfs_start_io_write(inode); + result = generic_write_checks(iocb, from); + if (result > 0) { + current->backing_dev_info = inode_to_bdi(inode); + result = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + } + nfs_end_io_write(inode); + if (result <= 0) goto out; - result = generic_file_write_iter(iocb, from); - if (result > 0) - written = result; + result = generic_write_sync(iocb, result); + if (result < 0) + goto out; + written = result; + iocb->ki_pos += written; /* Return error values */ - if (result >= 0 && nfs_need_check_write(file, inode)) { + if (nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; } - if (result > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; + return -EBUSY; } EXPORT_SYMBOL_GPL(nfs_file_write); @@ -780,11 +750,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) } static int -is_time_granular(struct timespec *ts) { - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); -} - -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; @@ -817,12 +782,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. 
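In the rewritten nfs_file_write() above, the copy into the page cache (generic_write_checks() plus generic_perform_write()) runs inside the nfs_start_io_write()/nfs_end_io_write() region, while generic_write_sync() and the ki_pos update happen after that region has been left. Below is a toy model of the ordering, using a pthread rwlock as a stand-in for inode->i_rwsem; all names are illustrative.

#include <stdio.h>
#include <pthread.h>

static pthread_rwlock_t i_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static long ki_pos;

static long perform_write(const char *buf, long len) { (void)buf; return len; }
static long write_sync(long written) { return written; }

static long file_write(const char *buf, long len)
{
    long result;

    pthread_rwlock_wrlock(&i_rwsem);   /* nfs_start_io_write() */
    result = perform_write(buf, len);  /* checks + copy into the page cache */
    pthread_rwlock_unlock(&i_rwsem);   /* nfs_end_io_write() */
    if (result <= 0)
        return result;
    result = write_sync(result);       /* generic_write_sync() */
    if (result < 0)
        return result;
    ki_pos += result;                  /* position advanced only after a successful sync */
    return result;
}

int main(void)
{
    long n = file_write("data", 4);

    printf("wrote %ld, pos now %ld\n", n, ki_pos);
    return 0;
}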
*/ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - nfs_zap_caches(inode); - } + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_zap_mapping(inode, filp->f_mapping); out: return status; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3384dc8e6683..a3fc48ba4931 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed != NFS_DATA_SYNC) + hdr->res.verf->committed == NFS_FILE_SYNC) return; + if (hdr->res.verf->committed == NFS_DATA_SYNC) + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); + /* Note: if the write is unstable, don't set end_offs until commit */ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, } filelayout_set_layoutcommit(hdr); + + /* zero out the fattr */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } @@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -795,7 +803,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets >= size) goto out; for (i = 0; i < cinfo->ds->nbuckets; i++) { @@ -811,7 +819,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, swap(cinfo->ds->buckets, buckets); cinfo->ds->nbuckets = size; out: - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); kfree(buckets); return 0; } @@ -890,6 +898,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -915,6 +924,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0cb1abd535e3..51b51369704c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -26,6 +26,8 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +static struct group_info *ff_zero_group; + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -35,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) if (ffl) { INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); + ffl->last_report_time = ktime_get(); return &ffl->generic_hdr; } else return NULL; @@ -53,14 +56,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) 
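The filelayout_set_layoutcommit() hunk above only derives an end offset from the reply when the server answered NFS_DATA_SYNC; a FILE_SYNC reply needs no layoutcommit at all, and an unstable reply leaves end_offs at zero so the later COMMIT supplies the value. A compact sketch of that decision, with an assumed helper name and enum; it is not the kernel function.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

enum stable_how { NFS_UNSTABLE, NFS_DATA_SYNC, NFS_FILE_SYNC };

/* FILE_SYNC: no layoutcommit needed. DATA_SYNC: the last written byte is
 * known now. UNSTABLE: report zero and let the commit path fill it in. */
static bool needs_layoutcommit(enum stable_how committed,
                               uint64_t offset, uint64_t count,
                               uint64_t *end_offs)
{
    if (committed == NFS_FILE_SYNC)
        return false;
    *end_offs = (committed == NFS_DATA_SYNC) ? offset + count : 0;
    return true;
}

int main(void)
{
    uint64_t end = 0;

    if (needs_layoutcommit(NFS_DATA_SYNC, 4096, 512, &end))
        printf("layoutcommit up to %llu\n", (unsigned long long)end);
    return 0;
}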
kfree(FF_LAYOUT_FROM_HDR(lo)); } -static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return -ENOBUFS; - memcpy(stateid, p, NFS4_STATEID_SIZE); + stateid->type = NFS4_PNFS_DS_STATEID_TYPE; + memcpy(stateid->data, p, NFS4_STATEID_SIZE); dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, p[0], p[1], p[2], p[3]); return 0; @@ -211,10 +215,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { + struct rpc_cred *cred; + ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); - if (mirror->cred) - put_rpccred(mirror->cred); + cred = rcu_access_pointer(mirror->ro_cred); + if (cred) + put_rpccred(cred); + cred = rcu_access_pointer(mirror->rw_cred); + if (cred) + put_rpccred(cred); nfs4_ff_layout_put_deviceid(mirror->mirror_ds); kfree(mirror); } @@ -290,6 +300,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, { u64 new_end, old_end; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + return false; if (new->pls_range.iomode != old->pls_range.iomode) return false; old_end = pnfs_calc_offset_end(old->pls_range.offset, @@ -310,8 +322,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new_end); if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) set_bit(NFS_LSEG_ROC, &new->pls_flags); - if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) - set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); return true; } @@ -407,8 +417,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; - u32 ds_count; - u32 fh_count; + struct auth_cred acred = { .group_info = ff_zero_group }; + struct rpc_cred __rcu *cred; + u32 ds_count, fh_count, id; int j; rc = -EIO; @@ -456,7 +467,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->efficiency = be32_to_cpup(p); /* stateid */ - rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); if (rc) goto out_err_free; @@ -484,24 +495,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->fh_versions_cnt = fh_count; /* user */ - rc = decode_name(&stream, &fls->mirror_array[i]->uid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.uid = make_kuid(&init_user_ns, id); + /* group */ - rc = decode_name(&stream, &fls->mirror_array[i]->gid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.gid = make_kgid(&init_user_ns, id); + + /* find the cred for it */ + rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags)); + if (IS_ERR(cred)) { + rc = PTR_ERR(cred); + goto out_err_free; + } + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + else + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->ro_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + } else { + cred = xchg(&mirror->rw_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; } - dprintk("%s: 
uid %d gid %d\n", __func__, - fls->mirror_array[i]->uid, - fls->mirror_array[i]->gid); + dprintk("%s: iomode %s uid %u gid %u\n", __func__, + lgr->range.iomode == IOMODE_READ ? "READ" : "RW", + from_kuid(&init_user_ns, acred.uid), + from_kgid(&init_user_ns, acred.gid)); } p = xdr_inline_decode(&stream, 4); @@ -605,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, { static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; - if (ktime_equal(mirror->last_report_time, notime)) - mirror->last_report_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; - if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >= report_interval) { - mirror->last_report_time = now; + ffl->last_report_time = now; return true; } @@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, else { int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets != 0) kfree(buckets); else { @@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, NFS_INVALID_STABLE_HOW; } } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return 0; } } @@ -771,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; + bool fail_return = false; int idx; /* mirrors are sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (idx+1 == fls->mirror_array_cnt) + fail_return = true; + ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); if (ds) { *best_idx = idx; return ds; @@ -786,6 +824,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static void +ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, + bool strict_iomode) +{ +retry_strict: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + strict_iomode, + GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } + + /* If we don't have checking, do get a IOMODE_RW + * segment, and the server wants to avoid READs + * there, then retry! 
+ */ + if (pgio->pg_lseg && !strict_iomode && + ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { + strict_iomode = true; + goto retry_strict; + } +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -794,27 +862,28 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs4_pnfs_ds *ds; int ds_idx; +retry: /* Use full layout for now */ - if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - GFP_KERNEL); - if (IS_ERR(pgio->pg_lseg)) { - pgio->pg_error = PTR_ERR(pgio->pg_lseg); - pgio->pg_lseg = NULL; - return; - } - } + if (!pgio->pg_lseg) + ff_layout_pg_get_read(pgio, req, false); + else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) + ff_layout_pg_get_read(pgio, req, true); + /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); - if (!ds) - goto out_mds; + if (!ds) { + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; + } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgio->pg_mirror_idx = ds_idx; @@ -841,12 +910,14 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; +retry: if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -870,8 +941,15 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); - if (!ds) - goto out_mds; + if (!ds) { + if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_mds; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; + } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; @@ -895,6 +973,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1067,8 +1146,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: - if (ff_layout_no_fallback_to_mds(lseg) || - ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; reset: dprintk("%s Retry through MDS. Error %d\n", __func__, @@ -1215,8 +1293,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1248,19 +1324,20 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) * we always send layoutcommit after DS writes. 
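The retry loops added to ff_layout_pg_init_read() and ff_layout_pg_init_write() above handle the case where no data server can be prepared but the layout forbids falling back to the MDS: the layout segment is dropped, the code sleeps for a second (ssleep(1)) and a fresh layout is requested. A rough user-space model of that control flow; prepare_ds() is a stub standing in for nfs4_ff_layout_prepare_ds().

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool prepare_ds(int attempt) { return attempt > 0; }   /* stub */

/* Returns 0 when I/O can go over pNFS, 1 when it falls back to the MDS. */
static int pg_init(bool no_fallback_to_mds)
{
    int attempt = 0;

    for (;;) {
        if (prepare_ds(attempt))
            return 0;              /* got a usable DS */
        if (!no_fallback_to_mds)
            return 1;              /* send the I/O through the MDS */
        /* drop the layout segment, wait, then ask for a fresh layout */
        sleep(1);                  /* ssleep(1) in the hunk above */
        attempt++;
    }
}

int main(void)
{
    printf("path: %s\n", pg_init(true) ? "MDS" : "pNFS");
    return 0;
}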
*/ static void -ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +ff_layout_set_layoutcommit(struct inode *inode, + struct pnfs_layout_segment *lseg, + loff_t end_offset) { - if (!ff_layout_need_layoutcommit(hdr->lseg)) + if (!ff_layout_need_layoutcommit(lseg)) return; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); + pnfs_set_layoutcommit(inode, lseg, end_offset); + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + (unsigned long long) NFS_I(inode)->layout->plh_lwb); } static bool -ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) { /* No mirroring for now */ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - if (ff_layout_has_available_ds(hdr->lseg)) - pnfs_read_resend_pnfs(hdr); - else - ff_layout_reset_read(hdr); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } - hdr->pgio_done_cb = ff_layout_read_done_cb; ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1398,6 +1469,7 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); @@ -1423,7 +1495,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr); + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; + + /* Note: if the write is unstable, don't set end_offs until commit */ + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; @@ -1459,9 +1534,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1496,14 +1569,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - bool retry_pnfs; - - retry_pnfs = ff_layout_has_available_ds(hdr->lseg); - dprintk("%s task %u reset io to %s\n", __func__, - task->tk_pid, retry_pnfs ? 
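With the hunks above, the read and write prepare callbacks no longer re-drive the request through the MDS themselves; when ff_layout_device_unavailable() reports the device as dead they simply fail the RPC with -EHOSTDOWN and leave the fallback decision to the completion path. A very small sketch of that shape; the errno value is the usual Linux one and the names are illustrative.

#include <stdio.h>

#define EHOSTDOWN 112   /* Linux value, shown here only for illustration */

enum dev_state { DEV_OK, DEV_UNAVAILABLE };

/* Prepare-time check only: fail the call and let the completion path
 * decide whether to resend through pNFS or through the MDS. */
static int read_prepare(enum dev_state dev)
{
    if (dev == DEV_UNAVAILABLE)
        return -EHOSTDOWN;   /* rpc_exit(task, -EHOSTDOWN) in the hunk above */
    return 0;
}

int main(void)
{
    printf("%d\n", read_prepare(DEV_UNAVAILABLE));
    return 0;
}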
"pNFS" : "MDS"); - ff_layout_reset_write(hdr, retry_pnfs); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } @@ -1712,7 +1779,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1720,6 +1787,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + hdr->pgio_done_cb = ff_layout_read_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1737,11 +1805,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, 0, RPC_TASK_SOFTCONN); - + put_rpccred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; return PNFS_NOT_ATTEMPTED; } @@ -1769,7 +1837,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_NOT_ATTEMPTED; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) return PNFS_NOT_ATTEMPTED; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1798,6 +1866,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, sync, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); return PNFS_ATTEMPTED; } @@ -1824,7 +1893,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct rpc_clnt *ds_clnt; struct rpc_cred *ds_cred; u32 idx; - int vers; + int vers, ret; struct nfs_fh *fh; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); @@ -1838,7 +1907,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_err; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1854,10 +1923,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? 
&ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, how, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); + return ret; out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); @@ -2223,6 +2294,11 @@ static int __init nfs4flexfilelayout_init(void) { printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", __func__); + if (!ff_zero_group) { + ff_zero_group = groups_alloc(0); + if (!ff_zero_group) + return -ENOMEM; + } return pnfs_register_layoutdriver(&flexfilelayout_type); } @@ -2231,6 +2307,10 @@ static void __exit nfs4flexfilelayout_exit(void) printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&flexfilelayout_type); + if (ff_zero_group) { + put_group_info(ff_zero_group); + ff_zero_group = NULL; + } } MODULE_ALIAS("nfs-layouttype4-4"); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index dd353bb7dc0a..3ee0c9fcea76 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -10,7 +10,8 @@ #define FS_NFS_NFS4FLEXFILELAYOUT_H #define FF_FLAGS_NO_LAYOUTCOMMIT 1 -#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 #include "../pnfs.h" @@ -76,15 +77,13 @@ struct nfs4_ff_layout_mirror { u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - u32 uid; - u32 gid; - struct rpc_cred *cred; + struct rpc_cred __rcu *ro_cred; + struct rpc_cred __rcu *rw_cred; atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; - ktime_t last_report_time; u32 report_interval; }; @@ -101,6 +100,7 @@ struct nfs4_flexfile_layout { struct pnfs_ds_commit_info commit_info; struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ + ktime_t last_report_time; /* Layoutstat report times */ }; static inline struct nfs4_flexfile_layout * @@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) } static inline bool +ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; +} + +static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { return nfs4_test_deviceid_unavailable(node); @@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); + #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index add0e5a70bd6..f7a3f6b05369 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -17,8 +17,8 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; -static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; +static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; +static unsigned int dataserver_retrans; void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { @@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, return e1->opnum < e2->opnum ? 
-1 : 1; if (e1->status != e2->status) return e1->status < e2->status ? -1 : 1; - ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + ret = memcmp(e1->stateid.data, e2->stateid.data, + sizeof(e1->stateid.data)); if (ret != 0) return ret; ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); @@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, return 0; } -/* currently we only support AUTH_NONE and AUTH_SYS */ -static rpc_authflavor_t -nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +static struct rpc_cred * +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) { - if (mirror->uid == (u32)-1) - return RPC_AUTH_NULL; - return RPC_AUTH_UNIX; -} + struct rpc_cred *cred, __rcu **pcred; -/* fetch cred for NFSv3 DS */ -static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_pnfs_ds *ds) -{ - if (ds->ds_clp && !mirror->cred && - mirror->mirror_ds->ds_versions[0].version == 3) { - struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; - struct rpc_cred *cred; - struct auth_cred acred = { - .uid = make_kuid(&init_user_ns, mirror->uid), - .gid = make_kgid(&init_user_ns, mirror->gid), - }; - - /* AUTH_NULL ignores acred */ - cred = auth->au_ops->lookup_cred(auth, &acred, 0); - if (IS_ERR(cred)) { - dprintk("%s: lookup_cred failed with %ld\n", - __func__, PTR_ERR(cred)); - return PTR_ERR(cred); - } else { - if (cmpxchg(&mirror->cred, NULL, cred)) - put_rpccred(cred); - } - } - return 0; + if (iomode == IOMODE_READ) + pcred = &mirror->ro_cred; + else + pcred = &mirror->rw_cred; + + rcu_read_lock(); + do { + cred = rcu_dereference(*pcred); + if (!cred) + break; + + cred = get_rpccred_rcu(cred); + } while(!cred); + rcu_read_unlock(); + return cred; } struct nfs_fh * @@ -356,7 +343,23 @@ out: return fh; } -/* Upon return, either ds is connected, or ds is NULL */ +/** + * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call + * @lseg: the layout segment we're operating on + * @ds_idx: index of the DS to use + * @fail_return: return layout on connect failure? + * + * Try to prepare a DS connection to accept an RPC call. This involves + * selecting a mirror to use and connecting the client to it if it's not + * already connected. + * + * Since we only need a single functioning mirror to satisfy a read, we don't + * want to return the layout if there is one. For writes though, any down + * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish + * between the two cases. + * + * Returns a pointer to a connected DS object on success or NULL on failure. 
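ff_layout_get_mirror_cred() above reads the per-mirror ro_cred or rw_cred pointer under rcu_read_lock() and loops because get_rpccred_rcu() can fail when the credential is already being torn down; callers later drop the reference with put_rpccred(). The sketch below models that try-get-and-retry pattern with C11 atomics instead of RCU, so it is an illustration of the idea rather than the kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cred { atomic_int count; };

static _Atomic(struct cred *) ro_cred;

/* get_rpccred_rcu() stand-in: take a reference only if the count is
 * still non-zero, otherwise report failure so the caller re-reads. */
static bool try_get(struct cred *c)
{
    int old = atomic_load(&c->count);

    while (old > 0)
        if (atomic_compare_exchange_weak(&c->count, &old, old + 1))
            return true;
    return false;
}

static struct cred *get_mirror_cred(void)
{
    struct cred *c;

    do {
        c = atomic_load(&ro_cred);
        if (!c)
            break;               /* no per-mirror cred: fall back to the MDS cred */
    } while (!try_get(c));
    return c;
}

int main(void)
{
    static struct cred c = { 1 };

    atomic_store(&ro_cred, &c);
    printf("got cred: %d\n", get_mirror_cred() != NULL);
    return 0;
}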
+ */ struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, bool fail_return) @@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - rpc_authflavor_t flavor; if (!ff_layout_mirror_valid(lseg, mirror)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -377,15 +379,13 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) - goto out; + goto out_fail; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out_update_creds; - - flavor = nfs4_ff_layout_choose_authflavor(mirror); + goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. @@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version, - flavor); + RPC_AUTH_UNIX); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -405,25 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].rsize = max_payload; if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) mirror->mirror_ds->ds_versions[0].wsize = max_payload; - } else { - ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, - lseg->pls_range.length, NFS4ERR_NXIO, - OP_ILLEGAL, GFP_NOIO); - if (!fail_return) { - if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lseg->pls_layout->plh_flags); - else - pnfs_error_mark_layout_for_return(ino, lseg); - } else - pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; goto out; } -out_update_creds: - if (ff_layout_update_mirror_cred(mirror, ds)) - ds = NULL; + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); +out_fail: + if (fail_return || !ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(ino, lseg); + ds = NULL; out: return ds; } @@ -433,16 +424,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct rpc_cred *cred = ERR_PTR(-EINVAL); - - if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) - goto out; + struct rpc_cred *cred; - if (mirror && mirror->cred) - cred = mirror->cred; - else - cred = mdscred; -out: + if (mirror) { + cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + if (!cred) + cred = get_rpccred(mdscred); + } else { + cred = get_rpccred(mdscred); + } return cred; } @@ -562,6 +552,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return ff_rw_layout_has_available_ds(lseg); } +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg) +{ + return ff_layout_no_fallback_to_mds(lseg) || + ff_layout_has_available_ds(lseg); +} + +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return lseg->pls_range.iomode == IOMODE_RW && + ff_layout_no_read_on_rw(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request 
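The two new predicates at the end of the hunk above combine the layout flags: ff_layout_avoid_read_on_rw() steers READs away from an IOMODE_RW segment carrying FF_FLAGS_NO_READ_IO, and ff_layout_avoid_mds_available_ds() avoids the MDS either because the layout forbids it or because a data server is still usable. A stand-alone sketch using the flag values from flexfilelayout.h above:

#include <stdbool.h>
#include <stdio.h>

#define FF_FLAGS_NO_LAYOUTCOMMIT 1
#define FF_FLAGS_NO_IO_THRU_MDS  2
#define FF_FLAGS_NO_READ_IO      4

enum iomode { IOMODE_READ = 1, IOMODE_RW = 2 };

/* READs issued against an RW segment are retried with a strict READ
 * layout when the server asked us not to read from this segment. */
static bool avoid_read_on_rw(unsigned flags, enum iomode iomode)
{
    return iomode == IOMODE_RW && (flags & FF_FLAGS_NO_READ_IO);
}

/* Skip the MDS fallback when the layout forbids it, or while at least
 * one data server is still usable. */
static bool avoid_mds(unsigned flags, bool has_available_ds)
{
    return (flags & FF_FLAGS_NO_IO_THRU_MDS) || has_available_ds;
}

int main(void)
{
    printf("%d %d\n", avoid_read_on_rw(FF_FLAGS_NO_READ_IO, IOMODE_RW),
           avoid_mds(0, true));
    return 0;
}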
before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 738c84a42eb0..bf4ec5ecc97e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -282,6 +282,7 @@ nfs_init_locked(struct inode *inode, void *opaque) struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); + inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } @@ -661,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - err = nfs_sync_inode(inode); - inode_unlock(inode); + err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } @@ -878,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (ctx->mode & FMODE_WRITE) + list_add(&ctx->list, &nfsi->open_files); + else + list_add_tail(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -971,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + /* pNFS: Attributes aren't updated until we layoutcommit */ + if (S_ISREG(inode->i_mode)) { + status = pnfs_sync_inode(inode, false); + if (status) + goto out; + } + status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1121,14 +1130,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? */ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1177,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1192,27 +1194,28 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, false); + struct inode *inode = &nfsi->vfs_inode; + + assert_spin_locked(&inode->i_lock); + + if (!S_ISREG(inode->i_mode)) + return false; + if (list_empty(&nfsi->open_files)) + return false; + /* Note: This relies on nfsi->open_files being ordered with writers + * being placed at the head of the list. 
+ * See nfs_inode_attach_open_context() + */ + return (list_first_entry(&nfsi->open_files, + struct nfs_open_context, + list)->mode & FMODE_WRITE) == FMODE_WRITE; } -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. - */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, true); + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) @@ -1279,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (!nfs_file_has_buffered_writers(nfsi)) { + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } } - if (nfsi->nrequests != 0) - invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? 
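nfs_inode_attach_open_context() above inserts write-capable contexts at the head of nfsi->open_files and read-only ones at the tail, which is what lets nfs_file_has_writers() answer the question by looking only at the first list entry. A toy singly linked list model of that invariant (illustrative types and names):

#include <stdbool.h>
#include <stdio.h>

struct ctx { bool write; struct ctx *next; };

static struct ctx *open_files;

static void attach(struct ctx *c)
{
    if (c->write) {                 /* list_add(): writers go to the head */
        c->next = open_files;
        open_files = c;
    } else {                        /* list_add_tail(): readers go to the tail */
        struct ctx **p = &open_files;

        while (*p)
            p = &(*p)->next;
        c->next = NULL;
        *p = c;
    }
}

/* One look at the head is enough to know whether any writer is open. */
static bool has_writers(void)
{
    return open_files && open_files->write;
}

int main(void)
{
    struct ctx r = { false, NULL }, w = { true, NULL };

    attach(&r);
    attach(&w);
    printf("writers: %d\n", has_writers());
    return 0;
}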
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1469,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -/* - * Don't trust the change_attribute, mtime, ctime or size if - * a pnfs LAYOUTCOMMIT is outstanding - */ -static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, - struct nfs_fattr *fattr) -{ - if (pnfs_layoutcommit_outstanding(inode)) - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | - NFS_ATTR_FATTR_MTIME | - NFS_ATTR_FATTR_CTIME | - NFS_ATTR_FATTR_SIZE); -} - static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); - nfs_inode_attrs_handle_layoutcommit(inode, fattr); - if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1526,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + unsigned long invalid = NFS_INO_INVALID_ATTR; /* * Don't revalidate the pagecache if we hold a delegation, but do @@ -1675,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", @@ -1724,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); + if (pnfs_layoutcommit_outstanding(inode)) { + nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + cache_revalidated = false; + } + /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* Could it be a race with writeback? */ + if (!have_writers) { + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + } inode->i_version = fattr->change_attr; } } else { @@ -1767,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
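The nfs_update_inode() hunk above only invalidates the attribute and data caches on a change-attribute update when there are no local buffered writers, on the theory that with writers present the change is most likely our own writeback racing with the GETATTR reply; the stored change attribute is updated either way. A small model of that decision, with invented flag values:

#include <stdbool.h>
#include <stdio.h>

#define INVALID_ATTR 0x1
#define INVALID_DATA 0x2

/* Returns the invalidation flags to apply; always records the server's
 * change attribute, but only invalidates caches when no local buffered
 * writer could explain the change. */
static unsigned change_attr_update(unsigned long long *local,
                                   unsigned long long server,
                                   bool have_writers)
{
    unsigned invalid = 0;

    if (*local != server) {
        if (!have_writers)
            invalid |= INVALID_ATTR | INVALID_DATA;
        *local = server;
    }
    return invalid;
}

int main(void)
{
    unsigned long long v = 1;

    printf("%#x\n", change_attr_update(&v, 2, true));   /* 0: our own writeback */
    printf("%#x\n", change_attr_update(&v, 3, false));  /* 0x3: caches invalidated */
    return 0;
}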
*/ - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + if (nfsi->nrequests == 0 || new_isize > cur_isize) { i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!have_writers) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1958,9 +1957,7 @@ static void init_once(void *foo) nfsi->nrequests = 0; nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); - atomic_set(&nfsi->silly_count, 1); - INIT_HLIST_HEAD(&nfsi->silly_list); - init_waitqueue_head(&nfsi->waitqueue); + init_rwsem(&nfsi->rmdir_sem); nfs4_init_once(nfsi); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f1d1d2c472e9..74935a19e4bf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -58,6 +58,9 @@ struct nfs_clone_mount { */ #define NFS_UNSPEC_PORT (-1) +#define NFS_UNSPEC_RETRANS (UINT_MAX) +#define NFS_UNSPEC_TIMEO (UINT_MAX) + /* * Maximum number of pages that readdir can use for creating * a vmapped array of pages. @@ -66,13 +69,16 @@ struct nfs_clone_mount { struct nfs_client_initdata { unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; + const char *hostname; /* Hostname of the server */ + const struct sockaddr *addr; /* Address of the server */ + const char *nodename; /* Hostname of the client */ + const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; struct net *net; + const struct rpc_timeout *timeparms; }; /* @@ -147,14 +153,13 @@ extern void nfs_umount(const struct nfs_mount_request *info); extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); -int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - const struct rpc_timeout *, const char *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); -void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); @@ -184,7 +189,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, rpc_authflavor_t); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -193,7 +198,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); -extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, 
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor); @@ -338,8 +343,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); /* dir.c */ extern void nfs_force_use_readdirplus(struct inode *dir); @@ -411,6 +415,19 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + +static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) +{ + return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; +} + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, @@ -477,6 +494,7 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -495,9 +513,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); + +#ifdef CONFIG_NFS_V4_1 +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ + int i; + + for (i = 0; i < cinfo->nbuckets; i++) + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} +#else +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ +} +#endif + + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); @@ -505,6 +543,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +static inline int +nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, + const struct nfs_write_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, @@ -520,8 +565,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); @@ -622,7 +666,7 @@ void nfs_mark_page_unstable(struct page 
*page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_node_page_state(page, NR_UNSTABLE_NFS); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/nfs_fs.h> + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_end_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. 
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 17c0fa1eccfa..720d92f5abfb 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -11,6 +11,38 @@ #define NFSDBG_FACILITY NFSDBG_PROC +/* + * nfs3_prepare_get_acl, nfs3_complete_get_acl, nfs3_abort_get_acl: Helpers for + * caching get_acl results in a race-free way. See fs/posix_acl.c:get_acl() + * for explanations. + */ +static void nfs3_prepare_get_acl(struct posix_acl **p) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) { + /* Not the first reader or sentinel already in place. */ + } +} + +static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + /* Only cache the ACL if our sentinel is still in place. */ + posix_acl_dup(acl); + if (cmpxchg(p, sentinel, acl) != sentinel) + posix_acl_release(acl); +} + +static void nfs3_abort_get_acl(struct posix_acl **p) +{ + struct posix_acl *sentinel = uncached_acl_sentinel(current); + + /* Remove our sentinel upon failure. 
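The new fs/nfs/io.c above replaces the old inode_lock()/inode_dio_wait() serialisation with inode->i_rwsem plus the NFS_INO_ODIRECT flag: whichever mode currently owns the inode (buffered or direct) runs under the shared lock, and switching modes requires the exclusive lock plus a flush or drain. Below is a compilable user-space model of that idea using pthreads and C11 atomics; it deliberately omits downgrade_write(), which pthreads lacks, and the flush/drain work, so it is an illustration rather than a faithful port.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t i_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool odirect;       /* NFS_INO_ODIRECT stand-in */

/* Optimistic shared lock; if the inode is in O_DIRECT mode, drop it,
 * take the exclusive lock, clear the flag (the kernel also drains
 * in-flight direct I/O here), then retry the fast path. */
static void start_io_read(void)
{
    for (;;) {
        pthread_rwlock_rdlock(&i_rwsem);
        if (!atomic_load(&odirect))
            return;                     /* buffered mode: keep the shared lock */
        pthread_rwlock_unlock(&i_rwsem);
        pthread_rwlock_wrlock(&i_rwsem);
        atomic_store(&odirect, false);  /* nfs_block_o_direct() */
        pthread_rwlock_unlock(&i_rwsem);
    }
}

static void end_io_read(void)
{
    pthread_rwlock_unlock(&i_rwsem);
}

/* Direct I/O is the mirror image: set the flag (after flushing dirty
 * pages in the kernel) and then run under the shared lock too. */
static void start_io_direct(void)
{
    for (;;) {
        pthread_rwlock_rdlock(&i_rwsem);
        if (atomic_load(&odirect))
            return;
        pthread_rwlock_unlock(&i_rwsem);
        pthread_rwlock_wrlock(&i_rwsem);
        atomic_store(&odirect, true);   /* nfs_block_buffered() */
        pthread_rwlock_unlock(&i_rwsem);
    }
}

int main(void)
{
    start_io_read();
    puts("buffered read running");
    end_io_read();
    start_io_direct();
    puts("direct I/O running");
    pthread_rwlock_unlock(&i_rwsem);    /* nfs_end_io_direct() */
    return 0;
}

The effect matches the helpers above: any number of same-mode requests run concurrently under the shared lock, while a mode switch briefly excludes everybody.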
*/ + cmpxchg(p, sentinel, ACL_NOT_CACHED); +} + struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); @@ -55,6 +87,11 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) if (res.fattr == NULL) return ERR_PTR(-ENOMEM); + if (args.mask & NFS_ACL) + nfs3_prepare_get_acl(&inode->i_acl); + if (args.mask & NFS_DFACL) + nfs3_prepare_get_acl(&inode->i_default_acl); + status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -89,12 +126,12 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } if (res.mask & NFS_ACL) - set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + nfs3_complete_get_acl(&inode->i_acl, res.acl_access); else forget_cached_acl(inode, ACL_TYPE_ACCESS); if (res.mask & NFS_DFACL) - set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + nfs3_complete_get_acl(&inode->i_default_acl, res.acl_default); else forget_cached_acl(inode, ACL_TYPE_DEFAULT); @@ -108,6 +145,8 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) } getout: + nfs3_abort_get_acl(&inode->i_acl); + nfs3_abort_get_acl(&inode->i_default_acl); posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa347a948..ee753547fb0a 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v3, .proto = ds_proto, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* Use the MDS nfs_client cl_ipaddr. 
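The nfs3_get_acl() changes above copy the race-free caching scheme from fs/posix_acl.c: a per-task sentinel is planted in the cache slot before the GETACL RPC, and the reply is only cached if that sentinel is still in place, so any concurrent invalidation wins. A stand-alone sketch of the cmpxchg protocol, with the ACL pointer reduced to an integer:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ACL_NOT_CACHED ((uintptr_t)0)

static _Atomic uintptr_t cache_slot = ACL_NOT_CACHED;

/* Plant the sentinel only if nobody cached anything (or planted a
 * sentinel of their own) in the meantime. */
static void prepare(uintptr_t sentinel)
{
    uintptr_t expect = ACL_NOT_CACHED;

    atomic_compare_exchange_strong(&cache_slot, &expect, sentinel);
}

/* Cache the result only if our sentinel survived; otherwise drop it. */
static int complete(uintptr_t sentinel, uintptr_t acl)
{
    uintptr_t expect = sentinel;

    return atomic_compare_exchange_strong(&cache_slot, &expect, acl);
}

/* On failure, remove our sentinel so the slot reads "not cached" again. */
static void abort_get(uintptr_t sentinel)
{
    uintptr_t expect = sentinel;

    atomic_compare_exchange_strong(&cache_slot, &expect, ACL_NOT_CACHED);
}

int main(void)
{
    prepare(1);
    printf("cached: %d\n", complete(1, 42));
    abort_get(1);
    return 0;
}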
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); return clp; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cb28cceefebe..698be9361280 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -144,7 +144,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -404,7 +404,7 @@ out: } static int -nfs3_proc_remove(struct inode *dir, struct qstr *name) +nfs3_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), @@ -480,7 +480,7 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs3_linkargs arg = { .fromfh = NFS_FH(inode), @@ -582,7 +582,7 @@ out: } static int -nfs3_proc_rmdir(struct inode *dir, struct qstr *name) +nfs3_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs_fattr *dir_attr; struct nfs3_diropargs arg = { diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b587ccd31083..b6cd15314bab 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index dff83460e5a6..64b43b4ad9dd 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -113,19 +113,135 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; - nfs_wb_all(inode); inode_lock(inode); + err = nfs_sync_inode(inode); + if (err) + goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - +out_unlock: inode_unlock(inode); return err; } +static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, + struct nfs_lock_context *src_lock, + struct file *dst, loff_t pos_dst, + struct nfs_lock_context *dst_lock, + size_t count) +{ + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + int status; + + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); + if (status) + return status; + + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + pos_src, pos_src + (loff_t)count - 1); + if (status) + return status; + + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); + if (status) + return 
status; + + status = nfs_sync_inode(dst_inode); + if (status) + return status; + + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_COPY; + if (status) + return status; + + if (res.write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (status) + return status; + } + + truncate_pagecache_range(dst_inode, pos_dst, + pos_dst + res.write_res.count); + + return res.write_res.count; +} + +ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, + struct file *dst, loff_t pos_dst, + size_t count) +{ + struct nfs_server *server = NFS_SERVER(file_inode(dst)); + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + ssize_t err, err2; + + if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) + return -EOPNOTSUPP; + + src_lock = nfs_get_lock_context(nfs_file_open_context(src)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst); + dst_exception.state = dst_lock->open_context->state; + + do { + inode_lock(file_inode(dst)); + err = _nfs42_proc_copy(src, pos_src, src_lock, + dst, pos_dst, dst_lock, count); + inode_unlock(file_inode(dst)); + + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } + + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { @@ -153,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - nfs_wb_all(inode); + status = nfs_filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) @@ -198,10 +318,22 @@ static void nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *server = NFS_SERVER(data->args.inode); + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid); + spin_unlock(&inode->i_lock); nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, &data->res.seq_res, task); + } static void @@ -218,12 +350,14 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&data->args.stateid, + if (pnfs_layout_is_valid(lo) && + 
nfs4_stateid_match(&data->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); @@ -231,18 +365,29 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * Mark the bad layout state as invalid, then retry * with the current stateid. */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else spin_unlock(&inode->i_lock); break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; - default: - break; } dprintk("%s server returns %d\n", __func__, task->tk_status); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0ca482a51e53..8b2605882a20 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -9,9 +9,22 @@ #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) +#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 /* wr_count */ + \ + 1 /* wr_committed */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) #define encode_allocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_copy_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 + 2 + 2 + 1 + 1 + 1) +#define decode_copy_maxsz (op_decode_hdr_maxsz + \ + NFS42_WRITE_RES_SIZE + \ + 1 /* cr_consecutive */ + \ + 1 /* cr_synchronous */) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -49,6 +62,16 @@ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_maxsz) +#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_copy(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + + encode_uint64(xdr, args->src_pos); + encode_uint64(xdr, args->dst_pos); + encode_uint64(xdr, args->count); + + encode_uint32(xdr, 1); /* consecutive = true */ + encode_uint32(xdr, 1); /* synchronous = true */ + encode_uint32(xdr, 0); /* src server list */ +} + static void encode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, } /* + * Encode COPY request + */ +static void nfs4_xdr_enc_copy(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_copy_args *args) +{ + struct 
compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_copy(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -266,6 +326,70 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) return decode_op_hdr(xdr, OP_ALLOCATE); } +static int decode_write_response(struct xdr_stream *xdr, + struct nfs42_write_res *res) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + /* + * We never use asynchronous mode, so warn if a server returns + * a stateid. + */ + if (unlikely(*p != 0)) { + pr_err_once("%s: server has set unrequested " + "asynchronous mode\n", __func__); + return -EREMOTEIO; + } + p++; + p = xdr_decode_hyper(p, &res->count); + res->verifier.committed = be32_to_cpup(p); + return decode_verifier(xdr, &res->verifier.verifier); + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy_requirements(struct xdr_stream *xdr, + struct nfs42_copy_res *res) { + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + goto out_overflow; + + res->consecutive = be32_to_cpup(p++); + res->synchronous = be32_to_cpup(p++); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COPY); + if (status == NFS4ERR_OFFLOAD_NO_REQS) { + status = decode_copy_requirements(xdr, res); + if (status) + return status; + return NFS4ERR_OFFLOAD_NO_REQS; + } else if (status) + return status; + + status = decode_write_response(xdr, &res->write_res); + if (status) + return status; + + return decode_copy_requirements(xdr, res); +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -331,6 +455,36 @@ out: } /* + * Decode COPY response + */ +static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_copy_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy(xdr, res); +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4afdee420d25..9bf64eacba5b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -185,6 +185,7 @@ struct nfs4_state { struct nfs4_exception { struct nfs4_state *state; struct inode *inode; + nfs4_stateid *stateid; long timeout; unsigned char delay : 1, recovering : 1, @@ -224,7 +225,8 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, + const struct qstr *); struct vfsmount 
*nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, @@ -251,7 +253,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, struct page *page, struct rpc_cred *); extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); -extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); extern const struct xattr_handler *nfs4_xattr_handlers[]; @@ -394,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *); extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); +extern void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed); + /* nfs4state.c */ struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); @@ -438,8 +444,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, const struct nfs_lockowner *); +extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, + const struct nfs_lockowner *, nfs4_stateid *, + struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -496,12 +503,15 @@ extern struct svc_version nfs4_callback_version4; static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) { - memcpy(dst, src, sizeof(*dst)); + memcpy(dst->data, src->data, sizeof(dst->data)); + dst->type = src->type; } static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) { - return memcmp(dst, src, sizeof(*dst)) == 0; + if (dst->type != src->type) + return false; + return memcmp(dst->data, src->data, sizeof(dst->data)) == 0; } static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8b5853..cd3b7cfdde16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * Returns pointer to an NFS client, or an ERR_PTR value. 
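The nfs4_stateid_copy() and nfs4_stateid_match() inlines in the nfs4_fs.h hunk above stop treating the stateid as one opaque blob: they now handle the opaque bytes and the new type tag separately, so two stateids with identical bytes but different types (say, an open stateid and a layout stateid) no longer compare equal. A simplified standalone sketch of that pattern follows; the struct here is a stand-in, not the kernel's nfs4_stateid union, and the 16-byte size is an assumption for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* simplified stand-in for nfs4_stateid: opaque bytes plus a type tag */
struct stateid {
	uint8_t data[16];
	int type;
};

static void stateid_copy(struct stateid *dst, const struct stateid *src)
{
	/* copy the opaque bytes and the tag explicitly, not the whole struct */
	memcpy(dst->data, src->data, sizeof(dst->data));
	dst->type = src->type;
}

static bool stateid_match(const struct stateid *a, const struct stateid *b)
{
	if (a->type != b->type)
		return false;	/* same bytes, different kind: not a match */
	return memcmp(a->data, b->data, sizeof(a->data)) == 0;
}

int main(void)
{
	struct stateid a = { .data = { 1, 2, 3 }, .type = 1 };
	struct stateid b;

	stateid_copy(&b, &a);
	b.type = 2;				/* same bytes, different kind */
	return stateid_match(&a, &b) ? 1 : 0;	/* exits 0: no match */
}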
*/ struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .ip_addr = ip_addr, .nfs_mod = &nfs_v4, .proto = proto, .minorversion = minorversion, .net = net, + .timeparms = timeparms, }; struct nfs_client *clp; int error; @@ -809,12 +811,17 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + clp = nfs_get_client(&cl_init, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; } + if (server->nfs_client == clp) { + error = -ELOOP; + goto error; + } + /* * Query for the lease time on clientid setup or renewal * @@ -842,20 +849,24 @@ error: * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v4, .proto = ds_proto, .minorversion = minor_version, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -863,14 +874,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS * (section 13.1 RFC 5661). 
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d0390516467c..d085ad794884 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_sync_inode(inode); + filemap_write_and_wait(inode->i_mapping); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); @@ -129,6 +129,16 @@ nfs4_file_flush(struct file *file, fl_owner_t id) } #ifdef CONFIG_NFS_V4_2 +static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + if (file_inode(file_in) == file_inode(file_out)) + return -EINVAL; + + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); +} + static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) { loff_t ret; @@ -243,6 +253,7 @@ const struct file_operations nfs4_file_operations = { .check_flags = nfs_check_flags, .setlease = simple_nosetlease, #ifdef CONFIG_NFS_V4_2 + .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, .clone_file_range = nfs42_clone_file_range, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 5ba22c6b0ffa..c444285bb1b1 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -201,7 +201,7 @@ int nfs_idmap_init(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f592672373cb..d21104912676 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -208,7 +208,7 @@ static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, */ struct rpc_clnt * nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) + const struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; @@ -397,7 +397,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 327b8c34d360..a9dec32ba9ba 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -74,6 +74,17 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +/* file attributes which can be mapped to nfs attributes */ +#define NFS4_VALID_ATTRS (ATTR_MODE \ + | ATTR_UID \ + | ATTR_GID \ + | ATTR_SIZE \ + | ATTR_ATIME \ + | ATTR_MTIME \ + | ATTR_CTIME \ + | ATTR_ATIME_SET \ + | ATTR_MTIME_SET) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -352,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + const nfs4_stateid *stateid = 
exception->stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -365,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode && nfs_async_inode_return_delegation(inode, - NULL) == 0) - goto wait_on_recovery; + if (inode) { + int err; + + err = nfs_async_inode_return_delegation(inode, + stateid); + if (err == 0) + goto wait_on_recovery; + if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { + exception->retry = 1; + break; + } + } if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -416,6 +437,8 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -611,15 +634,11 @@ out_sleep: } EXPORT_SYMBOL_GPL(nfs40_setup_sequence); -static int nfs40_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; struct nfs4_slot_table *tbl; - if (slot == NULL) - goto out; - tbl = slot->table; spin_lock(&tbl->slot_tbl_lock); if (!nfs41_wake_and_assign_slot(tbl, slot)) @@ -627,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task, spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; -out: +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); return 1; } @@ -643,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) tbl = slot->table; session = tbl->session; + /* Bump the slot sequence number */ + if (slot->seq_done) + slot->seq_nr++; + slot->seq_done = 0; + spin_lock(&tbl->slot_tbl_lock); /* Be nice to the server: try to ensure that the last transmitted * value for highest_user_slotid <= target_highest_slotid @@ -663,9 +693,12 @@ out_unlock: res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); + if (waitqueue_active(&tbl->slot_waitq)) + wake_up_all(&tbl->slot_waitq); } -int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +static int nfs41_sequence_process(struct rpc_task *task, + struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; @@ -691,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) switch (res->sr_status) { case 0: /* Update the slot's sequence and clientid lease timer */ - ++slot->seq_nr; + slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ @@ -746,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) goto retry_nowait; default: /* Just update the slot sequence no. */ - ++slot->seq_nr; + slot->seq_done = 1; } out: /* The session may be reset by one of the error handlers. 
*/ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(res); out_noaction: return ret; retry_nowait: if (rpc_restart_call_prepare(task)) { + nfs41_sequence_free_slot(res); task->tk_status = 0; ret = 0; } @@ -766,8 +799,37 @@ out_retry: rpc_delay(task, NFS4_POLL_RETRY_MAX); return 0; } + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (!nfs41_sequence_process(task, res)) + return 0; + if (res->sr_slot != NULL) + nfs41_sequence_free_slot(res); + return 1; + +} EXPORT_SYMBOL_GPL(nfs41_sequence_done); +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (res->sr_slot->table->session != NULL) + return nfs41_sequence_process(task, res); + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) { + if (res->sr_slot->table->session != NULL) + nfs41_sequence_free_slot(res); + else + nfs40_sequence_free_slot(res); + } +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { if (res->sr_slot == NULL) @@ -897,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server, args, res, task); } +static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res) +{ + if (res->sr_slot != NULL) + nfs40_sequence_free_slot(res); +} + int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -1174,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref) struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); + nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); @@ -1633,9 +1707,14 @@ err: static struct nfs4_state * nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *ret; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) - return _nfs4_opendata_reclaim_to_nfs4_state(data); - return _nfs4_opendata_to_nfs4_state(data); + ret =_nfs4_opendata_reclaim_to_nfs4_state(data); + else + ret = _nfs4_opendata_to_nfs4_state(data); + nfs4_sequence_free_slot(&data->o_res.seq_res); + return ret; } static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) @@ -2033,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + if (!nfs4_sequence_process(task, &data->o_res.seq_res)) return; if (task->tk_status == 0) { @@ -2558,15 +2637,20 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr, &label); - - nfs_fattr_init(opendata->o_res.f_attr); - status = nfs4_do_setattr(state->inode, cred, - opendata->o_res.f_attr, sattr, - state, label, olabel); - if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr, - opendata->o_res.f_attr); - nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + /* + * send create attributes which was not set by open + * with an extra setattr. 
+ */ + if (sattr->ia_valid & NFS4_VALID_ATTRS) { + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } } if (opened && opendata->file_created) @@ -2652,46 +2736,32 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, return res; } -static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, - struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, - struct nfs4_label *olabel) +static int _nfs4_do_setattr(struct inode *inode, + struct nfs_setattrargs *arg, + struct nfs_setattrres *res, + struct rpc_cred *cred, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, - .server = server, - .bitmask = server->attr_bitmask, - .label = ilabel, - }; - struct nfs_setattrres res = { - .fattr = fattr, - .label = olabel, - .server = server, - }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = arg, + .rpc_resp = res, .rpc_cred = cred, }; + struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; bool truncate; int status; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); - - nfs_fattr_init(fattr); + nfs_fattr_init(res->fattr); /* Servers should only apply open mode checks for file size changes */ - truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? 
FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2700,16 +2770,20 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; if (!nfs4_valid_open_stateid(state)) return -EBADF; - if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner) == -EIO) + if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, + &arg->stateid, &delegation_cred) == -EIO) return -EBADF; } else - nfs4_stateid_copy(&arg.stateid, &zero_stateid); + nfs4_stateid_copy(&arg->stateid, &zero_stateid); + if (delegation_cred) + msg.rpc_cred = delegation_cred; - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); + + put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); - trace_nfs4_setattr(inode, &arg.stateid, status); + trace_nfs4_setattr(inode, &arg->stateid, status); return status; } @@ -2719,13 +2793,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; struct nfs4_exception exception = { .state = state, .inode = inode, + .stateid = &arg.stateid, }; int err; + + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + err = _nfs4_do_setattr(inode, &arg, &res, cred, state); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -2860,12 +2952,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) call_close |= is_wronly; else if (is_wronly) calldata->arg.fmode |= FMODE_WRITE; + if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE)) + call_close |= is_rdwr; } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (calldata->arg.fmode == 0) - call_close |= is_rdwr; - if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -3246,13 +3337,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs4_do_find_root_sec(struct nfs_server *server, - struct nfs_fh *fhandle, struct nfs_fsinfo *info) -{ - int mv = server->nfs_client->cl_minorversion; - return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); -} - /** * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle @@ -3272,7 +3356,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = nfs4_do_find_root_sec(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec(server, + fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -3509,7 +3594,7 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - 
struct qstr *name, struct nfs_fh *fhandle, + const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; @@ -3551,7 +3636,7 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, +static int nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -3567,7 +3652,7 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, } struct rpc_clnt * -nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, +nfs4_proc_lookup_mountpoint(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct rpc_clnt *client = NFS_CLIENT(dir); @@ -3726,7 +3811,7 @@ out: return status; } -static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) +static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs args = { @@ -3749,7 +3834,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) return status; } -static int nfs4_proc_remove(struct inode *dir, struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; @@ -3777,7 +3862,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) { - nfs4_setup_sequence(NFS_SERVER(data->dir), + nfs4_setup_sequence(NFS_SB(data->dentry->d_sb), &data->args.seq_args, &data->res.seq_res, task); @@ -3832,7 +3917,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 1; } -static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_link_arg arg = { @@ -3879,7 +3964,7 @@ out: return status; } -static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; @@ -3901,7 +3986,7 @@ struct nfs4_createdata { }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, - struct qstr *name, struct iattr *sattr, u32 ftype) + const struct qstr *name, struct iattr *sattr, u32 ftype) { struct nfs4_createdata *data; @@ -4208,12 +4293,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str err = _nfs4_do_fsinfo(server, fhandle, fsinfo); trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); if (err == 0) { - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo->lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + nfs4_set_lease_period(server->nfs_client, + fsinfo->lease_time * HZ, + now); break; } err = nfs4_handle_exception(server, err, &exception); @@ -4285,7 +4367,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, if (l_ctx != NULL) lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); + return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -4371,7 +4453,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct 
rpc_message *msg) { hdr->timestamp = jiffies; - hdr->pgio_done_cb = nfs4_read_done_cb; + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); } @@ -4993,12 +5076,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6054,6 +6136,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; int status = -ENOLCK; @@ -6068,6 +6151,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(state->inode, request); if (status < 0) goto out; + mutex_lock(&sp->so_delegreturn_mutex); down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ @@ -6075,9 +6159,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; @@ -6255,18 +6341,18 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - void *buf, size_t buflen) + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) @@ -6277,22 +6363,22 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - void *buf, size_t buflen) + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) { if 
(security_ismaclabel(key)) - return nfs4_get_security_label(d_inode(dentry), buf, buflen); + return nfs4_get_security_label(inode, buf, buflen); return -EOPNOTSUPP; } @@ -7351,9 +7437,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. */ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7371,8 +7459,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7476,18 +7564,26 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); @@ -7820,40 +7916,36 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); - int ret; dprintk("--> %s\n", __func__); - /* Note the is a race here, where a CB_LAYOUTRECALL can come in - * right now covering the LAYOUTGET we are about to send. - * However, that is not so catastrophic, and there seems - * to be no way to prevent it completely. 
- */ - if (nfs41_setup_sequence(session, &lgp->args.seq_args, - &lgp->res.seq_res, task)) - return; - ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, - NFS_I(lgp->args.inode)->layout, - &lgp->args.range, - lgp->args.ctx->state); - if (ret < 0) - rpc_exit(task, ret); + nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task); + dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_process(task, &lgp->res.seq_res); + dprintk("<-- %s\n", __func__); +} + +static int +nfs4_layoutget_handle_exception(struct rpc_task *task, + struct nfs4_layoutget *lgp, struct nfs4_exception *exception) +{ struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - struct nfs4_state *state = NULL; - unsigned long timeo, now, giveup; + int nfs4err = task->tk_status; + int err, status = 0; + LIST_HEAD(head); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - if (!nfs41_sequence_done(task, &lgp->res.seq_res)) - goto out; - - switch (task->tk_status) { + switch (nfs4err) { case 0: goto out; @@ -7863,88 +7955,67 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * retry go inband. */ case -NFS4ERR_LAYOUTUNAVAILABLE: - task->tk_status = -ENODATA; + status = -ENODATA; goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_BADLAYOUT: - goto out_overflow; + status = -EOVERFLOW; + goto out; /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client * (or clients) writing to the same RAID stripe except when * the minlength argument is 0 (see RFC5661 section 18.43.3). + * + * Treat it like we would RECALLCONFLICT -- we retry for a little + * while, and then eventually give up. */ case -NFS4ERR_LAYOUTTRYLATER: - if (lgp->args.minlength == 0) - goto out_overflow; - /* - * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall - * existing layout before getting a new one). - */ - case -NFS4ERR_RECALLCONFLICT: - timeo = rpc_get_timeout(task->tk_client); - giveup = lgp->args.timestamp + timeo; - now = jiffies; - if (time_after(giveup, now)) { - unsigned long delay; - - /* Delay for: - * - Not less then NFS4_POLL_RETRY_MIN. - * - One last time a jiffie before we give up - * - exponential backoff (time_now minus start_attempt) - */ - delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, - min((giveup - now - 1), - now - lgp->args.timestamp)); - - dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", - __func__, delay); - rpc_delay(task, delay); - /* Do not call nfs4_async_handle_error() */ - goto out_restart; + if (lgp->args.minlength == 0) { + status = -EOVERFLOW; + goto out; } + status = -EBUSY; + break; + case -NFS4ERR_RECALLCONFLICT: + status = -ERECALLCONFLICT; break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + exception->timeout = 0; spin_lock(&inode->i_lock); - if (nfs4_stateid_match(&lgp->args.stateid, + lo = NFS_I(inode)->layout; + /* If the open stateid was bad, then recover it. */ + if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + nfs4_stateid_match_other(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); - /* If the open stateid was bad, then recover it. 
*/ - state = lgp->args.ctx->state; + exception->state = lgp->args.ctx->state; break; } - lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&lgp->args.stateid, - &lo->plh_stateid)) { - LIST_HEAD(head); - - /* - * Mark the bad layout state as invalid, then retry - * with the current stateid. - */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); - spin_unlock(&inode->i_lock); - pnfs_free_lseg_list(&head); - } else - spin_unlock(&inode->i_lock); - goto out_restart; + + /* + * Mark the bad layout state as invalid, then retry + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + status = -EAGAIN; + goto out; + } + + err = nfs4_handle_exception(server, nfs4err, exception); + if (!status) { + if (exception->retry) + status = -EAGAIN; + else + status = err; } - if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) - goto out_restart; out: dprintk("<-- %s\n", __func__); - return; -out_restart: - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; -out_overflow: - task->tk_status = -EOVERFLOW; - goto out; + return status; } static size_t max_response_pages(struct nfs_server *server) @@ -8013,7 +8084,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -8033,6 +8104,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_exception exception = { + .inode = inode, + .timeout = *timeout, + }; int status = 0; dprintk("--> %s\n", __func__); @@ -8046,7 +8121,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; - lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; @@ -8056,16 +8130,21 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) - status = task->tk_status; + if (status == 0) { + status = nfs4_layoutget_handle_exception(task, lgp, &exception); + *timeout = exception.timeout; + } + trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); + nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8092,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; server = NFS_SERVER(lrp->args.inode); @@ -8104,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; + nfs4_sequence_free_slot(&lrp->res.seq_res); rpc_restart_call_prepare(task); return; } @@ -8118,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); 
spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); - pnfs_mark_layout_returned_if_empty(lo); - if (lrp->res.lrs_present) + if (lrp->res.lrs_present) { + pnfs_mark_matching_lsegs_invalid(lo, &freeme, + &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); + nfs4_sequence_free_slot(&lrp->res.seq_res); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); @@ -8653,6 +8737,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + if (s1->type != s2->type) + return false; + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) return false; @@ -8793,6 +8880,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE + | NFS_CAP_COPY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS @@ -8821,7 +8909,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; -ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { ssize_t error, error2; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e1ba58c3d1ad..82e77198d17e 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp) cancel_delayed_work_sync(&clp->cl_renewd); } +/** + * nfs4_set_lease_period - Sets the lease period on a nfs_client + * + * @clp: pointer to nfs_client + * @lease: new value for lease period + * @lastrenewed: time at which lease was last renewed + */ +void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed) +{ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = lease; + clp->cl_last_renewal = lastrenewed; + spin_unlock(&clp->cl_lock); + + /* Cap maximum reconnect timeout at 1/2 lease period */ + rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); +} + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 332d06e64fa9..b62973045a3e 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) tbl->highest_used_slotid = NFS4_NO_SLOT; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_waitqueue_head(&tbl->slot_waitq); init_completion(&tbl->complete); } @@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) return ERR_PTR(-E2BIG); } +static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid, + u32 *seq_nr) + __must_hold(&tbl->slot_tbl_lock) +{ + struct nfs4_slot *slot; + + slot = nfs4_lookup_slot(tbl, slotid); + if (IS_ERR(slot)) + return PTR_ERR(slot); + *seq_nr = slot->seq_nr; + return 0; +} + +/* + * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use + * + * Given a slot table, slot id and sequence number, determine if the + * RPC call in question is still in flight. This function is mainly + * intended for use by the callback channel. 
+ */ +static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr) +{ + u32 cur_seq; + bool ret = false; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 && + cur_seq == seq_nr && test_bit(slotid, tbl->used_slots)) + ret = true; + spin_unlock(&tbl->slot_tbl_lock); + return ret; +} + +/* + * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete + * + * Given a slot table, slot id and sequence number, wait until the + * corresponding RPC call completes. This function is mainly + * intended for use by the callback channel. + */ +int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout) +{ + if (wait_event_timeout(tbl->slot_waitq, + !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr), + timeout) == 0) + return -ETIMEDOUT; + return 0; +} + /* * nfs4_alloc_slot - efficiently look for a free slot * diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 5b51298d1d03..f703b755351b 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -21,7 +21,8 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1; + unsigned int interrupted : 1, + seq_done : 1; }; /* Sessions */ @@ -36,6 +37,7 @@ struct nfs4_slot_table { unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ spinlock_t slot_tbl_lock; struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + wait_queue_head_t slot_waitq; /* Completion wait on slot */ u32 max_slots; /* # slots in table */ u32 max_slotid; /* Max allowed slotid value */ u32 highest_used_slotid; /* sent to server on each SEQ. @@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid); +extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_nr, + unsigned long timeout); extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..cada00aa5096 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -65,7 +65,10 @@ #define OPENOWNER_POOL_SIZE 8 -const nfs4_stateid zero_stateid; +const nfs4_stateid zero_stateid = { + { .data = { 0 } }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -274,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; struct nfs_fsinfo fsinfo; + unsigned long now; if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { nfs4_schedule_state_renewal(clp); return 0; } + now = jiffies; status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - spin_unlock(&clp->cl_lock); - + nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); nfs4_schedule_state_renewal(clp); } @@ -985,15 +985,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * Byte-range lock aware utility to initialize the stateid of 
read/write * requests. */ -int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner) +int nfs4_select_rw_stateid(struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner, + nfs4_stateid *dst, struct rpc_cred **cred) { - int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + int ret; + + if (cred != NULL) + *cred = NULL; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; /* returns true if delegation stateid found and copied */ - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) { ret = 0; goto out; } @@ -1480,9 +1485,9 @@ restart: } spin_unlock(&state->state_lock); } - nfs4_put_open_state(state); clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + nfs4_put_open_state(state); spin_lock(&sp->so_lock); goto restart; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2c8d05dae5b1..cfb8f7ce5cf6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1235,8 +1235,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, len = 0; __entry->error = error < 0 ? error : 0; __entry->id = id; - memcpy(__get_dynamic_array(name), name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + memcpy(__get_str(name), name, len); + __get_str(name)[len] = 0; ), TP_printk( @@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ + { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) TRACE_EVENT(pnfs_update_layout, @@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout, u64 count, enum pnfs_iomode iomode, struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, enum pnfs_update_layout_reason reason ), - TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_ARGS(inode, pos, count, iomode, lo, lseg, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) @@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout, __field(enum pnfs_iomode, iomode) __field(int, layoutstateid_seq) __field(u32, layoutstateid_hash) + __field(long, lseg) __field(enum pnfs_update_layout_reason, reason) ), TP_fast_assign( @@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout, __entry->layoutstateid_seq = 0; __entry->layoutstateid_hash = 0; } + __entry->lseg = (long)lseg; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s pos=%llu count=%llu " - "layoutstateid=%d:0x%08x (%s)", + "layoutstateid=%d:0x%08x lseg=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout, (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg, show_pnfs_update_layout_reason(__entry->reason) ) ); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 88474a4fc669..7bd3a5c09d31 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p = cpu_to_be32(0); /* reclaim */ encode_nfs4_stateid(xdr, &args->stateid); - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ - p 
= xdr_encode_hyper(p, args->lastbytewritten); + if (args->lastbytewritten != U64_MAX) { + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + } else { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ + } *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ @@ -4270,6 +4275,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } +static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_OPEN_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LOCK_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4278,7 +4301,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -4937,7 +4960,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) if (status == -EIO) goto out; if (status == 0) { - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); if (unlikely(status)) goto out; } else if (status == -NFS4ERR_DENIED) @@ -4966,7 +4989,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); if (status == 0) - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); return status; } @@ -5016,7 +5039,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, __be32 *p; int status; - status = decode_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->delegation); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5096,7 +5119,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) nfs_increment_open_seqid(status, res->seqid); if (status) return status; - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5136,7 +5159,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5148,7 +5171,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5838,6 +5861,12 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int 
decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -5919,7 +5948,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(!p)) goto out_overflow; res->return_on_close = be32_to_cpup(p); - decode_stateid(xdr, &res->stateid); + decode_layout_stateid(xdr, &res->stateid); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -5985,7 +6014,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr, goto out_overflow; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) - status = decode_stateid(xdr, &res->stateid); + status = decode_layout_stateid(xdr, &res->stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -7515,6 +7544,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), PROC(CLONE, enc_clone, dec_clone), + PROC(COPY, enc_copy, dec_copy), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 9f80a086b612..2ca9167bc97d 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) @@ -702,14 +701,14 @@ TRACE_EVENT(nfs_sillyrename_unlink, ), TP_fast_assign( - struct inode *dir = data->dir; + struct inode *dir = d_inode(data->dentry->d_parent); size_t len = data->args.name.len; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error; - memcpy(__get_dynamic_array(name), + memcpy(__get_str(name), data->args.name.name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + __get_str(name)[len] = 0; ), TP_printk( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1f6db4231057..174dd4cf5747 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; - req->wb_index = page_file_index(page); - get_page(page); + if (page) { + req->wb_index = page_file_index(page); + get_page(page); + } req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 89a5ef4df08a..2c93a85eda51 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) * is required. * Note that caller must hold inode->i_lock. 
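For illustration only (this is not from the patch): the decode_open_stateid()/decode_lock_stateid()/decode_delegation_stateid()/decode_layout_stateid() helpers added above all share one shape, tag the stateid with the type the caller expects, then hand off to the single real decoder. A minimal stand-alone sketch of that tag-then-delegate pattern, using hypothetical stand-in types rather than the kernel's nfs4_stateid or xdr_stream:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the kernel's xdr_stream and nfs4_stateid. */
enum stateid_type { STATEID_OPEN, STATEID_LOCK, STATEID_DELEGATION, STATEID_LAYOUT };

struct stateid {
    enum stateid_type type;
    uint32_t seqid;
    uint8_t other[12];
};

struct buf {
    const uint8_t *data;
    size_t len, pos;
};

/* The one real decoder: pulls 16 opaque bytes off the wire buffer. */
static int decode_stateid(struct buf *b, struct stateid *sid)
{
    if (b->len - b->pos < 16)
        return -1;                      /* short buffer */
    memcpy(&sid->seqid, b->data + b->pos, 4);
    memcpy(sid->other, b->data + b->pos + 4, 12);
    b->pos += 16;
    return 0;
}

/* Thin wrappers that only record which kind of stateid the caller expects. */
static int decode_open_stateid(struct buf *b, struct stateid *sid)
{
    sid->type = STATEID_OPEN;
    return decode_stateid(b, sid);
}

static int decode_layout_stateid(struct buf *b, struct stateid *sid)
{
    sid->type = STATEID_LAYOUT;
    return decode_stateid(b, sid);
}

int main(void)
{
    uint8_t wire[16] = { 1, 0, 0, 0 };  /* seqid = 1 (host-endian here, unlike XDR) */
    struct buf b = { wire, sizeof(wire), 0 };
    struct stateid sid;

    if (decode_open_stateid(&b, &sid) == 0)
        printf("type=%d seqid=%u\n", sid.type, sid.seqid);
    return 0;
}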
*/ -static int +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list) { @@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, }; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); } static int @@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) } static void -init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); - smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; + lseg->pls_range = *range; + lseg->pls_seq = be32_to_cpu(stateid->seqid); } static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) @@ -361,8 +364,11 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) + if (list_empty(&lo->plh_segs)) { + if (atomic_read(&lo->plh_outstanding) == 0) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -484,15 +490,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, (end2 == NFS4_MAX_UINT64 || end2 > start1); } -static bool -should_free_lseg(const struct pnfs_layout_range *lseg_range, - const struct pnfs_layout_range *recall_range) -{ - return (recall_range->iomode == IOMODE_ANY || - lseg_range->iomode == recall_range->iomode) && - pnfs_lseg_range_intersecting(lseg_range, recall_range); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -522,13 +519,56 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* Returns count of number of matching invalid lsegs remaining in list - * after call. +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. 
+ */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +static bool +pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool +pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *recall_range, + u32 seq) +{ + if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + return false; + if (recall_range == NULL) + return true; + return pnfs_should_free_range(&lseg->pls_range, recall_range); +} + +/** + * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later + * @lo: layout header containing the lsegs + * @tmp_list: list head where doomed lsegs should go + * @recall_range: optional recall range argument to match (may be NULL) + * @seq: only invalidate lsegs obtained prior to this sequence (may be 0) + * + * Walk the list of lsegs in the layout header, and tear down any that should + * be destroyed. If "recall_range" is specified then the segment must match + * that range. If "seq" is non-zero, then only match segments that were handed + * out at or before that sequence. + * + * Returns number of matching invalid lsegs remaining in list after scanning + * it and purging them. */ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -538,12 +578,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, if (list_empty(&lo->plh_segs)) return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (!recall_range || - should_free_lseg(&lseg->pls_range, recall_range)) { - dprintk("%s: freeing lseg %p iomode %d " + if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { + dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, - lseg, lseg->pls_range.iomode, lseg->pls_range.offset, - lseg->pls_range.length); + lseg, lseg->pls_range.iomode, lseg->pls_seq, + lseg->pls_range.offset, lseg->pls_range.length); if (!mark_lseg_invalid(lseg, tmp_list)) remaining++; } @@ -730,13 +769,12 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) { - return (s32)(s1 - s2) > 0; + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); } /* update lo->plh_stateid with new if is more recent */ @@ -744,24 +782,32 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq, new_barrier; - int empty = list_empty(&lo->plh_segs); + u32 oldseq, newseq, new_barrier = 0; oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + + if (!pnfs_layout_is_valid(lo)) { nfs4_stateid_copy(&lo->plh_stateid, new); - if (update_barrier) { - new_barrier = be32_to_cpu(new->seqid); - } else { - /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. 
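Aside, for clarity: pnfs_seqid_is_newer() above is plain serial-number arithmetic. The unsigned subtraction is reinterpreted as signed, so a seqid that has just wrapped past zero still compares as newer than one sitting near UINT_MAX. A small stand-alone check (ordinary user-space C, not kernel code) of the same comparison:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same idiom as pnfs_seqid_is_newer(): true if s1 is "newer" than s2,
 * treating the 32-bit sequence space as circular. */
static bool seqid_is_newer(uint32_t s1, uint32_t s2)
{
    return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
    assert(seqid_is_newer(5, 4));                /* ordinary case */
    assert(!seqid_is_newer(4, 5));
    assert(seqid_is_newer(2, UINT32_MAX));       /* wrapped: 2 is newer than 0xffffffff */
    assert(!seqid_is_newer(UINT32_MAX, 2));
    return 0;
}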
- */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + lo->plh_barrier = newseq; + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return; } + if (pnfs_seqid_is_newer(newseq, oldseq)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); + } + if (update_barrier) + new_barrier = be32_to_cpu(new->seqid); + else if (new_barrier == 0) + return; + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } static bool @@ -781,50 +827,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -int -pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state) -{ - int status = 0; - - dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo)) { - status = -EAGAIN; - } else if (!nfs4_valid_open_stateid(open_state)) { - status = -EBADF; - } else if (list_empty(&lo->plh_segs) || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { - int seq; - - do { - seq = read_seqbegin(&open_state->seqlock); - nfs4_stateid_copy(dst, &open_state->stateid); - } while (read_seqretry(&open_state->seqlock, seq)); - } else - nfs4_stateid_copy(dst, &lo->plh_stateid); - spin_unlock(&lo->plh_inode->i_lock); - dprintk("<-- %s\n", __func__); - return status; -} - /* -* Get layout from server. -* for now, assume that whole file layouts are requested. -* arg->offset: 0 -* arg->length: all ones -*/ + * Get layout from server. + * for now, assume that whole file layouts are requested. + * arg->offset: 0 + * arg->length: all ones + */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, + nfs4_stateid *stateid, const struct pnfs_layout_range *range, - gfp_t gfp_flags) + long *timeout, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg; loff_t i_size; dprintk("--> %s\n", __func__); @@ -834,40 +852,31 @@ send_layoutget(struct pnfs_layout_hdr *lo, * store in lseg. If we race with a concurrent seqid morphing * op, then re-send the LAYOUTGET. 
*/ - do { - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; - - i_size = i_size_read(ino); - - lgp->args.minlength = PAGE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; - } - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - pnfs_copy_range(&lgp->args.range, range); - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; - - lseg = nfs4_proc_layoutget(lgp, gfp_flags); - } while (lseg == ERR_PTR(-EAGAIN)); - - if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) - lseg = NULL; - else - pnfs_layout_clear_fail_bit(lo, - pnfs_iomode_to_fail_bit(range->iomode)); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return ERR_PTR(-ENOMEM); - return lseg; + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + pnfs_copy_range(&lgp->args.range, range); + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + nfs4_stateid_copy(&lgp->args.stateid, stateid); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + return nfs4_proc_layoutget(lgp, timeout, gfp_flags); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -894,13 +903,31 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) } static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, + nfs4_stateid *stateid, + enum pnfs_iomode *iomode) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; - lo->plh_return_iomode = 0; pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (stateid != NULL) { + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); + } + if (iomode != NULL) + *iomode = lo->plh_return_iomode; + pnfs_clear_layoutreturn_info(lo); + return true; + } + if (stateid != NULL) + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (iomode != NULL) + *iomode = IOMODE_ANY; return true; } @@ -968,9 +995,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) enum pnfs_iomode iomode; bool send; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ @@ -1007,12 +1032,11 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = 
list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { struct pnfs_layout_range range = { @@ -1030,8 +1054,7 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) @@ -1098,11 +1121,10 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo, + &stateid, NULL); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ @@ -1150,7 +1172,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1310,6 +1331,7 @@ alloc_init_layout_hdr(struct inode *ino, INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); + lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID; return lo; } @@ -1317,6 +1339,8 @@ static struct pnfs_layout_hdr * pnfs_find_alloc_layout(struct inode *ino, struct nfs_open_context *ctx, gfp_t gfp_flags) + __releases(&ino->i_lock) + __acquires(&ino->i_lock) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -1341,23 +1365,28 @@ out_existing: /* * iomode matching rules: - * iomode lseg match - * ----- ----- ----- - * ANY READ true - * ANY RW true - * RW READ false - * RW RW true - * READ READ true - * READ RW true + * iomode lseg strict match + * iomode + * ----- ----- ------ ----- + * ANY READ N/A true + * ANY RW N/A true + * RW READ N/A false + * RW RW N/A true + * READ READ N/A true + * READ RW true false + * READ RW false true */ static bool pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || + (range->iomode != ls_range->iomode && + strict_iomode == true) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1372,7 +1401,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, */ static struct pnfs_layout_segment * pnfs_find_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) + struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; @@ -1381,7 +1411,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && - pnfs_lseg_range_match(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range, + strict_iomode)) { ret = pnfs_get_lseg(lseg); break; } @@ -1498,6 +1529,7 @@ pnfs_update_layout(struct inode *ino, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool 
strict_iomode, gfp_t gfp_flags) { struct pnfs_layout_range arg = { @@ -1505,45 +1537,49 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset; + unsigned pg_offset, seq; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + nfs4_stateid stateid; + long timeout = 0; + unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; } if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; } if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; } lookup_again: + nfs4_client_recover_expired_lease(clp); first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; @@ -1551,14 +1587,33 @@ lookup_again: /* if LAYOUTGET already failed once we don't try again */ if (pnfs_layout_io_test_failed(lo, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; } - first = list_empty(&lo->plh_segs); - if (first) { - /* The first layoutget for the file. Need to serialize per + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); + goto out_unlock; + } + + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } + + /* + * Choose a stateid for the LAYOUTGET. If we don't have a layout + * stateid, or it has been invalidated, then we must use the open + * stateid. + */ + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + + /* + * The first layoutget for the file. Need to serialize per * RFC 5661 Errata 3208. 
*/ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, @@ -1567,18 +1622,17 @@ lookup_again: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); goto lookup_again; } + + first = true; + do { + seq = read_seqbegin(&ctx->state->seqlock); + nfs4_stateid_copy(&stateid, &ctx->state->stateid); + } while (read_seqretry(&ctx->state->seqlock, seq)); } else { - /* Check to see if the layout for the given range - * already exists - */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; - } + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } /* @@ -1593,15 +1647,17 @@ lookup_again: pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } if (pnfs_layoutgets_blocked(lo)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } @@ -1626,10 +1682,47 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - atomic_dec(&lo->plh_outstanding); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + atomic_dec(&lo->plh_outstanding); + if (IS_ERR(lseg)) { + switch(PTR_ERR(lseg)) { + case -EBUSY: + if (time_after(jiffies, giveup)) + lseg = NULL; + break; + case -ERECALLCONFLICT: + /* Huh? We hold no layouts, how is there a recall? 
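Aside, not from the patch: the retry path above bounds its LAYOUTGET retries with a deadline (giveup = jiffies + twice the lease time, tested with time_after()). jiffies and time_after() are kernel-only; a rough user-space analogue of the same give-up pattern, built on a monotonic clock and a hypothetical try_operation() stand-in:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Hypothetical operation that reports "busy" until it eventually succeeds. */
static int try_operation(int attempt)
{
    return attempt < 3 ? -1 : 0;        /* pretend the first few attempts are busy */
}

static double now_seconds(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
    const double giveup = now_seconds() + 2.0;  /* analogue of jiffies + 2 * lease time */
    int attempt = 0;

    while (try_operation(attempt++) != 0) {
        if (now_seconds() > giveup) {           /* analogue of time_after(jiffies, giveup) */
            fprintf(stderr, "giving up after deadline\n");
            return 1;
        }
        usleep(100 * 1000);                     /* back off briefly before retrying */
    }
    printf("succeeded after %d attempts\n", attempt);
    return 0;
}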
*/ + if (first) { + lseg = NULL; + break; + } + /* Destroy the existing layout and start over */ + if (time_after(jiffies, giveup)) + pnfs_destroy_layout(NFS_I(ino)); + /* Fallthrough */ + case -EAGAIN: + break; + default: + if (!nfs_error_is_fatal(PTR_ERR(lseg))) { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + } + goto out_put_layout_hdr; + } + if (lseg) { + if (first) + pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + pnfs_put_layout_hdr(lo); + goto lookup_again; + } + } else { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + } + out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1678,38 +1771,34 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = -EINVAL; if (!pnfs_sanity_check_layout_range(&res->range)) - goto out; + return ERR_PTR(-EINVAL); /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); - if (!lseg || IS_ERR(lseg)) { + if (IS_ERR_OR_NULL(lseg)) { if (!lseg) - status = -ENOMEM; - else - status = PTR_ERR(lseg); - dprintk("%s: Could not allocate layout: error %d\n", - __func__, status); - goto out; + lseg = ERR_PTR(-ENOMEM); + + dprintk("%s: Could not allocate layout: error %ld\n", + __func__, PTR_ERR(lseg)); + return lseg; } - init_lseg(lo, lseg); - lseg->pls_range = res->range; + pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); - goto out_forget_reply; + goto out_forget; } if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); - status = -EAGAIN; - goto out_forget_reply; + goto out_forget; } pnfs_set_layout_stateid(lo, &res->stateid, false); } else { @@ -1718,42 +1807,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. 
*/ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + pnfs_mark_layout_stateid_invalid(lo, &free_me); - nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); - lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + pnfs_set_layout_stateid(lo, &res->stateid, true); } - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); return lseg; -out: - return ERR_PTR(status); -out_forget_reply: +out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - goto out; + return ERR_PTR(-EAGAIN); } static void -pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) { - if (lo->plh_return_iomode == iomode) - return; - if (lo->plh_return_iomode != 0) + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + lo->plh_return_seq = seq; + } } /** @@ -1769,7 +1857,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -1782,7 +1871,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (should_free_lseg(&lseg->pls_range, return_range)) { + if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, @@ -1792,8 +1881,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - pnfs_set_plh_return_iomode(lo, return_range->iomode); } + + if (remaining) + pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return remaining; } @@ -1810,18 +1902,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_iomode(lo, range.iomode); + pnfs_set_plh_return_info(lo, range.iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. 
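Aside, for readers unfamiliar with the convention: pnfs_layout_process() above now reports failure through the returned pointer itself (ERR_PTR(-EINVAL), ERR_PTR(-ENOMEM), ERR_PTR(-EAGAIN)), and pnfs_update_layout() branches on PTR_ERR(). A compressed user-space sketch of that error-pointer convention, with simplified versions of the helpers (the real macros live in linux/err.h):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified ERR_PTR/IS_ERR/PTR_ERR: small negative errno values are folded
 * into the top of the address space, which no valid pointer occupies. */
#define MAX_ERRNO 4095

static void *ERR_PTR(long error)
{
    return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
    return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator that fails by returning an encoded errno. */
static void *alloc_segment(int fail)
{
    if (fail)
        return ERR_PTR(-ENOMEM);
    return malloc(64);
}

int main(void)
{
    void *seg = alloc_segment(1);

    if (IS_ERR(seg))
        printf("allocation failed: errno %ld\n", -PTR_ERR(seg));
    else
        free(seg);
    return 0;
}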
*/ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { nfs4_stateid stateid; - enum pnfs_iomode iomode = lo->plh_return_iomode; + enum pnfs_iomode iomode; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - return_now = pnfs_prepare_layoutreturn(lo); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (return_now) pnfs_send_layoutreturn(lo, &stateid, iomode, false); @@ -1849,6 +1940,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r req_offset(req), rd_size, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1873,6 +1965,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, req_offset(req), wb_size, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -2143,12 +2236,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); - return nfs_pageio_resend(&pgio, hdr); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + nfs_pageio_init_read(&pgio, hdr->inode, false, + hdr->completion_ops); + hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); + } } EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); @@ -2158,12 +2254,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; - int err = 0; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); if (trypnfs == PNFS_TRY_AGAIN) - err = pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) pnfs_read_through_mds(desc, hdr); } @@ -2332,7 +2427,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; - data->args.lastbytewritten = end_pos - 1; + if (end_pos != 0) + data->args.lastbytewritten = end_pos - 1; + else + data->args.lastbytewritten = U64_MAX; data->res.server = NFS_SERVER(inode); if (ld->prepare_layoutcommit) { @@ -2405,7 +2503,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) spin_lock(&inode->i_lock); if (!NFS_I(inode)->layout) { spin_unlock(&inode->i_lock); - goto out; + goto out_clear_layoutstats; } hdr = NFS_I(inode)->layout; pnfs_get_layout_hdr(hdr); @@ -2419,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) data->args.fh = NFS_FH(inode); data->args.inode = inode; - nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); status = ld->prepare_layoutstats(&data->args); if (status) goto out_free; @@ -2434,6 +2531,7 @@ out_free: kfree(data); out_put: pnfs_put_layout_hdr(hdr); +out_clear_layoutstats: smp_mb__before_atomic(); clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); smp_mb__after_atomic(); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1ac1db5f6dad..31d99b2927b0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -64,6 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; + u32 pls_seq; unsigned 
long pls_flags; struct pnfs_layout_hdr *pls_layout; struct work_struct pls_work; @@ -194,6 +195,7 @@ struct pnfs_layout_hdr { unsigned long plh_flags; nfs4_stateid plh_stateid; u32 plh_barrier; /* ignore lower seqids */ + u32 plh_return_seq; enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -258,16 +260,16 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); -int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, - struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -282,12 +284,13 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -int pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); @@ -374,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode) return NFS_I(inode)->layout != NULL; } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; +} + static inline struct nfs4_deviceid_node * nfs4_get_deviceid(struct nfs4_deviceid_node *d) { @@ -544,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } -/** - * pnfs_mark_layout_returned_if_empty - marks the layout as returned - * @lo: layout header - * - * Note: Caller must hold inode->i_lock - */ -static inline void -pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) -{ - if (list_empty(&lo->plh_segs)) - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); -} - static inline void pnfs_copy_range(struct pnfs_layout_range *dst, const struct pnfs_layout_range *src) @@ -628,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) } static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + +static inline bool pnfs_roc(struct inode *ino) { 
return false; @@ -715,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } -static inline bool -pnfs_layoutcommit_outstanding(struct inode *inode) -{ - return false; -} - - static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4aaed890048f..f3468b57a32a 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. - * Note this must be called holding the inode (/cinfo) lock + * Note this must be called holding i_lock */ void pnfs_generic_clear_request_commit(struct nfs_page *req, @@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct pnfs_layout_segment *freeme; int i; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (pnfs_generic_transfer_commit_list(&b->written, dst, cinfo, 0)) { freeme = b->wlseg; b->wlseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); goto restart; } } @@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) LIST_HEAD(pages); int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) @@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) freeme = bucket->clseg; bucket->clseg = NULL; list_splice_init(&bucket->committing, &pages); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static unsigned int @@ -238,14 +238,39 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct pnfs_commit_bucket *bucket; bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; - spin_unlock(cinfo->lock); + 
spin_unlock(&cinfo->inode->i_lock); } +/* Helper function for pnfs_generic_commit_pagelist to catch an empty + * page list. This can happen when two commits race. + * + * This must be called instead of nfs_init_commit - call one or the other, but + * not both! + */ +static bool +pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + if (list_empty(pages)) { + if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) + wake_up_atomic_t(&cinfo->mds->rpcs_out); + /* don't call nfs_commitdata_release - it tries to put + * the open_context which is not acquired until nfs_init_commit + * which has not been called on @data */ + WARN_ON_ONCE(data->context); + nfs_commit_free(data); + return true; + } + + return false; +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -280,6 +305,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (data->ds_commit_index < 0) { + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, + data, cinfo)) + continue; + nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), @@ -288,6 +318,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, LIST_HEAD(pages); pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(&pages, + data, cinfo)) + continue; + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } @@ -559,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) } static struct nfs_client *(*get_v3_ds_connect)( - struct nfs_client *mds_clp, + struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, @@ -618,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); } else - clp = get_v3_ds_connect(mds_srv->nfs_client, + clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, au_flavor); @@ -654,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv->nfs_client, + clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, minor_version, @@ -874,12 +910,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -896,7 +932,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); @@ -904,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); int 
pnfs_nfs_generic_sync(struct inode *inode, bool datasync) { + int ret; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + return ret; if (datasync) return 0; return pnfs_layoutcommit_inode(inode, true); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b417bbcd9704..b7bca8303989 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -145,7 +145,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -299,7 +299,7 @@ out: } static int -nfs_proc_remove(struct inode *dir, struct qstr *name) +nfs_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), @@ -357,7 +357,7 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_linkargs arg = { .fromfh = NFS_FH(inode), @@ -456,7 +456,7 @@ out: } static int -nfs_proc_rmdir(struct inode *dir, struct qstr *name) +nfs_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6776d7a7839e..572e5b3b06f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -367,13 +367,13 @@ readpage_async_filler(void *data, struct page *page) nfs_list_remove_request(new); nfs_readpage_release(new); error = desc->pgio->pg_error; - goto out_unlock; + goto out; } return 0; out_error: error = PTR_ERR(new); -out_unlock: unlock_page(page); +out: return error; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1268280244e..d39601381adf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = { enum { Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + Opt_xprt_rdma6, Opt_xprt_err }; @@ -201,6 +202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_tcp, "tcp" }, { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, + { Opt_xprt_rdma6, "rdma6" }, { Opt_xprt_err, NULL } }; @@ -921,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { + data->timeo = NFS_UNSPEC_TIMEO; + data->retrans = NFS_UNSPEC_RETRANS; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -1187,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) return rc; } +static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option, + unsigned long l_bound, unsigned long u_bound) +{ + int ret; + + ret = nfs_get_option_ul(args, option); + if (ret != 0) + return ret; + if (*option < l_bound || *option > u_bound) + return -ERANGE; + return 0; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. 
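Illustrative aside: nfs_get_option_ul_bound() above layers a simple range check on top of the existing unsigned-long parser, and the hunks that follow use it so that timeo must be at least 1 while retrans may be 0. A stand-alone sketch of the same idea using strtoul (a hypothetical helper, not the mount-option parser itself):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse an unsigned long and reject values outside [lo, hi], mirroring the
 * shape of nfs_get_option_ul_bound(): 0 on success, negative errno on error. */
static int get_option_ul_bound(const char *s, unsigned long *out,
                               unsigned long lo, unsigned long hi)
{
    char *end;

    errno = 0;
    *out = strtoul(s, &end, 10);
    if (errno || end == s || *end != '\0')
        return -EINVAL;
    if (*out < lo || *out > hi)
        return -ERANGE;
    return 0;
}

int main(void)
{
    unsigned long timeo, retrans;

    printf("timeo=0   -> %d\n", get_option_ul_bound("0", &timeo, 1, INT_MAX));
    printf("timeo=600 -> %d\n", get_option_ul_bound("600", &timeo, 1, INT_MAX));
    printf("retrans=0 -> %d\n", get_option_ul_bound("0", &retrans, 0, INT_MAX));
    return 0;
}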
The whole mount string is processed; bad options are @@ -1350,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw, mnt->bsize = option; break; case Opt_timeo: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX)) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - if (nfs_get_option_ul(args, &option) || option == 0) + if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX)) goto out_invalid_value; mnt->retrans = option; break; @@ -1456,6 +1473,8 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; + case Opt_xprt_rdma6: + protofamily = AF_INET6; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -1680,6 +1699,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; + int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1687,14 +1707,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor - * can be used. + * can be used but still use the sec= that was specified. */ for (i = 0; i < count; i++) { flavor = server_authlist[i]; - if (nfs_auth_info_match(&args->auth_info, flavor) || - flavor == RPC_AUTH_NULL) + if (nfs_auth_info_match(&args->auth_info, flavor)) goto out; + + if (flavor == RPC_AUTH_NULL) + use_auth_null = true; + } + + if (use_auth_null) { + flavor = RPC_AUTH_NULL; + goto out; } dfprintk(MOUNT, @@ -2408,6 +2435,11 @@ static int nfs_compare_super_address(struct nfs_server *server1, struct nfs_server *server2) { struct sockaddr *sap1, *sap2; + struct rpc_xprt *xprt1 = server1->client->cl_xprt; + struct rpc_xprt *xprt2 = server2->client->cl_xprt; + + if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) + return 0; sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index fa538b2ba251..191aa577dd1f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -30,45 +30,11 @@ static void nfs_free_unlinkdata(struct nfs_unlinkdata *data) { - iput(data->dir); put_rpccred(data->cred); kfree(data->args.name.name); kfree(data); } -#define NAME_ALLOC_LEN(len) ((len+16) & ~15) -/** - * nfs_copy_dname - copy dentry name to data structure - * @dentry: pointer to dentry - * @data: nfs_unlinkdata - */ -static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) -{ - char *str; - int len = dentry->d_name.len; - - str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL); - if (!str) - return -ENOMEM; - data->args.name.len = len; - data->args.name.name = str; - return 0; -} - -static void nfs_free_dname(struct nfs_unlinkdata *data) -{ - kfree(data->args.name.name); - data->args.name.name = NULL; - data->args.name.len = 0; -} - -static void nfs_dec_sillycount(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - if (atomic_dec_return(&nfsi->silly_count) == 1) - wake_up(&nfsi->waitqueue); -} - /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete @@ -78,7 +44,7 @@ static void nfs_dec_sillycount(struct inode *dir) static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) { struct nfs_unlinkdata *data = calldata; - struct inode *dir = 
data->dir;
+	struct inode *dir = d_inode(data->dentry->d_parent);
 
 	trace_nfs_sillyrename_unlink(data, task->tk_status);
 	if (!NFS_PROTO(dir)->unlink_done(task, dir))
@@ -95,17 +61,21 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 static void nfs_async_unlink_release(void *calldata)
 {
 	struct nfs_unlinkdata	*data = calldata;
-	struct super_block *sb = data->dir->i_sb;
+	struct dentry *dentry = data->dentry;
+	struct super_block *sb = dentry->d_sb;
 
-	nfs_dec_sillycount(data->dir);
+	up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+	d_lookup_done(dentry);
 	nfs_free_unlinkdata(data);
+	dput(dentry);
 	nfs_sb_deactive(sb);
 }
 
 static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_unlinkdata *data = calldata;
-	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
+	struct inode *dir = d_inode(data->dentry->d_parent);
+	NFS_PROTO(dir)->unlink_rpc_prepare(task, data);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
@@ -114,7 +84,7 @@ static const struct rpc_call_ops nfs_unlink_ops = {
 	.rpc_call_prepare = nfs_unlink_prepare,
 };
 
-static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
+static void nfs_do_call_unlink(struct nfs_unlinkdata *data)
 {
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
@@ -129,10 +99,31 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		.flags = RPC_TASK_ASYNC,
 	};
 	struct rpc_task *task;
+	struct inode *dir = d_inode(data->dentry->d_parent);
+	nfs_sb_active(dir->i_sb);
+	data->args.fh = NFS_FH(dir);
+	nfs_fattr_init(data->res.dir_attr);
+
+	NFS_PROTO(dir)->unlink_setup(&msg, dir);
+
+	task_setup_data.rpc_client = NFS_CLIENT(dir);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task_async(task);
+}
+
+static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
 	struct dentry *alias;
 
-	alias = d_lookup(parent, &data->args.name);
-	if (alias != NULL) {
+	down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+	alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
+	if (IS_ERR(alias)) {
+		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+		return 0;
+	}
+	if (!d_in_lookup(alias)) {
 		int ret;
 		void *devname_garbage = NULL;
 
@@ -140,10 +131,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		 * Hey, we raced with lookup... See if we need to transfer
 		 * the sillyrename information to the aliased dentry.
 		 */
-		nfs_free_dname(data);
-		ret = nfs_copy_dname(alias, data);
 		spin_lock(&alias->d_lock);
-		if (ret == 0 && d_really_is_positive(alias) &&
+		if (d_really_is_positive(alias) &&
 		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
 			devname_garbage = alias->d_fsdata;
 			alias->d_fsdata = data;
@@ -152,8 +141,8 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		} else
 			ret = 0;
 		spin_unlock(&alias->d_lock);
-		nfs_dec_sillycount(dir);
 		dput(alias);
+		up_read_non_owner(&NFS_I(dir)->rmdir_sem);
 		/*
 		 * If we'd displaced old cached devname, free it.  At that
 		 * point dentry is definitely not a root, so we won't need
@@ -162,94 +151,18 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		kfree(devname_garbage);
 		return ret;
 	}
-	data->dir = igrab(dir);
-	if (!data->dir) {
-		nfs_dec_sillycount(dir);
-		return 0;
-	}
-	nfs_sb_active(dir->i_sb);
-	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(data->res.dir_attr);
-
-	NFS_PROTO(dir)->unlink_setup(&msg, dir);
-
-	task_setup_data.rpc_client = NFS_CLIENT(dir);
-	task = rpc_run_task(&task_setup_data);
-	if (!IS_ERR(task))
-		rpc_put_task_async(task);
+	data->dentry = alias;
+	nfs_do_call_unlink(data);
 	return 1;
 }
 
-static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
-{
-	struct dentry *parent;
-	struct inode *dir;
-	int ret = 0;
-
-
-	parent = dget_parent(dentry);
-	if (parent == NULL)
-		goto out_free;
-	dir = d_inode(parent);
-	/* Non-exclusive lock protects against concurrent lookup() calls */
-	spin_lock(&dir->i_lock);
-	if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
-		/* Deferred delete */
-		hlist_add_head(&data->list, &NFS_I(dir)->silly_list);
-		spin_unlock(&dir->i_lock);
-		ret = 1;
-		goto out_dput;
-	}
-	spin_unlock(&dir->i_lock);
-	ret = nfs_do_call_unlink(parent, dir, data);
-out_dput:
-	dput(parent);
-out_free:
-	return ret;
-}
-
-void nfs_wait_on_sillyrename(struct dentry *dentry)
-{
-	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
-	wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
-}
-
-void nfs_block_sillyrename(struct dentry *dentry)
-{
-	struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
-
-	wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1);
-}
-
-void nfs_unblock_sillyrename(struct dentry *dentry)
-{
-	struct inode *dir = d_inode(dentry);
-	struct nfs_inode *nfsi = NFS_I(dir);
-	struct nfs_unlinkdata *data;
-
-	atomic_inc(&nfsi->silly_count);
-	spin_lock(&dir->i_lock);
-	while (!hlist_empty(&nfsi->silly_list)) {
-		if (!atomic_inc_not_zero(&nfsi->silly_count))
-			break;
-		data = hlist_entry(nfsi->silly_list.first, struct nfs_unlinkdata, list);
-		hlist_del(&data->list);
-		spin_unlock(&dir->i_lock);
-		if (nfs_do_call_unlink(dentry, dir, data) == 0)
-			nfs_free_unlinkdata(data);
-		spin_lock(&dir->i_lock);
-	}
-	spin_unlock(&dir->i_lock);
-}
-
 /**
  * nfs_async_unlink - asynchronous unlinking of a file
  * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
 static int
-nfs_async_unlink(struct inode *dir, struct dentry *dentry)
+nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
 {
 	struct nfs_unlinkdata *data;
 	int status = -ENOMEM;
@@ -258,13 +171,18 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		goto out;
+	data->args.name.name = kstrdup(name->name, GFP_KERNEL);
+	if (!data->args.name.name)
+		goto out_free;
+	data->args.name.len = name->len;
 
 	data->cred = rpc_lookup_cred();
 	if (IS_ERR(data->cred)) {
 		status = PTR_ERR(data->cred);
-		goto out_free;
+		goto out_free_name;
 	}
 	data->res.dir_attr = &data->dir_attr;
+	init_waitqueue_head(&data->wq);
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
@@ -284,6 +202,8 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 out_unlock:
 	spin_unlock(&dentry->d_lock);
 	put_rpccred(data->cred);
+out_free_name:
+	kfree(data->args.name.name);
 out_free:
 	kfree(data);
 out:
@@ -302,17 +222,15 @@ out:
 void nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 {
-	struct nfs_unlinkdata	*data = NULL;
+	struct nfs_unlinkdata	*data;
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
-		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
-		data = dentry->d_fsdata;
-		dentry->d_fsdata = NULL;
-	}
+	dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+	data = dentry->d_fsdata;
+	dentry->d_fsdata = NULL;
 	spin_unlock(&dentry->d_lock);
 
-	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+	if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))
 		nfs_free_unlinkdata(data);
 }
 
@@ -559,18 +477,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	/* queue unlink first. Can't do this from rpc_release as it
 	 * has to allocate memory
 	 */
-	error = nfs_async_unlink(dir, dentry);
+	error = nfs_async_unlink(dentry, &sdentry->d_name);
 	if (error)
 		goto out_dput;
 
-	/* populate unlinkdata with the right dname */
-	error = nfs_copy_dname(sdentry,
-				(struct nfs_unlinkdata *)dentry->d_fsdata);
-	if (error) {
-		nfs_cancel_async_unlink(dentry);
-		goto out_dput;
-	}
-
 	/* run the rename task, undo unlink if it fails */
 	task = nfs_async_rename(dir, dir, dentry, sdentry,
 					nfs_complete_sillyrename);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f4fd53e5764..3a6724c6eb5f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 static int wb_priority(struct writeback_control *wbc)
 {
 	int ret = 0;
-	if (wbc->for_reclaim)
-		return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		ret = FLUSH_COND_STABLE;
 	return ret;
@@ -626,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+	nfs_pageio_init_write(&pgio, inode, 0,
 				false, &nfs_async_write_completion_ops);
 	err = nfs_do_writepage(page, wbc, &pgio, launder);
 	nfs_pageio_complete(&pgio);
@@ -658,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	unsigned long *bitlock = &NFS_I(inode)->flags;
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
-	/* Stop dirtying of new pages while we sync */
-	err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
-			nfs_wait_bit_killable, TASK_KILLABLE);
-	if (err)
-		goto out_err;
-
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -675,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
 	nfs_pageio_complete(&pgio);
 
-	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
-	smp_mb__after_atomic();
-	wake_up_bit(bitlock, NFS_INO_FLUSHING);
-
 	if (err < 0)
 		goto out_err;
 	err = pgio.pg_error;
@@ -737,7 +725,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		head = req->wb_head;
 
 		spin_lock(&inode->i_lock);
-		if (likely(!PageSwapCache(head->wb_page))) {
+		if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
 			set_page_private(head->wb_page, 0);
 			ClearPagePrivate(head->wb_page);
 			smp_mb__after_atomic();
@@ -759,7 +747,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
-	__set_page_dirty_nobuffers(req->wb_page);
+	if (req->wb_page)
+		__set_page_dirty_nobuffers(req->wb_page);
 }
 
 /*
@@ -804,7 +793,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
  * number of outstanding requests requiring a commit as well as
  * the MM page stats.
  *
- * The caller must hold the cinfo->lock, and the nfs_page lock.
+ * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
  */
 void
 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -832,10 +821,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
 void
 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
-	spin_unlock(cinfo->lock);
-	nfs_mark_page_unstable(req->wb_page, cinfo);
+	spin_unlock(&cinfo->inode->i_lock);
+	if (req->wb_page)
+		nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
@@ -864,7 +854,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
 static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
				      struct inode *inode)
 {
-	cinfo->lock = &inode->i_lock;
+	cinfo->inode = inode;
 	cinfo->mds = &NFS_I(inode)->commit_info;
 	cinfo->ds = pnfs_get_ds_info(inode);
 	cinfo->dreq = NULL;
@@ -897,7 +887,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 static void
 nfs_clear_page_commit(struct page *page)
 {
-	dec_zone_page_state(page, NR_UNSTABLE_NFS);
+	dec_node_page_state(page, NR_UNSTABLE_NFS);
 	dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
		    WB_RECLAIMABLE);
 }
@@ -967,7 +957,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 	return cinfo->mds->ncommit;
 }
 
-/* cinfo->lock held by caller */
+/* cinfo->inode->i_lock held by caller */
 int
 nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
		     struct nfs_commit_info *cinfo, int max)
@@ -979,7 +969,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
 		if (!nfs_lock_request(req))
 			continue;
 		kref_get(&req->wb_kref);
-		if (cond_resched_lock(cinfo->lock))
+		if (cond_resched_lock(&cinfo->inode->i_lock))
 			list_safe_reset_next(req, tmp, wb_list);
 		nfs_request_remove_commit_list(req, cinfo);
 		nfs_list_add_request(req, dst);
@@ -1005,7 +995,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 {
 	int ret = 0;
 
-	spin_lock(cinfo->lock);
+	spin_lock(&cinfo->inode->i_lock);
 	if (cinfo->mds->ncommit > 0) {
 		const int max = INT_MAX;
 
@@ -1013,7 +1003,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
					   cinfo, max);
 		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
 	}
-	spin_unlock(cinfo->lock);
+	spin_unlock(&cinfo->inode->i_lock);
 	return ret;
 }
 
@@ -1194,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
 /*
  * Test if the open context credential key is marked to expire soon.
  */
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
 {
-	return rpcauth_cred_key_to_expire(ctx->cred);
+	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+	return rpcauth_cred_key_to_expire(auth, ctx->cred);
 }
 
 /*
@@ -1288,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
 	dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
		file, count, (long long)(page_file_offset(page) + offset));
 
+	if (!count)
+		goto out;
+
 	if (nfs_can_extend_write(file, page, inode)) {
 		count = max(count + offset, nfs_page_length(page));
 		offset = 0;
@@ -1298,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 		nfs_set_pageerror(page);
 	else
 		__set_page_dirty_nobuffers(page);
-
+out:
 	dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
			status, (long long)i_size_read(inode));
 	return status;
@@ -1709,6 +1704,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 {
 	struct nfs_commit_data	*data;
 
+	/* another commit raced with us */
+	if (list_empty(head))
+		return 0;
+
 	data = nfs_commitdata_alloc();
 
 	if (!data)
@@ -1724,6 +1723,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	return -ENOMEM;
 }
 
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+	struct inode *inode = file_inode(file);
+	struct nfs_open_context *open;
+	struct nfs_commit_info cinfo;
+	struct nfs_page *req;
+	int ret;
+
+	open = get_nfs_open_context(nfs_file_open_context(file));
+	req  = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+	if (IS_ERR(req)) {
+		ret = PTR_ERR(req);
+		goto out_put;
+	}
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+	nfs_request_add_commit_list(req, &cinfo);
+	ret = nfs_commit_inode(inode, FLUSH_SYNC);
+	if (ret > 0)
+		ret = 0;
+
+	nfs_free_request(req);
+out_put:
+	put_nfs_open_context(open);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
 /*
  * COMMIT call returned
  */
@@ -1748,7 +1777,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
-		nfs_clear_page_commit(req->wb_page);
+		if (req->wb_page)
+			nfs_clear_page_commit(req->wb_page);
 
 		dprintk("NFS: commit (%s/%llu %d@%lld)",
			req->wb_context->dentry->d_sb->s_id,
@@ -1764,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 
 		/* Okay, COMMIT succeeded, apparently. Check the verifier
 		 * returned by the server against all stored verfs. */
-		if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+		if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
 			/* We have a match */
 			nfs_inode_remove_request(req);
 			dprintk(" OK\n");
@@ -1888,6 +1918,24 @@ out_mark_dirty:
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 
 /*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+		loff_t lstart, loff_t lend)
+{
+	int ret;
+
+	ret = filemap_write_and_wait_range(mapping, lstart, lend);
+	if (ret == 0)
+		ret = pnfs_sync_inode(mapping->host, true);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
  * flush the inode to disk.
  */
 int nfs_wb_all(struct inode *inode)