From 7f1bda447c9bd48b415acedba6b830f61591601f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Dec 2017 14:39:13 -0500 Subject: NFS: Add a cond_resched() to nfs_commit_release_pages() The commit list can get very large, and so we need a cond_resched() in nfs_commit_release_pages() in order to ensure we don't hog the CPU for excessive periods of time. Reported-by: Mike Galbraith Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/nfs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4a379d7918f2..cf61108f8f8d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1837,6 +1837,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: nfs_unlock_and_release_request(req); + /* Latency breaker */ + cond_resched(); } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) -- cgit v1.2.3-55-g7522 From b8b8d221090f2b469027f04e451ef1877cb1be08 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 10:51:37 -0500 Subject: NFSv4: Convert CLOSE to use nfs4_async_handle_exception() Convert CLOSE so that it specifies the correct stateid, state and inode for the error handling. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 56fa5a16e097..ea2b7e8db437 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3148,6 +3148,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_stateid *res_stateid = NULL; + struct nfs4_exception exception = { + .state = state, + .inode = calldata->inode, + .stateid = &calldata->arg.stateid, + }; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3213,7 +3218,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_BAD_STATEID: break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, -- cgit v1.2.3-55-g7522 From e0dba0128a662ab8540fcea76aca8ae27774c7da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 11:02:32 -0500 Subject: NFSv4: Convert DELEGRETURN to use nfs4_handle_exception() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ea2b7e8db437..b704e08b390c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5764,6 +5764,10 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct nfs4_exception exception = { + .inode = data->inode, + .stateid = &data->stateid, + }; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -5825,10 +5829,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } /* Fallthrough */ default: - if (nfs4_async_handle_error(task, data->res.server, - NULL, NULL) == -EAGAIN) { + task->tk_status = nfs4_async_handle_exception(task, + data->res.server, task->tk_status, + &exception); + if (exception.retry) goto out_restart; - } } data->rpc_status = task->tk_status; return; -- cgit v1.2.3-55-g7522 From 82571552a034f7531b0c8e76c1fd593c8755c519 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Nov 2017 11:14:49 -0500 Subject: NFSv4: Convert LOCKU to use nfs4_async_handle_exception() Convert CLOSE so that it specifies the correct stateid and inode for the error handling. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b704e08b390c..b927fda32e74 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6071,6 +6071,10 @@ static void nfs4_locku_release_calldata(void *data) static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + struct nfs4_exception exception = { + .inode = calldata->lsp->ls_state->inode, + .stateid = &calldata->arg.stateid, + }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; @@ -6094,8 +6098,10 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) rpc_restart_call_prepare(task); break; default: - if (nfs4_async_handle_error(task, calldata->server, - NULL, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + calldata->server, task->tk_status, + &exception); + if (exception.retry) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); -- cgit v1.2.3-55-g7522 From 8634ef5e05311f32d7f2aee06f6b27a8834a3bd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Jan 2018 09:53:49 -0500 Subject: NFS: Fix nfsstat breakage due to LOOKUPP The LOOKUPP operation was inserted into the nfs4_procedures array rather than being appended, which put /proc/net/rpc/nfs out of whack, and broke the nfsstat utility. Fix by moving the LOOKUPP operation to the end of the array, and by ensuring that it keeps the same length whether or not NFSV4.1 and NFSv4.2 are compiled in. Fixes: 5b5faaf6df734 ("nfs4: add NFSv4 LOOKUPP handlers") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 64 +++++++++++++++++++++++++++++++--------------------- include/linux/nfs4.h | 12 ++++++---- 2 files changed, 46 insertions(+), 30 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 77c6729e57f0..65c9c4175145 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7678,6 +7678,22 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#if defined(CONFIG_NFS_V4_1) +#define PROC41(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC41(proc, argtype, restype) \ + STUB(proc) +#endif + +#if defined(CONFIG_NFS_V4_2) +#define PROC42(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC42(proc, argtype, restype) \ + STUB(proc) +#endif + const struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7698,7 +7714,6 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(ACCESS, enc_access, dec_access), PROC(GETATTR, enc_getattr, dec_getattr), PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), PROC(REMOVE, enc_remove, dec_remove), PROC(RENAME, enc_rename, dec_rename), @@ -7717,33 +7732,30 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), -#if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), - PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), - PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), - PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), - PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), - PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), - PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC41(CREATE_SESSION, enc_create_session, dec_create_session), + PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC41(SEQUENCE, enc_sequence, dec_sequence), + PROC41(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), + PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid), STUB(GETDEVICELIST), - PROC(BIND_CONN_TO_SESSION, + PROC41(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), - PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -#endif /* CONFIG_NFS_V4_1 */ -#ifdef CONFIG_NFS_V4_2 - PROC(SEEK, enc_seek, dec_seek), - PROC(ALLOCATE, enc_allocate, dec_allocate), - PROC(DEALLOCATE, enc_deallocate, dec_deallocate), - PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), - PROC(CLONE, enc_clone, dec_clone), - PROC(COPY, enc_copy, dec_copy), -#endif /* CONFIG_NFS_V4_2 */ + PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid), + PROC42(SEEK, enc_seek, dec_seek), + PROC42(ALLOCATE, enc_allocate, dec_allocate), + PROC42(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC42(CLONE, enc_clone, dec_clone), + PROC42(COPY, enc_copy, dec_copy), + PROC(LOOKUPP, enc_lookupp, dec_lookupp), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 47adac640191..57ffaa20d564 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -457,7 +457,12 @@ enum lock_type4 { #define NFS4_DEBUG 1 -/* Index of predefined Linux client operations */ +/* + * Index of predefined Linux client operations + * + * To ensure that /proc/net/rpc/nfs remains correctly ordered, please + * append only to this enum when adding new client operations. + */ enum { NFSPROC4_CLNT_NULL = 0, /* Unused */ @@ -480,7 +485,6 @@ enum { NFSPROC4_CLNT_ACCESS, NFSPROC4_CLNT_GETATTR, NFSPROC4_CLNT_LOOKUP, - NFSPROC4_CLNT_LOOKUPP, NFSPROC4_CLNT_LOOKUP_ROOT, NFSPROC4_CLNT_REMOVE, NFSPROC4_CLNT_RENAME, @@ -500,7 +504,6 @@ enum { NFSPROC4_CLNT_SECINFO, NFSPROC4_CLNT_FSID_PRESENT, - /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, NFSPROC4_CLNT_CREATE_SESSION, NFSPROC4_CLNT_DESTROY_SESSION, @@ -518,13 +521,14 @@ enum { NFSPROC4_CLNT_BIND_CONN_TO_SESSION, NFSPROC4_CLNT_DESTROY_CLIENTID, - /* nfs42 */ NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, + + NFSPROC4_CLNT_LOOKUPP, }; /* nfs41 types */ -- cgit v1.2.3-55-g7522 From 9ccee940bd5b766b6dab6c1a80908b9490a4850d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Jan 2018 17:46:09 -0500 Subject: Support statx() mask and query flags parameters Support the query flags AT_STATX_FORCE_SYNC by forcing an attribute revalidation, and AT_STATX_DONT_SYNC by returning cached attributes only. Use the mask to optimise away server revalidation for attributes that are not being requested by the user. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b992d2382ffa..deeb7d1097d0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -735,12 +735,20 @@ int nfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + struct nfs_server *server = NFS_SERVER(inode); + unsigned long cache_validity; int err = 0; + bool force_sync = query_flags & AT_STATX_FORCE_SYNC; + bool do_update = false; trace_nfs_getattr_enter(inode); + + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) + goto out_no_update; + /* Flush out writes to the server in order to update c/mtime. */ - if (S_ISREG(inode->i_mode)) { + if ((request_mask & (STATX_CTIME|STATX_MTIME)) && + S_ISREG(inode->i_mode)) { err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; @@ -757,24 +765,42 @@ int nfs_getattr(const struct path *path, struct kstat *stat, */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) - need_atime = 0; - - if (need_atime || nfs_need_revalidate_inode(inode)) { - struct nfs_server *server = NFS_SERVER(inode); - + request_mask &= ~STATX_ATIME; + + /* Is the user requesting attributes that might need revalidation? */ + if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| + STATX_MTIME|STATX_UID|STATX_GID| + STATX_SIZE|STATX_BLOCKS))) + goto out_no_revalidate; + + /* Check whether the cached attributes are stale */ + do_update |= force_sync || nfs_attribute_cache_expired(inode); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + do_update |= cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); + if (request_mask & STATX_ATIME) + do_update |= cache_validity & NFS_INO_INVALID_ATIME; + if (request_mask & (STATX_CTIME|STATX_MTIME)) + do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; + if (do_update) { + /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) nfs_readdirplus_parent_cache_miss(path->dentry); else nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); + if (err) + goto out; } else nfs_readdirplus_parent_cache_hit(path->dentry); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); - if (S_ISDIR(inode->i_mode)) - stat->blksize = NFS_SERVER(inode)->dtsize; - } +out_no_revalidate: + /* Only return attributes that were revalidated. */ + stat->result_mask &= request_mask; +out_no_update: + generic_fillattr(inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; -- cgit v1.2.3-55-g7522 From aaa1500894655427f1b964d1f31dbbd20baa80ab Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 22 Nov 2017 08:23:41 +1100 Subject: nfs: remove dead code from nfs_encode_fh() This code can never be used as the IS_AUTOMOUNT(inode) case has already been handled. So remove it to avoid confusion. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/export.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 83fd09fc8f77..2b80a6652818 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -48,10 +48,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) *max_len = len; return FILEID_INVALID; } - if (IS_AUTOMOUNT(inode)) { - *max_len = FILEID_INVALID; - goto out; - } p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; p[FILEID_LOW_OFF] = NFS_FILEID(inode); -- cgit v1.2.3-55-g7522 From dce2630c7da73b0634686bca557cc8945cc450c8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 13 Dec 2017 09:57:09 +1100 Subject: NFSv4: always set NFS_LOCK_LOST when a lock is lost. There are 2 comments in the NFSv4 code which suggest that SIGLOST should possibly be sent to a process. In these cases a lock has been lost. The current practice is to set NFS_LOCK_LOST so that read/write returns EIO when a lock is lost. So change these comments to code when sets NFS_LOCK_LOST. One case is when lock recovery after apparent server restart fails with NFS4ERR_DENIED, NFS4ERR_RECLAIM_BAD, or NFS4ERRO_RECLAIM_CONFLICT. The other case is when a lock attempt as part of lease recovery fails with NFS4ERR_DENIED. In an ideal world, these should not happen. However I have a packet trace showing an NFSv4.1 session getting NFS4ERR_BADSESSION after an extended network parition. The NFSv4.1 client treats this like server reboot until/unless it get NFS4ERR_NO_GRACE, in which case it switches over to "nograce" recovery mode. In this network trace, the client attempts to recover a lock and the server (incorrectly) reports NFS4ERR_DENIED rather than NFS4ERR_NO_GRACE. This leads to the ineffective comment and the client then continues to write using the OPEN stateid. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++++---- fs/nfs/nfs4state.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b927fda32e74..f8a2b226f571 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2019,7 +2019,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: @@ -2066,7 +2066,11 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; @@ -2102,7 +2106,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, err = nfs4_open_recover_helper(opendata, FMODE_READ); } nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -6757,7 +6761,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, if (err != 0) return err; err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e4f4a09ed9f4..91a4d4eeb235 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1482,6 +1482,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; struct file_lock_context *flctx = inode->i_flctx; struct list_head *list; @@ -1522,7 +1523,9 @@ restart: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); -- cgit v1.2.3-55-g7522 From e545735a3255a2f211d9c7efd2d67f2affcde1ba Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:37 -0500 Subject: NFS: remove unused offset arg in nfs_pgio_rpcsetup nfs_pgio_rpcsetup() is always called with an offset of 0, so we should be able to drop the arguement altogether. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d0543e19098a..18a7626ac638 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -537,7 +537,7 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * @cinfo: Commit information for the call (writes only) */ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, unsigned int offset, + unsigned int count, int how, struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -546,10 +546,10 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, * NB: take care not to mess about with hdr->commit et al. */ hdr->args.fh = NFS_FH(hdr->inode); - hdr->args.offset = req_offset(req) + offset; + hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pgbase = req->wb_pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(req->wb_context); @@ -789,7 +789,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } -- cgit v1.2.3-55-g7522 From ad6b0241c94e2732f928ec9ef5b3561d7448c8fe Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:47 -0500 Subject: pnfs/blocklayout: Add module alias for LAYOUT4_SCSI The blocklayout module contains the client support for both block and SCSI layouts. Add a module alias for the SCSI layout type so that the module will be loaded for SCSI layouts. Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/nfs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 995d707537da..ec110aa87634 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -967,6 +967,7 @@ static void __exit nfs4blocklayout_exit(void) } MODULE_ALIAS("nfs-layouttype4-3"); +MODULE_ALIAS("nfs-layouttype4-5"); module_init(nfs4blocklayout_init); module_exit(nfs4blocklayout_exit); -- cgit v1.2.3-55-g7522 From d78471d32bb60837930026e11828af596fb4bdac Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:57 -0500 Subject: pnfs/blocklayout: set PNFS_LAYOUTRETURN_ON_ERROR If there's an error doing I/O to block device, and the client resends the I/O to the MDS, the MDS must recall the layout from the client before processing the I/O. Let's preempt that exchange by returning the layout before falling back to the MDS when there's an error. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 ++ fs/nfs/pnfs.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index ec110aa87634..334570888649 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -887,6 +887,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -910,6 +911,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = { .name = "LAYOUT_SCSI", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 8d507c361d98..29a19814e538 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -524,8 +524,10 @@ static inline int pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); - if (pnfs_enabled_sb(nfss) && nfsi->layout) + if (pnfs_enabled_sb(nfss) && nfsi->layout) { + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags); return _pnfs_return_layout(ino); + } return 0; } -- cgit v1.2.3-55-g7522 From b3dce6a2f0601be9b6781b394fdf6ceb63009a44 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 8 Dec 2017 12:52:59 -0500 Subject: pnfs/blocklayout: handle transient devices PNFS block/SCSI layouts should gracefully handle cases where block devices are not available when a layout is retrieved, or the block devices are removed while the client holds a layout. While setting up a layout segment, keep a record of an unavailable or un-parsable block device in cache with a flag so that subsequent layouts do not spam the server with GETDEVINFO. We can reuse the current NFS_DEVICEID_UNAVAILABLE handling with one variation: instead of reusing the device, we will discard it and send a fresh GETDEVINFO after the timeout, since the lookup and validation of the device occurs within the GETDEVINFO response handling. A lookup of a layout segment that references an unavailable device will return a segment with the NFS_LSEG_UNAVAILABLE flag set. This will allow the pgio layer to mark the layout with the appropriate fail bit, which forces subsequent IO to the MDS, and prevents spamming the server with LAYOUTGET, LAYOUTRETURN. Finally, when IO to a block device fails, look up the block device(s) referenced by the pgio header, and mark them as unavailable. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 82 +++++++++++++++++++++++++++++++++++++--- fs/nfs/blocklayout/dev.c | 7 +--- fs/nfs/pnfs.c | 2 +- fs/nfs/pnfs.h | 2 + fs/nfs/pnfs_dev.c | 1 - 5 files changed, 82 insertions(+), 12 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 334570888649..ca6cf54b54df 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -184,6 +184,29 @@ retry: return bio; } +static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + size_t bytes_left = header->args.count; + sector_t isect, extent_length = 0; + struct pnfs_block_extent be; + + isect = header->args.offset >> SECTOR_SHIFT; + bytes_left += header->args.offset - (isect << SECTOR_SHIFT); + + while (bytes_left > 0) { + if (!ext_tree_lookup(bl, isect, &be, rw)) + return; + extent_length = be.be_length - (isect - be.be_f_offset); + nfs4_mark_deviceid_unavailable(be.be_device); + isect += extent_length; + if (bytes_left > extent_length << SECTOR_SHIFT) + bytes_left -= extent_length << SECTOR_SHIFT; + else + bytes_left = 0; + } +} + static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; @@ -194,6 +217,7 @@ static void bl_end_io_read(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, false); } bio_put(bio); @@ -323,6 +347,7 @@ static void bl_end_io_write(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, true); } bio_put(bio); put_parallel(par); @@ -552,6 +577,31 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } +static struct nfs4_deviceid_node * +bl_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node; + unsigned long start, end; + +retry: + node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); + if (!node) + return ERR_PTR(-ENODEV); + + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) + return node; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (!time_in_range(node->timestamp_unavailable, start, end)) { + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + return ERR_PTR(-ENODEV); +} + static int bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, struct layout_verification *lv, struct list_head *extents, @@ -573,16 +623,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, memcpy(&id, p, NFS4_DEVICEID4_SIZE); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - error = -EIO; - be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, lo->plh_lc_cred, gfp_mask); - if (!be->be_device) + if (IS_ERR(be->be_device)) { + error = PTR_ERR(be->be_device); goto out_free_be; + } /* * The next three values are read in as bytes, but stored in the * extent structure in 512-byte granularity. */ + error = -EIO; if (decode_sector_number(&p, &be->be_f_offset) < 0) goto out_put_deviceid; if (decode_sector_number(&p, &be->be_length) < 0) @@ -692,11 +744,16 @@ out_free_scratch: __free_page(scratch); out: dprintk("%s returns %d\n", __func__, status); - if (status) { + switch (status) { + case -ENODEV: + /* Our extent block devices are unavailable */ + set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); + case 0: + return lseg; + default: kfree(lseg); return ERR_PTR(status); } - return lseg; } static void @@ -798,6 +855,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } pnfs_generic_pg_init_read(pgio, req); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_read_mds(pgio); + } } /* @@ -853,6 +917,14 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); pnfs_generic_pg_init_write(pgio, req, wb_size); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_write_mds(pgio); + } } /* diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 95f74bd2c067..a7efd83779d2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -533,14 +533,11 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_free_volumes; ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); - if (ret) { - bl_free_device(top); - kfree(top); - goto out_free_volumes; - } node = &top->node; nfs4_init_deviceid_node(node, server, &pdev->dev_id); + if (ret) + nfs4_mark_deviceid_unavailable(node); out_free_volumes: kfree(volumes); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d602fe9e1ac8..b3dae6ec2d39 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -655,7 +655,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { - dprintk("%s: freeing lseg %p iomode %d seq %u" + dprintk("%s: freeing lseg %p iomode %d seq %u " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 29a19814e538..daf6cbf5c15f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -40,6 +40,7 @@ enum { NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ + NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */ }; /* Individual ip address */ @@ -86,6 +87,7 @@ enum pnfs_try_status { */ #define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) /* error codes for internal use */ #define NFS4ERR_RESET_TO_MDS 12001 diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 2961fcd7a2df..e8a07b3f9aaa 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,7 +43,6 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); -- cgit v1.2.3-55-g7522 From ba4a76f703ab7eb72941fdaac848502073d6e9ee Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 15 Dec 2017 16:12:32 -0500 Subject: nfs/pnfs: fix nfs_direct_req ref leak when i/o falls back to the mds Currently when falling back to doing I/O through the MDS (via pnfs_{read|write}_through_mds), the client frees the nfs_pgio_header without releasing the reference taken on the dreq via pnfs_generic_pg_{read|write}pages -> nfs_pgheader_init -> nfs_direct_pgio_init. It then takes another reference on the dreq via nfs_generic_pg_pgios -> nfs_pgheader_init -> nfs_direct_pgio_init and as a result the requester will become stuck in inode_dio_wait. Once that happens, other processes accessing the inode will become stuck as well. Ensure that pnfs_read_through_mds() and pnfs_write_through_mds() clean up correctly by calling hdr->completion_ops->completion() instead of calling hdr->release() directly. This can be reproduced (sometimes) by performing "storage failover takeover" commands on NetApp filer while doing direct I/O from a client. This can also be reproduced using SystemTap to simulate a failure while doing direct I/O from a client (from Dave Wysochanski ): stap -v -g -e 'probe module("nfs_layout_nfsv41_files").function("nfs4_fl_prepare_ds").return { $return=NULL; exit(); }' Suggested-by: Trond Myklebust Signed-off-by: Scott Mayhew Fixes: 1ca018d28d ("pNFS: Fix a memory leak when attempted pnfs fails") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b3dae6ec2d39..c13e826614b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2255,7 +2255,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } static enum pnfs_try_status @@ -2378,7 +2378,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } /* -- cgit v1.2.3-55-g7522 From 530ea4219231e62341f79a5517d7b4f12ec3b74f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:38 -0500 Subject: nfs: Referrals should use the same proto setting as their parent Helen Chao noticed that when a user traverses a referral on an NFS/RDMA mount, the resulting submount always uses TCP. This behavior does not match the vers= setting when traversing a referral (vers=4.1 is preserved). It also does not match the behavior of crossing from the pseudofs into a real filesystem (proto=rdma is preserved in that case). The Linux NFS client does not currently support the fs_locations_info attribute. The situation is similar for all NFSv4 servers I know of. Therefore until the community has broad support for fs_locations_info, when following a referral: - First try to connect with RPC-over-RDMA. This will fail quickly if the client has no RDMA-capable interfaces. - If connecting with RPC-over-RDMA fails, or the RPC-over-RDMA transport is not available, use TCP. Reported-by: Helen Chao Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 23 ++++++++++++++++++++--- fs/nfs/nfs4namespace.c | 2 -- 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 65a7e5da508c..507f1f534b16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1123,19 +1123,36 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - /* Get a client representation. - * Note: NFSv4 always uses TCP, */ + /* Get a client representation */ +#ifdef CONFIG_SUNRPC_XPRT_RDMA + rpc_set_port(data->addr, NFS_RDMA_PORT); error = nfs4_set_client(server, data->hostname, data->addr, data->addrlen, parent_client->cl_ipaddr, - rpc_protocol(parent_server->client), + XPRT_TRANSPORT_RDMA, + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (!error) + goto init_server; +#endif /* CONFIG_SUNRPC_XPRT_RDMA */ + + rpc_set_port(data->addr, NFS_PORT); + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_net); if (error < 0) goto error; +#ifdef CONFIG_SUNRPC_XPRT_RDMA +init_server: +#endif error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); if (error < 0) goto error; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 8c3f327d858d..24f06dcc2b08 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -270,8 +270,6 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (mountdata->addrlen == 0) continue; - rpc_set_port(mountdata->addr, NFS_PORT); - memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; mountdata->hostname = page2; -- cgit v1.2.3-55-g7522 From 801b564309bafea3cfd71f9c535d88f4d58d2d34 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2017 14:13:46 -0500 Subject: nfs: Update server port after referral or migration After traversing a referral or recovering from a migration event, ensure that the server port reported in /proc/mounts is updated to the correct port setting for the new submount. Reported-by: Helen Chao Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 507f1f534b16..04612c24d394 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -861,6 +861,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); -- cgit v1.2.3-55-g7522 From f96adf1ea08a37e3739b1f929ce8318eee63b1f4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jan 2018 10:33:14 +0100 Subject: nfs: remove unused label in nfs_encode_fh() The only reference to the label got removed, so we now get a harmless compiler warning: fs/nfs/export.c: In function 'nfs_encode_fh': fs/nfs/export.c:58:1: error: label 'out' defined but not used [-Werror=unused-label] Fixes: aaa150089465 ("nfs: remove dead code from nfs_encode_fh()") Signed-off-by: Arnd Bergmann Signed-off-by: Trond Myklebust --- fs/nfs/export.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 2b80a6652818..ab5de3246c5c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -55,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) p[len - 1] = 0; /* Padding */ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; -out: dprintk("%s: result fh fileid %llu mode %u size %d\n", __func__, NFS_FILEID(inode), inode->i_mode, *max_len); return *max_len; -- cgit v1.2.3-55-g7522 From 1b8d97b0a837beaf48a8449955b52c650a7114b4 Mon Sep 17 00:00:00 2001 From: J. Bruce Fields Date: Tue, 16 Jan 2018 10:08:00 -0500 Subject: NFS: commit direct writes even if they fail partially If some of the WRITE calls making up an O_DIRECT write syscall fail, we neglect to commit, even if some of the WRITEs succeed. We also depend on the commit code to free the reference count on the nfs_page taken in the "if (request_commit)" case at the end of nfs_direct_write_completion(). The problem was originally noticed because ENOSPC's encountered partway through a write would result in a closed file being sillyrenamed when it should have been unlinked. Signed-off-by: J. Bruce Fields Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d2972d537469..8c10b0562e75 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -775,10 +775,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - dreq->flags = 0; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) dreq->error = hdr->error; - } if (dreq->error == 0) { nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { -- cgit v1.2.3-55-g7522 From 7ff4cff637aa0bd2abbd81f53b2a6206c50afd95 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Tue, 16 Jan 2018 22:38:50 +0100 Subject: nfs41: do not return ENOMEM on LAYOUTUNAVAILABLE A pNFS server may return LAYOUTUNAVAILABLE error on LAYOUTGET for files which don't have any layout. In this situation pnfs_update_layout currently returns NULL. As this NULL is converted into ENOMEM, IO requests fails instead of falling back to MDS. Do not return ENOMEM on LAYOUTUNAVAILABLE and let client retry through MDS. Fixes 8d40b0f14846f. I will suggest to backport this fix to affected stable branches. Signed-off-by: Tigran Mkrtchyan [trondmy: Use IS_ERR_OR_NULL()] Fixes: 8d40b0f14846 ("NFS filelayout:call GETDEVICEINFO after...") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4e54d8b5413a..d175724ff566 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -895,9 +895,7 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (!lseg) - lseg = ERR_PTR(-ENOMEM); - if (IS_ERR(lseg)) + if (IS_ERR_OR_NULL(lseg)) goto out; lo = NFS_I(ino)->layout; -- cgit v1.2.3-55-g7522 From 06e1902456f3d0e45c3e64b75124339d67c83e5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 18 Jan 2018 14:55:01 -0500 Subject: nfs: Use proper enum definitions for nfs_show_stable Commit 8224b2734ab1 ("NFS: Add static NFS I/O tracepoints") had a hack to work around some odd behavior observed with __print_symbolic. I couldn't ever get it to display NFS_FILE_SYNC when using TRACE_DEFINE_ENUM macros to set up the enum values. I tracked down the actual bug that forced me to add the workaround. That issue will be addressed soon, so replace the hack with a proper implementation. Fixes: 8224b2734ab1 ("NFS: Add static NFS I/O tracepoints") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 093290c42d7c..c6dbd48896a1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -796,15 +796,15 @@ TRACE_EVENT(nfs_readpage_done, ) ); -/* - * XXX: I tried using NFS_UNSTABLE and friends in this table, but they - * all evaluate to 0 for some reason, even if I include linux/nfs.h. - */ +TRACE_DEFINE_ENUM(NFS_UNSTABLE); +TRACE_DEFINE_ENUM(NFS_DATA_SYNC); +TRACE_DEFINE_ENUM(NFS_FILE_SYNC); + #define nfs_show_stable(stable) \ __print_symbolic(stable, \ - { 0, " (UNSTABLE)" }, \ - { 1, " (DATA_SYNC)" }, \ - { 2, " (FILE_SYNC)" }) + { NFS_UNSTABLE, "UNSTABLE" }, \ + { NFS_DATA_SYNC, "DATA_SYNC" }, \ + { NFS_FILE_SYNC, "FILE_SYNC" }) TRACE_EVENT(nfs_initiate_write, TP_PROTO( @@ -837,12 +837,12 @@ TRACE_EVENT(nfs_initiate_write, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%lu stable=%d%s", + "offset=%lld count=%lu stable=%s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __entry->stable, nfs_show_stable(__entry->stable) + nfs_show_stable(__entry->stable) ) ); @@ -881,13 +881,13 @@ TRACE_EVENT(nfs_writeback_done, TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%d%s " + "offset=%lld status=%d stable=%s " "verifier 0x%016llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->status, - __entry->stable, nfs_show_stable(__entry->stable), + nfs_show_stable(__entry->stable), __entry->verifier ) ); -- cgit v1.2.3-55-g7522 From cbebc6ef4fc830f4040d4140bf53484812d5d5d9 Mon Sep 17 00:00:00 2001 From: Jan Chochol Date: Fri, 5 Jan 2018 08:39:12 +0100 Subject: nfs: Do not convert nfs_idmap_cache_timeout to jiffies Since commit 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") nfs_idmap_cache_timeout changed units from jiffies to seconds. Unfortunately sysctl interface was not updated accordingly. As a effect updating /proc/sys/fs/nfs/idmap_cache_timeout with some value will incorrectly multiply this value by HZ. Also reading /proc/sys/fs/nfs/idmap_cache_timeout will show real value divided by HZ. Fixes: 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") Signed-off-by: Jan Chochol Signed-off-by: Trond Myklebust --- fs/nfs/nfs4sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0d91d84e5822..c394e4447100 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -32,7 +32,7 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { } }; -- cgit v1.2.3-55-g7522 From 49686cbbb3ebafe42e63868222f269d8053ead00 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Jan 2018 15:15:34 -0800 Subject: NFS: reject request for id_legacy key without auxdata nfs_idmap_legacy_upcall() is supposed to be called with 'aux' pointing to a 'struct idmap', via the call to request_key_with_auxdata() in nfs_idmap_request_key(). However it can also be reached via the request_key() system call in which case 'aux' will be NULL, causing a NULL pointer dereference in nfs_idmap_prepare_pipe_upcall(), assuming that the key description is valid enough to get that far. Fix this by making nfs_idmap_legacy_upcall() negate the key if no auxdata is provided. As usual, this bug was found by syzkaller. A simple reproducer using the command-line keyctl program is: keyctl request2 id_legacy uid:0 '' @s Fixes: 57e62324e469 ("NFS: Store the legacy idmapper result in the keyring") Reported-by: syzbot+5dfdbcf7b3eb5912abbb@syzkaller.appspotmail.com Cc: # v3.4+ Signed-off-by: Eric Biggers Signed-off-by: Trond Myklebust --- fs/nfs/nfs4idmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 30426c1a1bbd..22dc30a679a0 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -568,9 +568,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; struct key *key = cons->key; - int ret = -ENOMEM; + int ret = -ENOKEY; + + if (!aux) + goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; -- cgit v1.2.3-55-g7522 From b39604755ce06389357b0c22d138c954f2e96451 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 25 Jan 2018 09:36:25 -0500 Subject: pnfs/blocklayout: pnfs_block_dev_map uses bytes, not sectors Fixup the field types to match their use. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index efc007f00742..716bc75e9ed2 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -92,10 +92,9 @@ struct pnfs_block_volume { }; struct pnfs_block_dev_map { - sector_t start; - sector_t len; - - sector_t disk_offset; + u64 start; + u64 len; + u64 disk_offset; struct block_device *bdev; }; -- cgit v1.2.3-55-g7522 From f34462c3c8a2ec0f09003b526c4d7c08782d9350 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 25 Jan 2018 09:36:26 -0500 Subject: pnfs/blocklayout: Ensure disk address in block device map It's possible that the device map is smaller than the offset into the device for the I/O we're adding. Add a check for it and bail out, otherwise we risk botching the bio calculations that follow. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index ca6cf54b54df..7cb5c38c19e4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -137,6 +137,11 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, return bio; } +static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) +{ + return offset >= map->start && offset < map->start + map->len; +} + static struct bio * do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, @@ -156,8 +161,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* translate to physical disk offset */ disk_addr = (u64)isect << SECTOR_SHIFT; - if (disk_addr < map->start || disk_addr >= map->start + map->len) { - if (!dev->map(dev, disk_addr, map)) + if (!offset_in_map(disk_addr, map)) { + if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map)) return ERR_PTR(-EIO); bio = bl_submit_bio(bio); } -- cgit v1.2.3-55-g7522 From 128159f2929ef398698c28a4177eca1c1b88aa74 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Jan 2018 09:34:51 -0500 Subject: NFS: Remove a redundant call to unmap_mapping_range() We don't need to call unmap_mapping_range() prior to calling nfs_sync_mapping(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index deeb7d1097d0..49fba9ea5872 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1170,7 +1170,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { - unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; -- cgit v1.2.3-55-g7522 From e231c6879cfd44e4fffd384bb6dd7d313249a523 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Jan 2018 09:29:41 -0500 Subject: NFS: Fix a race between mmap() and O_DIRECT When locking the file in order to do O_DIRECT on it, we must unmap any mmapped ranges on the pagecache so that we can flush out the dirty data. Fixes: a5864c999de67 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.8+ --- fs/nfs/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 20fef85d2bb1..9034b4926909 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -99,7 +99,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) { if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { set_bit(NFS_INO_ODIRECT, &nfsi->flags); - nfs_wb_all(inode); + nfs_sync_mapping(inode->i_mapping); } } -- cgit v1.2.3-55-g7522