summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/porting5
-rw-r--r--Documentation/filesystems/vfs.txt22
-rw-r--r--fs/btrfs/ctree.h8
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/ioctl.c50
-rw-r--r--fs/cifs/cifsfs.c24
-rw-r--r--fs/ioctl.c10
-rw-r--r--fs/nfs/nfs4file.c12
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/ocfs2/file.c93
-rw-r--r--fs/ocfs2/refcounttree.c148
-rw-r--r--fs/ocfs2/refcounttree.h24
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/overlayfs/file.c43
-rw-r--r--fs/read_write.c405
-rw-r--r--fs/xfs/xfs_file.c82
-rw-r--r--fs/xfs/xfs_reflink.c173
-rw-r--r--fs/xfs/xfs_reflink.h15
-rw-r--r--include/linux/fs.h55
-rw-r--r--mm/filemap.c146
20 files changed, 735 insertions, 597 deletions
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 321d74b73937..cf43bc4dbf31 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -623,6 +623,11 @@ in your dentry operations instead.
On success you get a new struct file sharing the mount/dentry with the
original, on failure - ERR_PTR().
--
+[mandatory]
+ ->clone_file_range() and ->dedupe_file_range have been replaced with
+ ->remap_file_range(). See Documentation/filesystems/vfs.txt for more
+ information.
+--
[recommended]
->lookup() instances doing an equivalent of
if (IS_ERR(inode))
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a6c6a8af48a2..5f71a252e2e0 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,8 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
- int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
- int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+ loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
};
@@ -960,11 +961,18 @@ otherwise noted.
copy_file_range: called by the copy_file_range(2) system call.
- clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
- FICLONE commands.
-
- dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
- command.
+ remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
+ FICLONE and FIDEDUPERANGE commands to remap file ranges. An
+ implementation should remap len bytes at pos_in of the source file into
+ the dest file at pos_out. Implementations must handle callers passing
+ in len == 0; this means "remap to the end of the source file". The
+ return value should the number of bytes remapped, or the usual
+ negative error code if errors occurred before any bytes were remapped.
+ The remap_flags parameter accepts REMAP_FILE_* flags. If
+ REMAP_FILE_DEDUP is set then the implementation must only remap if the
+ requested file ranges have identical contents. If REMAP_CAN_SHORTEN is
+ set, the caller is ok with the implementation shortening the request
+ length to satisfy alignment or EOF requirements (or any other reason).
fadvise: possibly called by the fadvise64() system call.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..80953528572d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3201,9 +3201,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
- struct file *dst_file, loff_t dst_loff,
- u64 olen);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3233,8 +3230,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97c7a086f7bd..a3c22e16509b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3298,8 +3298,7 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
- .clone_file_range = btrfs_clone_file_range,
- .dedupe_file_range = btrfs_dedupe_file_range,
+ .remap_file_range = btrfs_remap_file_range,
};
void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a990a9045139..3ca6943827ef 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3629,26 +3629,6 @@ out_unlock:
return ret;
}
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
- struct file *dst_file, loff_t dst_loff,
- u64 olen)
-{
- struct inode *src = file_inode(src_file);
- struct inode *dst = file_inode(dst_file);
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
- if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
- /*
- * Btrfs does not support blocksize < page_size. As a
- * result, btrfs_cmp_data() won't correctly handle
- * this situation without an update.
- */
- return -EINVAL;
- }
-
- return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
-}
-
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
@@ -4350,10 +4330,34 @@ out_unlock:
return ret;
}
-int btrfs_clone_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, u64 len)
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
{
- return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP) {
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
+ u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+
+ if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
+ /*
+ * Btrfs does not support blocksize < page_size. As a
+ * result, btrfs_cmp_data() won't correctly handle
+ * this situation without an update.
+ */
+ return -EINVAL;
+ }
+
+ ret = btrfs_extent_same(src, off, len, dst, destoff);
+ } else {
+ ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+ }
+ return ret < 0 ? ret : len;
}
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 7de9603c54f1..b7ac09e38159 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -992,8 +992,9 @@ const struct inode_operations cifs_symlink_inode_ops = {
.listxattr = cifs_listxattr,
};
-static int cifs_clone_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, u64 len)
+static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
@@ -1003,6 +1004,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
unsigned int xid;
int rc;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
cifs_dbg(FYI, "clone range\n");
xid = get_xid();
@@ -1042,7 +1046,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
unlock_two_nondirectories(src_inode, target_inode);
out:
free_xid(xid);
- return rc;
+ return rc < 0 ? rc : len;
}
ssize_t cifs_file_copychunk_range(unsigned int xid,
@@ -1151,7 +1155,7 @@ const struct file_operations cifs_file_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1170,7 +1174,7 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1189,7 +1193,7 @@ const struct file_operations cifs_file_direct_ops = {
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1208,7 +1212,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1226,7 +1230,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1244,7 +1248,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1256,7 +1260,7 @@ const struct file_operations cifs_dir_ops = {
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
- .clone_file_range = cifs_clone_file_range,
+ .remap_file_range = cifs_remap_file_range,
.llseek = generic_file_llseek,
.fsync = cifs_dir_fsync,
};
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0400297c8d72..d64f622cac8b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
struct fd src_file = fdget(srcfd);
+ loff_t cloned;
int ret;
if (!src_file.file)
@@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EXDEV;
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
- ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+ olen, 0);
+ if (cloned < 0)
+ ret = cloned;
+ else if (olen && cloned != olen)
+ ret = -EINVAL;
+ else
+ ret = 0;
fdput:
fdput(src_file);
return ret;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4288a6ecaf75..46d691ba04bc 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_allocate(filep, offset, len);
}
-static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
- struct file *dst_file, loff_t dst_off, u64 count)
+static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, loff_t count,
+ unsigned int remap_flags)
{
struct inode *dst_inode = file_inode(dst_file);
struct nfs_server *server = NFS_SERVER(dst_inode);
@@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
bool same_inode = false;
int ret;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
+ return -EINVAL;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
@@ -240,7 +244,7 @@ out_unlock:
inode_unlock(src_inode);
}
out:
- return ret;
+ return ret < 0 ? ret : count;
}
#endif /* CONFIG_NFS_V4_2 */
@@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = {
.copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate,
- .clone_file_range = nfs42_clone_file_range,
+ .remap_file_range = nfs42_remap_file_range,
#else
.llseek = nfs_file_llseek,
#endif
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fb28be653014..eb67098117b4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
- return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
- count));
+ loff_t cloned;
+
+ cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+ if (count && cloned != count)
+ cloned = -EINVAL;
+ return nfserrno(cloned < 0 ? cloned : 0);
}
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9fa35cb6f6e0..fe570824b991 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2527,24 +2527,79 @@ out:
return offset;
}
-static int ocfs2_file_clone_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
-}
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ bool same_inode = (inode_in == inode_out);
+ loff_t remapped = 0;
+ ssize_t ret;
-static int ocfs2_file_dedupe_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
-{
- return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, true);
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ /* Check file eligibility and prepare for block sharing. */
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ /* Lock out changes to the allocation maps and remap. */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data,
+ round_down(pos_out, PAGE_SIZE),
+ round_up(pos_out + len, PAGE_SIZE) - 1);
+
+ remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+ inode_out, out_bh, pos_out, len);
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (remapped < 0) {
+ ret = remapped;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return remapped > 0 ? remapped : ret;
}
const struct inode_operations ocfs2_file_iops = {
@@ -2586,8 +2641,7 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
- .clone_file_range = ocfs2_file_clone_range,
- .dedupe_file_range = ocfs2_file_dedupe_range,
+ .remap_file_range = ocfs2_remap_file_range,
};
const struct file_operations ocfs2_dops = {
@@ -2633,8 +2687,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
- .clone_file_range = ocfs2_file_clone_range,
- .dedupe_file_range = ocfs2_file_dedupe_range,
+ .remap_file_range = ocfs2_remap_file_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1114ef02e780..a35259eebc56 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4466,9 +4466,9 @@ out:
}
/* Update destination inode size, if necessary. */
-static int ocfs2_reflink_update_dest(struct inode *dest,
- struct buffer_head *d_bh,
- loff_t newlen)
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
{
handle_t *handle;
int ret;
@@ -4505,14 +4505,14 @@ out_commit:
}
/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
@@ -4520,8 +4520,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
+ loff_t remapped_bytes = 0;
loff_t pstart, plen;
- u32 p_cluster, num_clusters, slast, spos, tpos;
+ u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags;
int ret = 0;
@@ -4603,30 +4604,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode,
next_loop:
spos += num_clusters;
tpos += num_clusters;
+ remapped_clus += num_clusters;
}
-out:
- return ret;
+ goto out;
out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
- return ret;
+out:
+ remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+ remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+ return remapped_bytes > 0 ? remapped_bytes : ret;
}
/* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len)
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
{
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
- int ret;
+ loff_t ret;
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4698,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
/* Actually remap extents now. */
ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
pos_out, len, &dealloc);
- if (ret) {
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
@@ -4713,10 +4718,10 @@ out:
}
/* Lock an inode and grab a bh pointing to the inode. */
-static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
- struct buffer_head **bh1,
- struct inode *t_inode,
- struct buffer_head **bh2)
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
{
struct inode *inode1;
struct inode *inode2;
@@ -4801,10 +4806,10 @@ out_i1:
}
/* Unlock both inodes and release buffers. */
-static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
- struct buffer_head *s_bh,
- struct inode *t_inode,
- struct buffer_head *t_bh)
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
{
ocfs2_inode_unlock(s_inode, 1);
ocfs2_rw_unlock(s_inode, 1);
@@ -4816,82 +4821,3 @@ static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
}
unlock_two_nondirectories(s_inode, t_inode);
}
-
-/* Link a range of blocks from one file to another. */
-int ocfs2_reflink_remap_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
- struct buffer_head *in_bh = NULL, *out_bh = NULL;
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
-
- if (!ocfs2_refcount_tree(osb))
- return -EOPNOTSUPP;
- if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
- return -EROFS;
-
- /* Lock both files against IO */
- ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
- if (ret)
- return ret;
-
- /* Check file eligibility and prepare for block sharing. */
- ret = -EINVAL;
- if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
- (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
- goto out_unlock;
-
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- &len, is_dedupe);
- if (ret <= 0)
- goto out_unlock;
-
- /* Lock out changes to the allocation maps and remap. */
- down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
- if (!same_inode)
- down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
- SINGLE_DEPTH_NESTING);
-
- ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
- out_bh, pos_out, len);
-
- /* Zap any page cache for the destination file's range. */
- if (!ret)
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
- up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
- if (!same_inode)
- up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
- /*
- * Empty the extent map so that we may get the right extent
- * record from the disk.
- */
- ocfs2_extent_map_trunc(inode_in, 0);
- ocfs2_extent_map_trunc(inode_out, 0);
-
- ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
- ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
- return 0;
-
-out_unlock:
- ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
- return ret;
-}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 4af55bf4b35b..e9e862be4a1e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,11 +115,23 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
-int ocfs2_reflink_remap_range(struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe);
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len);
+int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2);
+void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh);
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen);
#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d6a3346e2672..9e62dcf06fc4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
+ loff_t cloned;
int error = 0;
if (len == 0)
@@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
}
/* Try to use clone_file_range to clone up within the same fs */
- error = do_clone_file_range(old_file, 0, new_file, 0, len);
- if (!error)
+ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ if (cloned == len)
goto out;
/* Couldn't clone, so now we try to copy the data */
- error = 0;
/* FIXME: copy up sparse files efficiently */
while (len) {
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 986313da0c88..84dd957efa24 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -434,14 +434,14 @@ enum ovl_copyop {
OVL_DEDUPE,
};
-static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
+static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
- u64 len, unsigned int flags, enum ovl_copyop op)
+ loff_t len, unsigned int flags, enum ovl_copyop op)
{
struct inode *inode_out = file_inode(file_out);
struct fd real_in, real_out;
const struct cred *old_cred;
- ssize_t ret;
+ loff_t ret;
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
@@ -462,12 +462,13 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in,
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len, flags);
break;
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
- real_out.file, pos_out, len);
+ real_out.file, pos_out, len,
+ flags);
break;
}
revert_creds(old_cred);
@@ -489,26 +490,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
OVL_COPY);
}
-static int ovl_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_CLONE);
-}
+ enum ovl_copyop op;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ op = OVL_DEDUPE;
+ else
+ op = OVL_CLONE;
-static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
-{
/*
* Don't copy up because of a dedupe request, this wouldn't make sense
* most of the time (data would be duplicated instead of deduplicated).
*/
- if (!ovl_inode_upper(file_inode(file_in)) ||
- !ovl_inode_upper(file_inode(file_out)))
+ if (op == OVL_DEDUPE &&
+ (!ovl_inode_upper(file_inode(file_in)) ||
+ !ovl_inode_upper(file_inode(file_out))))
return -EPERM;
- return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
- OVL_DEDUPE);
+ return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+ remap_flags, op);
}
const struct file_operations ovl_file_operations = {
@@ -525,6 +531,5 @@ const struct file_operations ovl_file_operations = {
.compat_ioctl = ovl_compat_ioctl,
.copy_file_range = ovl_copy_file_range,
- .clone_file_range = ovl_clone_file_range,
- .dedupe_file_range = ovl_dedupe_file_range,
+ .remap_file_range = ovl_remap_file_range,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index 5a2ee488c5d2..bfcb4ced5664 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1587,11 +1587,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
- if (file_in->f_op->clone_file_range) {
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
- file_out, pos_out, len);
- if (ret == 0) {
- ret = len;
+ if (file_in->f_op->remap_file_range) {
+ loff_t cloned;
+
+ cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out,
+ min_t(loff_t, MAX_RW_COUNT, len),
+ REMAP_FILE_CAN_SHORTEN);
+ if (cloned > 0) {
+ ret = cloned;
goto done;
}
}
@@ -1685,11 +1689,12 @@ out2:
return ret;
}
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+ bool write)
{
struct inode *inode = file_inode(file);
- if (unlikely(pos < 0))
+ if (unlikely(pos < 0 || len < 0))
return -EINVAL;
if (unlikely((loff_t) (pos + len) < 0))
@@ -1707,22 +1712,150 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else. Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ *
+ * Shorten the request if possible.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+ struct inode *inode_out,
+ loff_t pos_out,
+ loff_t *len,
+ unsigned int remap_flags)
+{
+ u64 blkmask = i_blocksize(inode_in) - 1;
+ loff_t new_len = *len;
+
+ if ((*len & blkmask) == 0)
+ return 0;
+
+ if ((remap_flags & REMAP_FILE_DEDUP) ||
+ pos_out + *len < i_size_read(inode_out))
+ new_len &= ~blkmask;
+
+ if (new_len == *len)
+ return 0;
+
+ if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+ *len = new_len;
+ return 0;
+ }
+
+ return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+}
+
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+ struct page *page;
+
+ page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0)
+ goto out_error;
+
+ src_page = vfs_dedupe_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = vfs_dedupe_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
/*
* Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the
* inodes have been locked against any other modifications.
*
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
- * the usual negative error code.
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
*/
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
- struct inode *inode_out, loff_t pos_out,
- u64 *len, bool is_dedupe)
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
{
- loff_t bs = inode_out->i_sb->s_blocksize;
- loff_t blen;
- loff_t isize;
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
@@ -1739,50 +1872,24 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
- /* Are we going all the way to the end? */
- isize = i_size_read(inode_in);
- if (isize == 0)
- return 0;
-
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
- if (is_dedupe || pos_in == isize)
+ loff_t isize = i_size_read(inode_in);
+
+ if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
+ if (*len == 0)
+ return 0;
}
- /* Ensure offsets don't wrap and the input is inside i_size */
- if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
- pos_in + *len > isize)
- return -EINVAL;
-
- /* Don't allow dedupe past EOF in the dest file */
- if (is_dedupe) {
- loff_t disize;
-
- disize = i_size_read(inode_out);
- if (pos_out >= disize || pos_out + *len > disize)
- return -EINVAL;
- }
-
- /* If we're linking to EOF, continue to the block boundary. */
- if (pos_in + *len == isize)
- blen = ALIGN(isize, bs) - pos_in;
- else
- blen = *len;
-
- /* Only reflink if we're aligned to block boundaries */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
- !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
- return -EINVAL;
-
- /* Don't allow overlapped reflink within the same file */
- if (same_inode) {
- if (pos_out + blen > pos_in && pos_out < pos_in + blen)
- return -EINVAL;
- }
+ /* Check that we don't violate system file offset limits. */
+ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
@@ -1802,7 +1909,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
/*
* Check that the extents are the same.
*/
- if (is_dedupe) {
+ if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
@@ -1813,16 +1920,43 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
return -EBADE;
}
- return 1;
+ ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+ remap_flags);
+ if (ret)
+ return ret;
+
+ /* If can't alter the file contents, we're done. */
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ /* Update the timestamps, since we can alter file contents. */
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+ ret = file_update_time(file_out);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Clear the security bits if the process is not being run by
+ * root. This keeps people from modifying setuid and setgid
+ * binaries.
+ */
+ ret = file_remove_privs(file_out);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+EXPORT_SYMBOL(generic_remap_file_range_prep);
-int do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
- int ret;
+ loff_t ret;
+
+ WARN_ON_ONCE(remap_flags);
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
@@ -1842,140 +1976,43 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in,
(file_out->f_flags & O_APPEND))
return -EBADF;
- if (!file_in->f_op->clone_file_range)
+ if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
- ret = clone_verify_area(file_in, pos_in, len, false);
+ ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
- ret = clone_verify_area(file_out, pos_out, len, true);
+ ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
- if (pos_in + len > i_size_read(inode_in))
- return -EINVAL;
-
- ret = file_in->f_op->clone_file_range(file_in, pos_in,
- file_out, pos_out, len);
- if (!ret) {
- fsnotify_access(file_in);
- fsnotify_modify(file_out);
- }
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
+ file_out, pos_out, len, remap_flags);
+ if (ret < 0)
+ return ret;
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
return ret;
}
EXPORT_SYMBOL(do_clone_file_range);
-int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
- int ret;
+ loff_t ret;
file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+ ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+ remap_flags);
file_end_write(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
-/*
- * Read a page's worth of file data into the page cache. Return the page
- * locked.
- */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
- struct address_space *mapping;
- struct page *page;
- pgoff_t n;
-
- n = offset >> PAGE_SHIFT;
- mapping = inode->i_mapping;
- page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- lock_page(page);
- return page;
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
- loff_t src_poff;
- loff_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- loff_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- if (cmp_len <= 0)
- goto out_error;
-
- src_page = vfs_dedupe_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = vfs_dedupe_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- unlock_page(src_page);
- put_page(src_page);
- goto out_error;
- }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
-
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
- unlock_page(dest_page);
- unlock_page(src_page);
- put_page(dest_page);
- put_page(src_page);
-
- if (!same)
- break;
-
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
- return 0;
-
-out_error:
- return error;
-}
-EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
-
/* Check whether we are allowed to dedupe the destination file */
static bool allow_file_dedupe(struct file *file)
{
@@ -1990,16 +2027,20 @@ static bool allow_file_dedupe(struct file *file)
return false;
}
-int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos, u64 len)
+loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ struct file *dst_file, loff_t dst_pos,
+ loff_t len, unsigned int remap_flags)
{
- s64 ret;
+ loff_t ret;
+
+ WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+ REMAP_FILE_CAN_SHORTEN));
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
- ret = clone_verify_area(dst_file, dst_pos, len, true);
+ ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
@@ -2016,11 +2057,16 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EINVAL;
- if (!dst_file->f_op->dedupe_file_range)
+ if (!dst_file->f_op->remap_file_range)
goto out_drop_write;
- ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
- dst_file, dst_pos, len);
+ if (len == 0) {
+ ret = 0;
+ goto out_drop_write;
+ }
+
+ ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+ dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
out_drop_write:
mnt_drop_write_file(dst_file);
@@ -2037,7 +2083,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
int i;
int ret;
u16 count = same->dest_count;
- int deduped;
+ loff_t deduped;
if (!(file->f_mode & FMODE_READ))
return -EINVAL;
@@ -2056,7 +2102,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;
- ret = clone_verify_area(file, off, len, false);
+ ret = remap_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;
@@ -2088,7 +2134,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
- info->dest_offset, len);
+ info->dest_offset, len,
+ REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 61a5ad2600e8..53c9ab8fb777 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -919,28 +919,67 @@ out_unlock:
return error;
}
-STATIC int
-xfs_file_clone_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
-{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
-}
-STATIC int
-xfs_file_dedupe_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+loff_t
+xfs_file_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t len,
+ unsigned int remap_flags)
{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, true);
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t remapped = 0;
+ xfs_extlen_t cowextsize;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+ &remapped);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_d.di_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+ remap_flags);
+
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return remapped > 0 ? remapped : ret;
}
STATIC int
@@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
- .clone_file_range = xfs_file_clone_range,
- .dedupe_file_range = xfs_file_dedupe_range,
+ .remap_file_range = xfs_file_remap_range,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 8eaeec9d58ed..ecdb086bc23e 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -913,18 +913,18 @@ out_error:
/*
* Update destination inode size & cowextsize hint, if necessary.
*/
-STATIC int
+int
xfs_reflink_update_dest(
struct xfs_inode *dest,
xfs_off_t newlen,
xfs_extlen_t cowextsize,
- bool is_dedupe)
+ unsigned int remap_flags)
{
struct xfs_mount *mp = dest->i_mount;
struct xfs_trans *tp;
int error;
- if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
return 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -945,10 +945,6 @@ xfs_reflink_update_dest(
dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
- if (!is_dedupe) {
- xfs_trans_ichgtime(tp, dest,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
@@ -1112,19 +1108,28 @@ out:
/*
* Iteratively remap one file's extents (and holes) to another's.
*/
-STATIC int
+int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
- xfs_fileoff_t srcoff,
+ loff_t pos_in,
struct xfs_inode *dest,
- xfs_fileoff_t destoff,
- xfs_filblks_t len,
- xfs_off_t new_isize)
+ loff_t pos_out,
+ loff_t remap_len,
+ loff_t *remapped)
{
struct xfs_bmbt_irec imap;
+ xfs_fileoff_t srcoff;
+ xfs_fileoff_t destoff;
+ xfs_filblks_t len;
+ xfs_filblks_t range_len;
+ xfs_filblks_t remapped_len = 0;
+ xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- xfs_filblks_t range_len;
+
+ destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+ srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+ len = XFS_B_TO_FSB(src->i_mount, remap_len);
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
@@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks(
error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
xfs_iunlock(src, lock_mode);
if (error)
- goto err;
+ break;
ASSERT(nimaps == 1);
trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
@@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks(
error = xfs_reflink_remap_extent(dest, &imap, destoff,
new_isize);
if (error)
- goto err;
+ break;
if (fatal_signal_pending(current)) {
error = -EINTR;
- goto err;
+ break;
}
/* Advance drange/srange */
srcoff += range_len;
destoff += range_len;
len -= range_len;
+ remapped_len += range_len;
}
- return 0;
-
-err:
- trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ *remapped = min_t(loff_t, remap_len,
+ XFS_FSB_TO_B(src->i_mount, remapped_len));
return error;
}
@@ -1218,7 +1224,7 @@ retry:
}
/* Unlock both inodes after they've been prepped for a range clone. */
-STATIC void
+void
xfs_reflink_remap_unlock(
struct file *file_in,
struct file *file_out)
@@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof(
* stale data in the destination file. Hence we reject these clone attempts with
* -EINVAL in this case.
*/
-STATIC int
+int
xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 *len,
- bool is_dedupe)
+ loff_t *len,
+ unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out);
- u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
/* Lock both files against IO */
@@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- len, is_dedupe);
- if (ret <= 0)
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+ if (ret < 0 || *len == 0)
goto out_unlock;
- /*
- * If the dedupe data matches, chop off the partial EOF block
- * from the source file so we don't try to dedupe the partial
- * EOF block.
- */
- if (is_dedupe) {
- *len &= ~blkmask;
- } else if (*len & blkmask) {
- /*
- * The user is attempting to share a partial EOF block,
- * if it's inside the destination EOF then reject it.
- */
- if (pos_out + *len < i_size_read(inode_out)) {
- ret = -EINVAL;
- goto out_unlock;
- }
- }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
@@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep(
goto out_unlock;
/* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + *len) - 1);
-
- /* If we're altering the file contents... */
- if (!is_dedupe) {
- /*
- * ...update the timestamps (which will grab the ilock again
- * from xfs_fs_dirty_inode, so we have to call it before we
- * take the ilock).
- */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- goto out_unlock;
- }
-
- /*
- * ...clear the security bits if the process is not being run
- * by root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- goto out_unlock;
- }
+ truncate_inode_pages_range(&inode_out->i_data,
+ round_down(pos_out, PAGE_SIZE),
+ round_up(pos_out + *len, PAGE_SIZE) - 1);
return 1;
out_unlock:
@@ -1398,72 +1363,6 @@ out_unlock:
}
/*
- * Link a range of blocks from one file to another.
- */
-int
-xfs_reflink_remap_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
- ssize_t ret;
-
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- /* Prepare and then clone file data. */
- ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
- &len, is_dedupe);
- if (ret <= 0)
- return ret;
-
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
- dfsbno = XFS_B_TO_FSBT(mp, pos_out);
- sfsbno = XFS_B_TO_FSBT(mp, pos_in);
- fsblen = XFS_B_TO_FSB(mp, len);
- ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
- pos_out + len);
- if (ret)
- goto out_unlock;
-
- /*
- * Carry the cowextsize hint from src to dest if we're sharing the
- * entire source file to the entire destination file, the source file
- * has a cowextsize hint, and the destination file does not.
- */
- cowextsize = 0;
- if (pos_in == 0 && len == i_size_read(inode_in) &&
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- pos_out == 0 && len >= i_size_read(inode_out) &&
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- cowextsize = src->i_d.di_cowextsize;
-
- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
- is_dedupe);
-
-out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
- if (ret)
- trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
- return ret;
-}
-
-/*
* The user wants to preemptively CoW all shared blocks in this file,
* which enables us to turn off the reflink flag. Iterate all
* extents which are not prealloc/delalloc to see which ranges are
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 7f47202b5639..6d73daef1f13 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t len,
+ unsigned int remap_flags);
extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
+extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t *len,
+ unsigned int remap_flags);
+extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+ struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+ loff_t *remapped);
+extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+ xfs_extlen_t cowextsize, unsigned int remap_flags);
+extern void xfs_reflink_remap_unlock(struct file *file_in,
+ struct file *file_out);
#endif /* __XFS_REFLINK_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8252df30b9a1..c95c0807471f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1752,6 +1752,25 @@ struct block_device_operations;
#define NOMMU_VMFLAGS \
(NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
+/*
+ * These flags control the behavior of the remap_file_range function pointer.
+ * If it is called with len == 0 that means "remap to end of source file".
+ * See Documentation/filesystems/vfs.txt for more details about this call.
+ *
+ * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
+ * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
+ */
+#define REMAP_FILE_DEDUP (1 << 0)
+#define REMAP_FILE_CAN_SHORTEN (1 << 1)
+
+/*
+ * These flags signal that the caller is ok with altering various aspects of
+ * the behavior of the remap operation. The changes must be made by the
+ * implementation; the vfs remap helper functions can take advantage of them.
+ * Flags in this category exist to preserve the quirky behavior of the hoisted
+ * btrfs clone/dedupe ioctls.
+ */
+#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
struct iov_iter;
@@ -1790,10 +1809,9 @@ struct file_operations {
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
- int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
- u64);
- int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
- u64);
+ loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
@@ -1856,21 +1874,21 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
-extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
- struct inode *inode_out, loff_t pos_out,
- u64 *len, bool is_dedupe);
-extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len);
-extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len);
-extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same);
+extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *count,
+ unsigned int remap_flags);
+extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
struct file_dedupe_range *same);
-extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- struct file *dst_file, loff_t dst_pos,
- u64 len);
+extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ struct file *dst_file, loff_t dst_pos,
+ loff_t len, unsigned int remap_flags);
struct super_operations {
@@ -2998,6 +3016,9 @@ extern int sb_min_blocksize(struct super_block *, int);
extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *count, unsigned int remap_flags);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1fe6c4c37a35..81adec8ee02c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2825,6 +2825,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
EXPORT_SYMBOL(read_cache_page_gfp);
/*
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
+ */
+static int generic_access_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+ *count = min(*count, max_size - pos);
+ return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ *count = min(*count, limit - pos);
+ }
+
+ return generic_access_check_limits(file, pos, count);
+}
+
+/*
* Performs necessary checks before doing a write
*
* Can adjust writing position or amount of bytes to write.
@@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- unsigned long limit = rlimit(RLIMIT_FSIZE);
- loff_t pos;
+ loff_t count;
+ int ret;
if (!iov_iter_count(from))
return 0;
@@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- if (limit != RLIM_INFINITY) {
- if (iocb->ki_pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- iov_iter_truncate(from, limit - (unsigned long)pos);
- }
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ if (ret)
+ return ret;
+
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust amount of bytes to clone.
+ * Returns appropriate error code that caller should return or
+ * zero in case the clone should be allowed.
+ */
+int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *req_count, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
+ uint64_t count = *req_count;
+ uint64_t bcount;
+ loff_t size_in, size_out;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ int ret;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EINVAL;
+
+ size_in = i_size_read(inode_in);
+ size_out = i_size_read(inode_out);
+
+ /* Dedupe requires both ranges to be within EOF. */
+ if ((remap_flags & REMAP_FILE_DEDUP) &&
+ (pos_in >= size_in || pos_in + count > size_in ||
+ pos_out >= size_out || pos_out + count > size_out))
+ return -EINVAL;
+
+ /* Ensure the infile range is within the infile. */
+ if (pos_in >= size_in)
+ return -EINVAL;
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_access_check_limits(file_in, pos_in, &count);
+ if (ret)
+ return ret;
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
/*
- * LFS rule
+ * If the user wanted us to link to the infile's EOF, round up to the
+ * next block boundary for this check.
+ *
+ * Otherwise, make sure the count is also block-aligned, having
+ * already confirmed the starting offsets' block alignment.
*/
- if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
- !(file->f_flags & O_LARGEFILE))) {
- if (pos >= MAX_NON_LFS)
- return -EFBIG;
- iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+ if (pos_in + count == size_in) {
+ bcount = ALIGN(size_in, bs) - pos_in;
+ } else {
+ if (!IS_ALIGNED(count, bs))
+ count = ALIGN_DOWN(count, bs);
+ bcount = count;
}
+ /* Don't allow overlapped cloning within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + bcount > pos_in &&
+ pos_out < pos_in + bcount)
+ return -EINVAL;
+
/*
- * Are we about to exceed the fs block limit ?
- *
- * If we have written data it becomes a short write. If we have
- * exceeded without writing data we send a signal and return EFBIG.
- * Linus frestrict idea will clean these up nicely..
+ * We shortened the request but the caller can't deal with that, so
+ * bounce the request back to userspace.
*/
- if (unlikely(pos >= inode->i_sb->s_maxbytes))
- return -EFBIG;
+ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return -EINVAL;
- iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
- return iov_iter_count(from);
+ *req_count = count;
+ return 0;
}
-EXPORT_SYMBOL(generic_write_checks);
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,