From 0b0a0806b0d8635e046bf533225a25903b1cddce Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Feb 2009 20:51:52 +0000 Subject: shmem: fix shared anonymous accounting Each time I exit Firefox, /proc/meminfo's Committed_AS goes down almost 400 kB: OVERCOMMIT_NEVER would be allowing overcommits it should prohibit. Commit fc8744adc870a8d4366908221508bb113d8b72ee "Stop playing silly games with the VM_ACCOUNT flag" changed shmem_file_setup() to set the shmem file's VM_ACCOUNT flag according to VM_NORESERVE not being set in the vma flags; but did so only _after_ the shmem_acct_size(flags, size) call which is expected to pre-account a shared anonymous object. It's all clearer if we switch shmem.c over to use VM_NORESERVE throughout in place of !VM_ACCOUNT. But I very nearly sent in a patch which mistakenly removed the accounting from tmpfs files: shmem_get_inode()'s memset was good for not setting VM_ACCOUNT, but now it needs to set VM_NORESERVE. Rather than setting that by default, then perhaps clearing it again in shmem_file_setup(), let's pass it as a flag to shmem_get_inode(): that allows us to remove the #ifdef CONFIG_SHMEM from shmem_file_setup(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 19d566ccdeea..4103a239ce84 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -169,13 +169,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_ACCOUNT) ? - security_vm_enough_memory_kern(VM_ACCT(size)) : 0; + return (flags & VM_NORESERVE) ? + 0 : security_vm_enough_memory_kern(VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (flags & VM_ACCOUNT) + if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } @@ -187,13 +187,13 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) */ static inline int shmem_acct_block(unsigned long flags) { - return (flags & VM_ACCOUNT) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); + return (flags & VM_NORESERVE) ? + security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_ACCOUNT)) + if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); } @@ -1515,8 +1515,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode * -shmem_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *shmem_get_inode(struct super_block *sb, int mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1537,6 +1537,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); switch (mode & S_IFMT) { @@ -1779,9 +1780,10 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -ENOSPC; + inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); @@ -1920,7 +1922,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; @@ -2332,7 +2334,7 @@ static int shmem_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0); + inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -2574,12 +2576,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } -#define shmem_file_operations ramfs_file_operations -#define shmem_vm_ops generic_file_vm_ops -#define shmem_get_inode ramfs_get_inode -#define shmem_acct_size(a, b) 0 -#define shmem_unacct_size(a, b) do {} while (0) -#define SHMEM_MAX_BYTES LLONG_MAX +#define shmem_vm_ops generic_file_vm_ops +#define shmem_file_operations ramfs_file_operations +#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) +#define shmem_acct_size(flags, size) 0 +#define shmem_unacct_size(flags, size) do {} while (0) +#define SHMEM_MAX_BYTES LLONG_MAX #endif /* CONFIG_SHMEM */ @@ -2589,7 +2591,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps * @size: size to be set for the file - * @flags: vm_flags + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -2623,13 +2625,10 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto put_dentry; error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto close_file; -#ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; -#endif d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ -- cgit v1.2.3-55-g7522 From 7766970cc13e9071b356b1f2a48a9eb8675bfcce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 27 Feb 2009 14:03:03 -0800 Subject: mm: vmap fix overflow The new vmap allocator can wrap the address and get confused in the case of large allocations or VMALLOC_END near the end of address space. Problem reported by Christoph Hellwig on a 32-bit XFS workload. Signed-off-by: Nick Piggin Reported-by: Christoph Hellwig Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 903cad46e796..ed3705e4b83f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -323,6 +323,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long addr; int purged = 0; + BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); va = kmalloc_node(sizeof(struct vmap_area), @@ -334,6 +335,9 @@ retry: addr = ALIGN(vstart, align); spin_lock(&vmap_area_lock); + if (addr + size - 1 < addr) + goto overflow; + /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; if (n) { @@ -365,6 +369,8 @@ retry: while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; n = rb_next(&first->rb_node); if (n) @@ -375,6 +381,7 @@ retry: } found: if (addr + size > vend) { +overflow: spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); -- cgit v1.2.3-55-g7522 From cbb766766f3f2f6d9326c561b1020590642c6e39 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Feb 2009 14:03:04 -0800 Subject: mm: fix lazy vmap purging (use-after-free error) I just got this new warning from kmemcheck: WARNING: kmemcheck: Caught 32-bit read from freed memory (c7806a60) a06a80c7ecde70c1a04080c700000000a06709c1000000000000000000000000 f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f ^ Pid: 0, comm: swapper Not tainted (2.6.29-rc4 #230) EIP: 0060:[] EFLAGS: 00000286 CPU: 0 EIP is at __purge_vmap_area_lazy+0x117/0x140 EAX: 00070f43 EBX: c7806a40 ECX: c1677080 EDX: 00027b66 ESI: 00002001 EDI: c170df0c EBP: c170df00 ESP: c178830c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: c7806b14 CR3: 01775000 CR4: 00000690 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: 00004000 DR7: 00000000 [] free_unmap_vmap_area_noflush+0x6e/0x70 [] remove_vm_area+0x2a/0x70 [] __vunmap+0x45/0xe0 [] vunmap+0x1e/0x30 [] text_poke+0x95/0x150 [] alternatives_smp_unlock+0x49/0x60 [] alternative_instructions+0x11b/0x124 [] check_bugs+0xbd/0xdc [] start_kernel+0x2ed/0x360 [] __init_begin+0x9e/0xa9 [] 0xffffffff It happened here: $ addr2line -e vmlinux -i c1096df7 mm/vmalloc.c:540 Code: list_for_each_entry(va, &valist, purge_list) __free_vmap_area(va); It's this instruction: mov 0x20(%ebx),%edx Which corresponds to a dereference of va->purge_list.next: (gdb) p ((struct vmap_area *) 0)->purge_list.next Cannot access memory at address 0x20 It seems that we should use "safe" list traversal here, as the element is freed inside the loop. Please verify that this is the right fix. Acked-by: Nick Piggin Signed-off-by: Vegard Nossum Cc: Pekka Enberg Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ed3705e4b83f..520a75980269 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -505,6 +505,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; + struct vmap_area *n_va; int nr = 0; /* @@ -544,7 +545,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry(va, &valist, purge_list) + list_for_each_entry_safe(va, n_va, &valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } -- cgit v1.2.3-55-g7522 From f180053694b43d5714bf56cb95499a3c32ff155c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Mar 2009 11:00:57 +0100 Subject: x86, mm: dont use non-temporal stores in pagecache accesses Impact: standardize IO on cached ops On modern CPUs it is almost always a bad idea to use non-temporal stores, as the regression in this commit has shown it: 30d697f: x86: fix performance regression in write() syscall The kernel simply has no good information about whether using non-temporal stores is a good idea or not - and trying to add heuristics only increases complexity and inserts fragility. The regression on cached write()s took very long to be found - over two years. So dont take any chances and let the hardware decide how it makes use of its caches. The only exception is drivers/gpu/drm/i915/i915_gem.c: there were we are absolutely sure that another entity (the GPU) will pick up the dirty data immediately and that the CPU will not touch that data before the GPU will. Also, keep the _nocache() primitives to make it easier for people to experiment with these details. There may be more clear-cut cases where non-cached copies can be used, outside of filemap.c. Cc: Salman Qazi Cc: Nick Piggin Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess_32.h | 4 ++-- arch/x86/include/asm/uaccess_64.h | 25 +++++++------------------ drivers/gpu/drm/i915/i915_gem.c | 2 +- include/linux/uaccess.h | 4 ++-- mm/filemap.c | 11 ++++------- mm/filemap_xip.c | 2 +- 6 files changed, 17 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index a0ba61386972..5e06259e90e5 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) } static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { might_fault(); if (__builtin_constant_p(n)) { @@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, - unsigned long n, unsigned long total) + unsigned long n) { return __copy_from_user_ll_nocache_nozero(to, from, n); } diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index dcaa0404cf7b..8cc687326eb8 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); -static inline int __copy_from_user_nocache(void *dst, const void __user *src, - unsigned size, unsigned long total) +static inline int +__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_sleep(); - /* - * In practice this limit means that large file write()s - * which get chunked to 4K copies get handled via - * non-temporal stores here. Smaller writes get handled - * via regular __copy_from_user(): - */ - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 1); - else - return __copy_from_user(dst, src, size); + return __copy_user_nocache(dst, src, size, 1); } -static inline int __copy_from_user_inatomic_nocache(void *dst, - const void __user *src, unsigned size, unsigned total) +static inline int +__copy_from_user_inatomic_nocache(void *dst, const void __user *src, + unsigned size) { - if (likely(total >= PAGE_SIZE)) - return __copy_user_nocache(dst, src, size, 0); - else - return __copy_from_user_inatomic(dst, src, size); + return __copy_user_nocache(dst, src, size, 0); } unsigned long diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6b209db8370d..818576654092 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -215,7 +215,7 @@ fast_user_write(struct io_mapping *mapping, vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base); unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset, - user_data, length, length); + user_data, length); io_mapping_unmap_atomic(vaddr_atomic); if (unwritten) return -EFAULT; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 6f3c603b0d67..6b58367d145e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -41,13 +41,13 @@ static inline void pagefault_enable(void) #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n, unsigned long total) + const void __user *from, unsigned long n) { return __copy_from_user(to, from, n); } diff --git a/mm/filemap.c b/mm/filemap.c index 60fd56772cc6..126d3973b3d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { - size_t copied = 0, left = 0, total = bytes; + size_t copied = 0, left = 0; while (bytes) { char __user *buf = iov->iov_base + base; int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total); + left = __copy_from_user_inatomic(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1851,9 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes, bytes); + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, @@ -1881,8 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page, if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; - - left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes); + left = __copy_from_user(kaddr + offset, buf, bytes); copied = bytes - left; } else { copied = __iovec_copy_from_user_inatomic(kaddr + offset, diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index bf54f8a2cf1d..0c04615651b7 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user *buf, break; copied = bytes - - __copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; -- cgit v1.2.3-55-g7522