From 0b0a0806b0d8635e046bf533225a25903b1cddce Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 24 Feb 2009 20:51:52 +0000
Subject: shmem: fix shared anonymous accounting

Each time I exit Firefox, /proc/meminfo's Committed_AS goes down almost
400 kB: OVERCOMMIT_NEVER would be allowing overcommits it should
prohibit.

Commit fc8744adc870a8d4366908221508bb113d8b72ee "Stop playing silly
games with the VM_ACCOUNT flag" changed shmem_file_setup() to set the
shmem file's VM_ACCOUNT flag according to VM_NORESERVE not being set in
the vma flags; but did so only _after_ the shmem_acct_size(flags, size)
call which is expected to pre-account a shared anonymous object.

It's all clearer if we switch shmem.c over to use VM_NORESERVE
throughout in place of !VM_ACCOUNT.

But I very nearly sent in a patch which mistakenly removed the
accounting from tmpfs files: shmem_get_inode()'s memset was good for not
setting VM_ACCOUNT, but now it needs to set VM_NORESERVE.

Rather than setting that by default, then perhaps clearing it again in
shmem_file_setup(), let's pass it as a flag to shmem_get_inode(): that
allows us to remove the #ifdef CONFIG_SHMEM from shmem_file_setup().

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 19d566ccdeea..4103a239ce84 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -169,13 +169,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  */
 static inline int shmem_acct_size(unsigned long flags, loff_t size)
 {
-	return (flags & VM_ACCOUNT) ?
-		security_vm_enough_memory_kern(VM_ACCT(size)) : 0;
+	return (flags & VM_NORESERVE) ?
+		0 : security_vm_enough_memory_kern(VM_ACCT(size));
 }
 
 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 {
-	if (flags & VM_ACCOUNT)
+	if (!(flags & VM_NORESERVE))
 		vm_unacct_memory(VM_ACCT(size));
 }
 
@@ -187,13 +187,13 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
  */
 static inline int shmem_acct_block(unsigned long flags)
 {
-	return (flags & VM_ACCOUNT) ?
-		0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE));
+	return (flags & VM_NORESERVE) ?
+		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
 }
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 {
-	if (!(flags & VM_ACCOUNT))
+	if (flags & VM_NORESERVE)
 		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
 }
 
@@ -1515,8 +1515,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static struct inode *
-shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
+static struct inode *shmem_get_inode(struct super_block *sb, int mode,
+					dev_t dev, unsigned long flags)
 {
 	struct inode *inode;
 	struct shmem_inode_info *info;
@@ -1537,6 +1537,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
+		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->swaplist);
 
 		switch (mode & S_IFMT) {
@@ -1779,9 +1780,10 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int
 shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 {
-	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
+	struct inode *inode;
 	int error = -ENOSPC;
 
+	inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE);
 	if (inode) {
 		error = security_inode_init_security(inode, dir, NULL, NULL,
 						     NULL);
@@ -1920,7 +1922,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	if (len > PAGE_CACHE_SIZE)
 		return -ENAMETOOLONG;
 
-	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
 	if (!inode)
 		return -ENOSPC;
 
@@ -2332,7 +2334,7 @@ static int shmem_fill_super(struct super_block *sb,
 	sb->s_flags |= MS_POSIXACL;
 #endif
 
-	inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0);
+	inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
 	if (!inode)
 		goto failed;
 	inode->i_uid = sbinfo->uid;
@@ -2574,12 +2576,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 	return 0;
 }
 
-#define shmem_file_operations ramfs_file_operations
-#define shmem_vm_ops generic_file_vm_ops
-#define shmem_get_inode ramfs_get_inode
-#define shmem_acct_size(a, b) 0
-#define shmem_unacct_size(a, b) do {} while (0)
-#define SHMEM_MAX_BYTES LLONG_MAX
+#define shmem_vm_ops				generic_file_vm_ops
+#define shmem_file_operations			ramfs_file_operations
+#define shmem_get_inode(sb, mode, dev, flags)	ramfs_get_inode(sb, mode, dev)
+#define shmem_acct_size(flags, size)		0
+#define shmem_unacct_size(flags, size)		do {} while (0)
+#define SHMEM_MAX_BYTES				LLONG_MAX
 
 #endif /* CONFIG_SHMEM */
 
@@ -2589,7 +2591,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
  * shmem_file_setup - get an unlinked file living in tmpfs
  * @name: name for dentry (to be seen in /proc/<pid>/maps
  * @size: size to be set for the file
- * @flags: vm_flags
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
  */
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 {
@@ -2623,13 +2625,10 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto put_dentry;
 
 	error = -ENOSPC;
-	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
 	if (!inode)
 		goto close_file;
 
-#ifdef CONFIG_SHMEM
-	SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT;
-#endif
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
-- 
cgit v1.2.3-55-g7522


From 7766970cc13e9071b356b1f2a48a9eb8675bfcce Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Fri, 27 Feb 2009 14:03:03 -0800
Subject: mm: vmap fix overflow

The new vmap allocator can wrap the address and get confused in the case
of large allocations or VMALLOC_END near the end of address space.

Problem reported by Christoph Hellwig on a 32-bit XFS workload.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reported-by: Christoph Hellwig <hch@lst.de>
Cc: <stable@kernel.org>		[2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 903cad46e796..ed3705e4b83f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -323,6 +323,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	unsigned long addr;
 	int purged = 0;
 
+	BUG_ON(!size);
 	BUG_ON(size & ~PAGE_MASK);
 
 	va = kmalloc_node(sizeof(struct vmap_area),
@@ -334,6 +335,9 @@ retry:
 	addr = ALIGN(vstart, align);
 
 	spin_lock(&vmap_area_lock);
+	if (addr + size - 1 < addr)
+		goto overflow;
+
 	/* XXX: could have a last_hole cache */
 	n = vmap_area_root.rb_node;
 	if (n) {
@@ -365,6 +369,8 @@ retry:
 
 		while (addr + size > first->va_start && addr + size <= vend) {
 			addr = ALIGN(first->va_end + PAGE_SIZE, align);
+			if (addr + size - 1 < addr)
+				goto overflow;
 
 			n = rb_next(&first->rb_node);
 			if (n)
@@ -375,6 +381,7 @@ retry:
 	}
 found:
 	if (addr + size > vend) {
+overflow:
 		spin_unlock(&vmap_area_lock);
 		if (!purged) {
 			purge_vmap_area_lazy();
-- 
cgit v1.2.3-55-g7522


From cbb766766f3f2f6d9326c561b1020590642c6e39 Mon Sep 17 00:00:00 2001
From: Vegard Nossum
Date: Fri, 27 Feb 2009 14:03:04 -0800
Subject: mm: fix lazy vmap purging (use-after-free error)

I just got this new warning from kmemcheck:

    WARNING: kmemcheck: Caught 32-bit read from freed memory (c7806a60)
    a06a80c7ecde70c1a04080c700000000a06709c1000000000000000000000000
     f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f
     ^

    Pid: 0, comm: swapper Not tainted (2.6.29-rc4 #230)
    EIP: 0060:[<c1096df7>] EFLAGS: 00000286 CPU: 0
    EIP is at __purge_vmap_area_lazy+0x117/0x140
    EAX: 00070f43 EBX: c7806a40 ECX: c1677080 EDX: 00027b66
    ESI: 00002001 EDI: c170df0c EBP: c170df00 ESP: c178830c
     DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
    CR0: 80050033 CR2: c7806b14 CR3: 01775000 CR4: 00000690
    DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
    DR6: 00004000 DR7: 00000000
     [<c1096f3e>] free_unmap_vmap_area_noflush+0x6e/0x70
     [<c1096f6a>] remove_vm_area+0x2a/0x70
     [<c1097025>] __vunmap+0x45/0xe0
     [<c10970de>] vunmap+0x1e/0x30
     [<c1008ba5>] text_poke+0x95/0x150
     [<c1008ca9>] alternatives_smp_unlock+0x49/0x60
     [<c171ef47>] alternative_instructions+0x11b/0x124
     [<c171f991>] check_bugs+0xbd/0xdc
     [<c17148c5>] start_kernel+0x2ed/0x360
     [<c171409e>] __init_begin+0x9e/0xa9
     [<ffffffff>] 0xffffffff

It happened here:

    $ addr2line -e vmlinux -i c1096df7
    mm/vmalloc.c:540

Code:

	list_for_each_entry(va, &valist, purge_list)
		__free_vmap_area(va);

It's this instruction:

    mov    0x20(%ebx),%edx

Which corresponds to a dereference of va->purge_list.next:

    (gdb) p ((struct vmap_area *) 0)->purge_list.next
    Cannot access memory at address 0x20

It seems that we should use "safe" list traversal here, as the element
is freed inside the loop. Please verify that this is the right fix.

Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: <stable@kernel.org>		[2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ed3705e4b83f..520a75980269 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -505,6 +505,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 	static DEFINE_SPINLOCK(purge_lock);
 	LIST_HEAD(valist);
 	struct vmap_area *va;
+	struct vmap_area *n_va;
 	int nr = 0;
 
 	/*
@@ -544,7 +545,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 
 	if (nr) {
 		spin_lock(&vmap_area_lock);
-		list_for_each_entry(va, &valist, purge_list)
+		list_for_each_entry_safe(va, n_va, &valist, purge_list)
 			__free_vmap_area(va);
 		spin_unlock(&vmap_area_lock);
 	}
-- 
cgit v1.2.3-55-g7522


From f180053694b43d5714bf56cb95499a3c32ff155c Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 2 Mar 2009 11:00:57 +0100
Subject: x86, mm: dont use non-temporal stores in pagecache accesses

Impact: standardize IO on cached ops

On modern CPUs it is almost always a bad idea to use non-temporal stores,
as the regression in this commit has shown it:

  30d697f: x86: fix performance regression in write() syscall

The kernel simply has no good information about whether using non-temporal
stores is a good idea or not - and trying to add heuristics only increases
complexity and inserts fragility.

The regression on cached write()s took very long to be found - over two
years. So dont take any chances and let the hardware decide how it makes
use of its caches.

The only exception is drivers/gpu/drm/i915/i915_gem.c: there were we are
absolutely sure that another entity (the GPU) will pick up the dirty
data immediately and that the CPU will not touch that data before the
GPU will.

Also, keep the _nocache() primitives to make it easier for people to
experiment with these details. There may be more clear-cut cases where
non-cached copies can be used, outside of filemap.c.

Cc: Salman Qazi <sqazi@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uaccess_32.h |  4 ++--
 arch/x86/include/asm/uaccess_64.h | 25 +++++++------------------
 drivers/gpu/drm/i915/i915_gem.c   |  2 +-
 include/linux/uaccess.h           |  4 ++--
 mm/filemap.c                      | 11 ++++-------
 mm/filemap_xip.c                  |  2 +-
 6 files changed, 17 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index a0ba61386972..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 }
 
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
-		const void __user *from, unsigned long n, unsigned long total)
+				const void __user *from, unsigned long n)
 {
 	might_fault();
 	if (__builtin_constant_p(n)) {
@@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
 
 static __always_inline unsigned long
 __copy_from_user_inatomic_nocache(void *to, const void __user *from,
-				  unsigned long n, unsigned long total)
+				  unsigned long n)
 {
        return __copy_from_user_ll_nocache_nozero(to, from, n);
 }
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index dcaa0404cf7b..8cc687326eb8 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
-static inline int __copy_from_user_nocache(void *dst, const void __user *src,
-				   unsigned size, unsigned long total)
+static inline int
+__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
 	might_sleep();
-	/*
-	 * In practice this limit means that large file write()s
-	 * which get chunked to 4K copies get handled via
-	 * non-temporal stores here. Smaller writes get handled
-	 * via regular __copy_from_user():
-	 */
-	if (likely(total >= PAGE_SIZE))
-		return __copy_user_nocache(dst, src, size, 1);
-	else
-		return __copy_from_user(dst, src, size);
+	return __copy_user_nocache(dst, src, size, 1);
 }
 
-static inline int __copy_from_user_inatomic_nocache(void *dst,
-	    const void __user *src, unsigned size, unsigned total)
+static inline int
+__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
+				  unsigned size)
 {
-	if (likely(total >= PAGE_SIZE))
-		return __copy_user_nocache(dst, src, size, 0);
-	else
-		return __copy_from_user_inatomic(dst, src, size);
+	return __copy_user_nocache(dst, src, size, 0);
 }
 
 unsigned long
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6b209db8370d..818576654092 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -215,7 +215,7 @@ fast_user_write(struct io_mapping *mapping,
 
 	vaddr_atomic = io_mapping_map_atomic_wc(mapping, page_base);
 	unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + page_offset,
-						      user_data, length, length);
+						      user_data, length);
 	io_mapping_unmap_atomic(vaddr_atomic);
 	if (unwritten)
 		return -EFAULT;
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 6f3c603b0d67..6b58367d145e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -41,13 +41,13 @@ static inline void pagefault_enable(void)
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
 static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
-		const void __user *from, unsigned long n, unsigned long total)
+				const void __user *from, unsigned long n)
 {
 	return __copy_from_user_inatomic(to, from, n);
 }
 
 static inline unsigned long __copy_from_user_nocache(void *to,
-		const void __user *from, unsigned long n, unsigned long total)
+				const void __user *from, unsigned long n)
 {
 	return __copy_from_user(to, from, n);
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 60fd56772cc6..126d3973b3d1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1816,14 +1816,14 @@ EXPORT_SYMBOL(file_remove_suid);
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
-	size_t copied = 0, left = 0, total = bytes;
+	size_t copied = 0, left = 0;
 
 	while (bytes) {
 		char __user *buf = iov->iov_base + base;
 		int copy = min(bytes, iov->iov_len - base);
 
 		base = 0;
-		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy, total);
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
 		copied += copy;
 		bytes -= copy;
 		vaddr += copy;
@@ -1851,9 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 	if (likely(i->nr_segs == 1)) {
 		int left;
 		char __user *buf = i->iov->iov_base + i->iov_offset;
-
-		left = __copy_from_user_inatomic_nocache(kaddr + offset,
-							buf, bytes, bytes);
+		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
 		copied = bytes - left;
 	} else {
 		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
@@ -1881,8 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page,
 	if (likely(i->nr_segs == 1)) {
 		int left;
 		char __user *buf = i->iov->iov_base + i->iov_offset;
-
-		left = __copy_from_user_nocache(kaddr + offset, buf, bytes, bytes);
+		left = __copy_from_user(kaddr + offset, buf, bytes);
 		copied = bytes - left;
 	} else {
 		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index bf54f8a2cf1d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -354,7 +354,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 			break;
 
 		copied = bytes -
-			__copy_from_user_nocache(xip_mem + offset, buf, bytes, bytes);
+			__copy_from_user_nocache(xip_mem + offset, buf, bytes);
 
 		if (likely(copied > 0)) {
 			status = copied;
-- 
cgit v1.2.3-55-g7522