From bebeb3d68b24bb4132d452c5707fe321208bcbcd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:37 -0800 Subject: mm: introduce mm_populate() for populating new vmas When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags (or with MCL_FUTURE in effect), we want to populate the pages within the newly created vmas. This may take a while as we may have to read pages from disk, so ideally we want to do this outside of the write-locked mmap_sem region. This change introduces mm_populate(), which is used to defer populating such mappings until after the mmap_sem write lock has been released. This is implemented as a generalization of the former do_mlock_pages(), which accomplished the same task but was used during mlock() / mlockall(). Signed-off-by: Michel Lespinasse Reported-by: Andy Lutomirski Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d9dcc35d6a1..da0a0fe970c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1474,11 +1474,23 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + unsigned long pgoff, bool *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); +#ifdef CONFIG_MMU +extern int __mm_populate(unsigned long addr, unsigned long len, + int ignore_errors); +static inline void mm_populate(unsigned long addr, unsigned long len) +{ + /* Ignore errors */ + (void) __mm_populate(addr, len, 1); +} +#else +static inline void mm_populate(unsigned long addr, unsigned long len) {} +#endif + /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); -- cgit v1.2.3-55-g7522 From c22c0d6344c362b1dde5d8e160d3d07536aca120 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:43 -0800 Subject: mm: remove flags argument to mmap_region After the MAP_POPULATE handling has been moved to mmap_region() call sites, the only remaining use of the flags argument is to pass the MAP_NORESERVE flag. This can be just as easily handled by do_mmap_pgoff(), so do that and remove the mmap_region() flags parameter.
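For context, a minimal userspace sketch (an editorial illustration, not part of the series; the flags are standard mmap(2)) of the kind of mapping whose population this series defers until after the mmap_sem write lock is dropped:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	/* MAP_POPULATE asks the kernel to pre-fault the pages up front */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* page is already resident: no major fault expected */
	munmap(p, len);
	return 0;
}
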
[akpm@linux-foundation.org: remove double parens] Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/elf.c | 1 - include/linux/mm.h | 3 +-- mm/fremap.c | 3 +-- mm/mmap.c | 33 ++++++++++++++++----------------- 4 files changed, 18 insertions(+), 22 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 3cfa98bf9125..743c951c61b0 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, if (!retval) { unsigned long addr = MEM_USER_INTRPT; addr = mmap_region(NULL, addr, INTRPT_SIZE, - MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); if (addr > (unsigned long) -PAGE_SIZE) diff --git a/include/linux/mm.h b/include/linux/mm.h index da0a0fe970c2..8332f3069fe3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1472,8 +1472,7 @@ extern int install_special_mapping(struct mm_struct *mm, extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff); + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, bool *populate); diff --git a/mm/fremap.c b/mm/fremap.c index b42e32171530..503a72387087 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,9 +204,8 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - flags = (flags & MAP_NONBLOCK) | MAP_POPULATE; addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff); + vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/mmap.c b/mm/mmap.c index a23e30f9719c..d6bc39e939ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1291,7 +1291,21 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - addr = mmap_region(file, addr, len, flags, vm_flags, pgoff); + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. + */ + if (flags & MAP_NORESERVE) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } + + addr = mmap_region(file, addr, len, vm_flags, pgoff); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1411,8 +1425,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1435,20 +1448,6 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* - * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. 
- */ - if ((flags & MAP_NORESERVE)) { - /* We honor MAP_NORESERVE if allowed to overcommit */ - if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - - /* hugetlb applies strict overcommit unless MAP_NORESERVE */ - if (file && is_file_hugepages(file)) - vm_flags |= VM_NORESERVE; - } - /* * Private writable mapping: check memory availability */ -- cgit v1.2.3-55-g7522 From cea10a19b7972a1954c4a2d05a7de8db48b444fb Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:44 -0800 Subject: mm: directly use __mlock_vma_pages_range() in find_extend_vma() In find_extend_vma(), we don't need mlock_vma_pages_range() to verify the vma type - we know we're working with a stack. So, we can call directly into __mlock_vma_pages_range(), and remove the last make_pages_present() call site. Note that we don't use mm_populate() here, so we can't release the mmap_sem while allocating new stack pages. This is deemed acceptable, because the stack vmas grow by a bounded number of pages at a time, and these are anon pages so we don't have to read from disk to populate them. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/internal.h | 4 ++-- mm/memory.c | 24 ----------------------- mm/mlock.c | 57 +++--------------------------------------------------- mm/mmap.c | 10 ++++------ 5 files changed, 9 insertions(+), 87 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8332f3069fe3..0c34d3486816 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1035,7 +1035,6 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write); diff --git a/mm/internal.h b/mm/internal.h index 9ba21100ebf3..1c0c4cc0fcf7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index 0abd07097ec6..7837ceacf090 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3824,30 +3824,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. 
- */ - write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mlock.c b/mm/mlock.c index a296a49865df..569400a5d079 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,9 +155,8 @@ void munlock_vma_page(struct page *page) * * vma->vm_mm->mmap_sem must be held for at least read. */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int *nonblocking) +long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; @@ -202,56 +201,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -/** - * mlock_vma_pages_range() - mlock pages in specified vma range. - * @vma - the vma containing the specfied address range - * @start - starting address in @vma to mlock - * @end - end address [+1] in @vma to mlock - * - * For mmap()/mremap()/expansion of mlocked vma. - * - * return 0 on success for "normal" vmas. - * - * return number of pages [> 0] to be removed from locked_vm on success - * of "special" vmas. - */ -long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - int nr_pages = (end - start) / PAGE_SIZE; - BUG_ON(!(vma->vm_flags & VM_LOCKED)); - - /* - * filter unlockable vmas - */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - goto no_mlock; - - if (!((vma->vm_flags & VM_DONTEXPAND) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) { - - __mlock_vma_pages_range(vma, start, end, NULL); - - /* Hide errors from mmap() and other callers */ - return 0; - } - - /* - * User mapped kernel pages or huge pages: - * make these pages present to populate the ptes, but - * fall thru' to reset VM_LOCKED--no need to unlock, and - * return nr_pages so these don't get counted against task's - * locked limit. huge pages are already counted against - * locked vm limit. - */ - make_pages_present(start, end); - -no_mlock: - vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ - return nr_pages; /* error or pages NOT mlocked */ -} - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -303,7 +252,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and - * populate the ptes via make_pages_present(). + * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. 
*/ diff --git a/mm/mmap.c b/mm/mmap.c index d6bc39e939ab..8826c77513a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2204,9 +2204,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(prev, addr, prev->vm_end); - } + if (prev->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); return prev; } #else @@ -2232,9 +2231,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(vma, addr, start); - } + if (vma->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(vma, addr, start, NULL); return vma; } #endif -- cgit v1.2.3-55-g7522 From 1869305009857cdeaabe6283bcdc2359c5784543 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:46 -0800 Subject: mm: introduce VM_POPULATE flag to better deal with racy userspace programs The vm_populate() code populates user mappings without constantly holding the mmap_sem. This makes it susceptible to racy userspace programs: the user mappings may change while vm_populate() is running, and in this case vm_populate() may end up populating the new mapping instead of the old one. In order to reduce the possibility of userspace getting surprised by this behavior, this change introduces the VM_POPULATE vma flag which gets set on vmas we want vm_populate() to work on. This way vm_populate() may still end up populating the new mapping after such a race, but only if the new mapping is also one that the user has requested (using MAP_SHARED, MAP_LOCKED or mlock) to be populated. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/mman.h | 4 +++- mm/fremap.c | 12 ++++++++++-- mm/mlock.c | 19 ++++++++++--------- mm/mmap.c | 4 +--- 5 files changed, 25 insertions(+), 15 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c34d3486816..9a5fcdeaa3a0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ +#define VM_POPULATE 0x00001000 #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..61c7a87e5d2b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | + (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? 
+ VM_POPULATE : 0); } #endif /* _LINUX_MMAN_H */ diff --git a/mm/fremap.c b/mm/fremap.c index 503a72387087..0cd4c11488ed 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -204,8 +204,10 @@ get_write_lock: unsigned long addr; struct file *file = get_file(vma->vm_file); - addr = mmap_region(file, start, size, - vma->vm_flags, pgoff); + vm_flags = vma->vm_flags; + if (!(flags & MAP_NONBLOCK)) + vm_flags |= VM_POPULATE; + addr = mmap_region(file, start, size, vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -224,6 +226,12 @@ get_write_lock: mutex_unlock(&mapping->i_mmap_mutex); } + if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { + if (!has_write_lock) + goto get_write_lock; + vma->vm_flags |= VM_POPULATE; + } + if (vma->vm_flags & VM_LOCKED) { /* * drop PG_Mlocked flag for over-mapped range diff --git a/mm/mlock.c b/mm/mlock.c index 569400a5d079..d6378feb2950 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -340,9 +340,9 @@ static int do_mlock(unsigned long start, size_t len, int on) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vma->vm_flags | VM_LOCKED; - if (!on) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (on) + newflags |= VM_LOCKED | VM_POPULATE; tmp = vma->vm_end; if (tmp > end) @@ -402,7 +402,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != + VM_POPULATE) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; @@ -475,18 +476,18 @@ static int do_mlockall(int flags) struct vm_area_struct * vma, * prev = NULL; if (flags & MCL_FUTURE) - current->mm->def_flags |= VM_LOCKED; + current->mm->def_flags |= VM_LOCKED | VM_POPULATE; else - current->mm->def_flags &= ~VM_LOCKED; + current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); if (flags == MCL_FUTURE) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags | VM_LOCKED; - if (!(flags & MCL_CURRENT)) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (flags & MCL_CURRENT) + newflags |= VM_LOCKED | VM_POPULATE; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); diff --git a/mm/mmap.c b/mm/mmap.c index 8826c77513a9..39a3944e1658 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1306,9 +1306,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } addr = mmap_region(file, addr, len, vm_flags, pgoff); - if (!IS_ERR_VALUE(addr) && - ((vm_flags & VM_LOCKED) || - (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) *populate = true; return addr; } -- cgit v1.2.3-55-g7522 From 41badc15cbad0350de34408c1b0c690f9df76d4b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:47 -0800 Subject: mm: make do_mmap_pgoff return populate as a size in bytes, not as a bool do_mmap_pgoff() rounds up the desired size to the next PAGE_SIZE multiple, however there was no equivalent code in mm_populate(), which caused issues. This could be fixed by introducing the same rounding in mm_populate(), however I think it's preferable to make do_mmap_pgoff() return populate as a size rather than as a boolean, so we don't have to duplicate the size rounding logic in mm_populate().
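To see the rounding from userspace, a hedged sketch (editorial, not from the patch): a 100-byte request is rounded up to a full page, so the populate length handed to mm_populate() must be page-rounded as well; mincore(2) reports the whole backing page as resident:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	unsigned char vec;
	char *p = mmap(NULL, 100, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	/* one vec byte per page; the 100-byte request occupies one page */
	if (mincore(p, page, &vec) == 0)
		printf("backing page resident: %s\n", (vec & 1) ? "yes" : "no");
	munmap(p, page);
	return 0;
}
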
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 5 ++--- include/linux/mm.h | 2 +- ipc/shm.c | 4 ++-- mm/mmap.c | 6 +++--- mm/nommu.c | 4 ++-- mm/util.c | 6 +++--- 6 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/aio.c b/fs/aio.c index 82eec7c7b4bb..064bfbe37566 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -101,9 +101,8 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; - unsigned long size; + unsigned long size, populate; int nr_pages; - bool populate; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ @@ -150,7 +149,7 @@ static int aio_setup_ring(struct kioctx *ctx) return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, info->mmap_size); + mm_populate(info->mmap_base, populate); ctx->user_id = info->mmap_base; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a5fcdeaa3a0..95db68e34b18 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1475,7 +1475,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, bool *populate); + unsigned long pgoff, unsigned long *populate); extern int do_munmap(struct mm_struct *, unsigned long, size_t); #ifdef CONFIG_MMU diff --git a/ipc/shm.c b/ipc/shm.c index 9f047ba69e62..be3ec9ae454e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -971,7 +971,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, struct shm_file_data *sfd; struct path path; fmode_t f_mode; - bool populate = false; + unsigned long populate = 0; err = -EINVAL; if (shmid < 0) @@ -1078,7 +1078,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, invalid: up_write(¤t->mm->mmap_sem); if (populate) - mm_populate(addr, size); + mm_populate(addr, populate); out_fput: fput(file); diff --git a/mm/mmap.c b/mm/mmap.c index 39a3944e1658..44bb4d869884 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1163,13 +1163,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint) unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, - bool *populate) + unsigned long *populate) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; - *populate = false; + *populate = 0; /* * Does the application expect PROT_READ to imply PROT_EXEC? 
@@ -1307,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, addr = mmap_region(file, addr, len, vm_flags, pgoff); if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) - *populate = true; + *populate = len; return addr; } diff --git a/mm/nommu.c b/mm/nommu.c index 7296a5a280e7..18c1b932e2c4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1251,7 +1251,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long prot, unsigned long flags, unsigned long pgoff, - bool *populate) + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; @@ -1261,7 +1261,7 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - *populate = false; + *populate = 0; /* decide whether we should attempt the mapping, and if so what sort of * mapping */ diff --git a/mm/util.c b/mm/util.c index 13467e043e9e..3704bf1bef94 100644 --- a/mm/util.c +++ b/mm/util.c @@ -355,7 +355,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, { unsigned long ret; struct mm_struct *mm = current->mm; - bool populate; + unsigned long populate; ret = security_mmap_file(file, prot, flag); if (!ret) { @@ -363,8 +363,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); - if (!IS_ERR_VALUE(ret) && populate) - mm_populate(ret, len); + if (populate) + mm_populate(ret, populate); } return ret; } -- cgit v1.2.3-55-g7522 From 46723bfa540f0a1e494476a1734d03626a0bd1e0 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Fri, 22 Feb 2013 16:33:00 -0800 Subject: memory-hotplug: implement register_page_bootmem_info_section of sparse-vmemmap To remove a memmap region of sparse-vmemmap that was allocated from bootmem, the region needs to be registered by get_page_bootmem(). So the patch searches the pages of the virtual mapping and registers them by get_page_bootmem(). NOTE: register_page_bootmem_memmap() is not implemented for ia64, ppc, s390, and sparc. So introduce CONFIG_HAVE_BOOTMEM_INFO_NODE and revert register_page_bootmem_info_node() when the platform doesn't support it. It's implemented by adding a new Kconfig option named CONFIG_HAVE_BOOTMEM_INFO_NODE, which will be automatically selected by archs that fully support the memory-hotplug feature (currently only x86_64). Since we have two config options, MEMORY_HOTPLUG and MEMORY_HOTREMOVE, used for memory hot-add and hot-remove separately, and the code in register_page_bootmem_info_node() is only used for collecting information for hot-remove, it now resides under MEMORY_HOTREMOVE. Likewise, page_isolation.c, selected by MEMORY_ISOLATION under MEMORY_HOTPLUG, is a similar case, so move it too. [mhocko@suse.cz: put register_page_bootmem_memmap inside CONFIG_MEMORY_HOTPLUG_SPARSE] [linfeng@cn.fujitsu.com: introduce CONFIG_HAVE_BOOTMEM_INFO_NODE and revert register_page_bootmem_info_node()] [mhocko@suse.cz: remove the arch specific functions without any implementation] [linfeng@cn.fujitsu.com: mm/Kconfig: move auto selects from MEMORY_HOTPLUG to MEMORY_HOTREMOVE as needed] [rientjes@google.com: fix defined but not used warning] Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Tang Chen Reviewed-by: Wu Jianguo Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H.
Peter Anvin" Signed-off-by: Michal Hocko Signed-off-by: Lin Feng Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/discontig.c | 1 + arch/powerpc/mm/init_64.c | 1 + arch/sparc/mm/init_64.c | 1 + arch/x86/mm/init_64.c | 60 ++++++++++++++++++++++++++++++++++++++++++ include/linux/memory_hotplug.h | 13 +++++---- include/linux/mm.h | 3 ++- mm/Kconfig | 10 ++++++- mm/memory_hotplug.c | 35 +++++++++++++++++++++--- 8 files changed, 111 insertions(+), 13 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c641333cd997..731bf84094b6 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -822,4 +822,5 @@ int __meminit vmemmap_populate(struct page *start_page, { return vmemmap_populate_basepages(start_page, size, node); } + #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 95a45293e5ac..42bf082f0124 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,6 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5c2c6e61facb..59c6fcfdc782 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2235,6 +2235,7 @@ void __meminit vmemmap_populate_print_last(void) node_start = 0; } } + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ static void prot_init_common(unsigned long page_none, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b6dd1c480b30..f17aa76dc1ae 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1034,6 +1034,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) return 0; } +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + void __meminit vmemmap_populate_print_last(void) { if (p_start) { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31a563bbd936..4d523fe75ba1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -174,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, 
pg_data_t *pgdat) #endif /* CONFIG_NUMA */ #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ -#ifdef CONFIG_SPARSEMEM_VMEMMAP +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +extern void register_page_bootmem_info_node(struct pglist_data *pgdat); +#else static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } -static inline void put_page_bootmem(struct page *page) -{ -} -#else -extern void register_page_bootmem_info_node(struct pglist_data *pgdat); -extern void put_page_bootmem(struct page *page); #endif +extern void put_page_bootmem(struct page *page); +extern void get_page_bootmem(unsigned long ingo, struct page *page, + unsigned long type); /* * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug diff --git a/include/linux/mm.h b/include/linux/mm.h index 95db68e34b18..060557b9764f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1718,7 +1718,8 @@ int vmemmap_populate_basepages(struct page *start_page, unsigned long pages, int node); int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); - +void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, + unsigned long size); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, diff --git a/mm/Kconfig b/mm/Kconfig index 0b23db9a8791..2c7aea7106f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -162,10 +162,16 @@ config MOVABLE_NODE Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. +# +# Only be set on architectures that have completely implemented memory hotplug +# feature. If you are not sure, don't touch it. +# +config HAVE_BOOTMEM_INFO_NODE + def_bool n + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - select MEMORY_ISOLATION depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) @@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" + select MEMORY_ISOLATION + select HAVE_BOOTMEM_INFO_NODE if X86_64 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 942b43f6d736..6c90d222ec0a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -91,9 +91,8 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { page->lru.next = (struct list_head *) type; SetPagePrivate(page); @@ -128,6 +127,8 @@ void __ref put_page_bootmem(struct page *page) } +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +#ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; @@ -161,6 +162,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + 
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void register_page_bootmem_info_node(struct pglist_data *pgdat) { @@ -203,7 +230,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) register_page_bootmem_info_section(pfn); } } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) -- cgit v1.2.3-55-g7522 From 0197518cd3672029618a16a57597946a094ac7a8 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:08 -0800 Subject: memory-hotplug: remove memmap of sparse-vmemmap Introduce a new API vmemmap_free() to free and remove vmemmap pagetables. Since pagetable implementations differ, each architecture has to provide its own version of vmemmap_free(), just like vmemmap_populate(). Note: vmemmap_free() is not implemented for ia64, ppc, s390, and sparc. [mhocko@suse.cz: fix implicit declaration of remove_pagetable] Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Jianguo Wu Signed-off-by: Wen Congyang Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 3 +++ arch/ia64/mm/discontig.c | 3 +++ arch/powerpc/mm/init_64.c | 4 ++++ arch/s390/mm/vmem.c | 4 ++++ arch/sparc/mm/init_64.c | 4 ++++ arch/x86/mm/init_64.c | 8 ++++++++ include/linux/mm.h | 3 +++ mm/sparse.c | 3 ++- 8 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4dd585898c5..224b44ab534e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } #endif /* CONFIG_ARM64_64K_PAGES */ +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 731bf84094b6..652fee097921 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -823,4 +823,7 @@ int __meminit vmemmap_populate(struct page *start_page, return vmemmap_populate_basepages(start_page, size, node); } +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 42bf082f0124..7e2246fb2f31 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -298,5 +298,9 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 79699f46a443..e21aaf4f5cb6 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -268,6 +268,10 @@ out: return ret; } +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + /* * Add memory segment to the segment list if it doesn't overlap with * an already present segment.
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 59c6fcfdc782..1588d33d5492 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2236,6 +2236,10 @@ void __meminit vmemmap_populate_print_last(void) } } +void vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ +} + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ static void prot_init_common(unsigned long page_none, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 4e58b83e6b2e..474e28f10815 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1011,6 +1011,14 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) flush_tlb_all(); } +void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + nr_pages); + + remove_pagetable(start, end, false); +} + static void __meminit kernel_physical_mapping_remove(unsigned long start, unsigned long end) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 060557b9764f..9d659491c0ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1718,6 +1718,9 @@ int vmemmap_populate_basepages(struct page *start_page, unsigned long pages, int node); int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_free(struct page *memmap, unsigned long nr_pages); +#endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long size); diff --git a/mm/sparse.c b/mm/sparse.c index 66f0fd9d7964..46f6ea47d9ab 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, } static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - return; /* XXX: Not implemented yet */ + vmemmap_free(memmap, nr_pages); } static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { + vmemmap_free(memmap, nr_pages); } #else static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -- cgit v1.2.3-55-g7522 From 34b71f1e04fcba578e719e675b4882eeeb2a1f6f Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:37 -0800 Subject: page_alloc: add movable_memmap kernel parameter Add functions to parse the movablemem_map boot option. Since the option could be specified more than once, all the maps will be stored in the global variable movablemem_map.map array. We also keep the array in monotonically increasing order by start_pfn, and merge all overlapping ranges. [akpm@linux-foundation.org: improve comment] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: remove unneeded parens] Signed-off-by: Tang Chen Signed-off-by: Lai Jiangshan Reviewed-by: Wen Congyang Tested-by: Lin Feng Cc: Wu Jianguo Cc: Mel Gorman Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 17 +++++ include/linux/mm.h | 11 +++ mm/page_alloc.c | 131 ++++++++++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+) (limited to 'include/linux/mm.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9aa8ff3e54dc..722a74161246 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1640,6 +1640,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that the amount of memory usable for all allocations is not too small. + movablemem_map=nn[KMG]@ss[KMG] + [KNL,X86,IA-64,PPC] This parameter is similar to + memmap except it specifies the memory map of + ZONE_MOVABLE. + If more areas are all within one node, then from + lowest ss to the end of the node will be ZONE_MOVABLE. + If an area covers two or more nodes, the area from + ss to the end of the 1st node will be ZONE_MOVABLE, + and all the rest nodes will only have ZONE_MOVABLE. + If memmap is specified at the same time, the + movablemem_map will be limited within the memmap + areas. If kernelcore or movablecore is also specified, + movablemem_map will have higher priority to be + satisfied. So the administrator should be careful that + the amount of movablemem_map areas are not too large. + Otherwise kernel won't have enough memory to start. + MTD_Partition= [MTD] Format: ,,, diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d659491c0ae..ce9bd3049836 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1359,6 +1359,17 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); +#define MOVABLEMEM_MAP_MAX MAX_NUMNODES +struct movablemem_entry { + unsigned long start_pfn; /* start pfn of memory segment */ + unsigned long end_pfn; /* end pfn of memory segment (exclusive) */ +}; + +struct movablemem_map { + int nr_map; + struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; +}; + #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 703944809666..aa1cc5fe9904 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -202,6 +202,9 @@ static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +/* Movable memory ranges, will also be used by memblock subsystem. */ +struct movablemem_map movablemem_map; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; @@ -5078,6 +5081,134 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); +/** + * insert_movablemem_map - Insert a memory range in to movablemem_map.map. + * @start_pfn: start pfn of the range + * @end_pfn: end pfn of the range + * + * This function will also merge the overlapped ranges, and sort the array + * by start_pfn in monotonic increasing order. + */ +static void __init insert_movablemem_map(unsigned long start_pfn, + unsigned long end_pfn) +{ + int pos, overlap; + + /* + * pos will be at the 1st overlapped range, or the position + * where the element should be inserted. + */ + for (pos = 0; pos < movablemem_map.nr_map; pos++) + if (start_pfn <= movablemem_map.map[pos].end_pfn) + break; + + /* If there is no overlapped range, just insert the element. */ + if (pos == movablemem_map.nr_map || + end_pfn < movablemem_map.map[pos].start_pfn) { + /* + * If pos is not the end of array, we need to move all + * the rest elements backward. 
+ */ + if (pos < movablemem_map.nr_map) + memmove(&movablemem_map.map[pos+1], + &movablemem_map.map[pos], + sizeof(struct movablemem_entry) * + (movablemem_map.nr_map - pos)); + movablemem_map.map[pos].start_pfn = start_pfn; + movablemem_map.map[pos].end_pfn = end_pfn; + movablemem_map.nr_map++; + return; + } + + /* overlap will be at the last overlapped range */ + for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) + if (end_pfn < movablemem_map.map[overlap].start_pfn) + break; + + /* + * If there are more ranges overlapped, we need to merge them, + * and move the rest elements forward. + */ + overlap--; + movablemem_map.map[pos].start_pfn = min(start_pfn, + movablemem_map.map[pos].start_pfn); + movablemem_map.map[pos].end_pfn = max(end_pfn, + movablemem_map.map[overlap].end_pfn); + + if (pos != overlap && overlap + 1 != movablemem_map.nr_map) + memmove(&movablemem_map.map[pos+1], + &movablemem_map.map[overlap+1], + sizeof(struct movablemem_entry) * + (movablemem_map.nr_map - overlap - 1)); + + movablemem_map.nr_map -= overlap - pos; +} + +/** + * movablemem_map_add_region - Add a memory range into movablemem_map. + * @start: physical start address of range + * @end: physical end address of range + * + * This function transform the physical address into pfn, and then add the + * range into movablemem_map by calling insert_movablemem_map(). + */ +static void __init movablemem_map_add_region(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + /* In case size == 0 or start + size overflows */ + if (start + size <= start) + return; + + if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { + pr_err("movablemem_map: too many entries;" + " ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); + return; + } + + start_pfn = PFN_DOWN(start); + end_pfn = PFN_UP(start + size); + insert_movablemem_map(start_pfn, end_pfn); +} + +/* + * cmdline_parse_movablemem_map - Parse boot option movablemem_map. + * @p: The boot option of the following format: + * movablemem_map=nn[KMG]@ss[KMG] + * + * This option sets the memory range [ss, ss+nn) to be used as movable memory. + * + * Return: 0 on success or -EINVAL on failure. + */ +static int __init cmdline_parse_movablemem_map(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + goto err; + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + goto err; + + if (*p == '@') { + oldp = ++p; + start_at = memparse(p, &p); + if (p == oldp || *p != '\0') + goto err; + + movablemem_map_add_region(start_at, mem_size); + return 0; + } +err: + return -EINVAL; +} +early_param("movablemem_map", cmdline_parse_movablemem_map); + #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** -- cgit v1.2.3-55-g7522 From 27168d38fa209073219abedbe6a9de7ba9acbfad Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:46 -0800 Subject: acpi, memory-hotplug: extend movablemem_map ranges to the end of node When implementing the movablemem_map boot option, we introduced an array movablemem_map.map[] to store the memory ranges to be set as ZONE_MOVABLE. Since ZONE_MOVABLE is the last zone of a node, if the user didn't specify the whole node memory range, we need to extend it to the node end so that we can use it to prevent memblock from allocating memory in the ranges the user didn't specify. We now implement the movablemem_map boot option like this: /* * For movablemem_map=nn[KMG]@ss[KMG]: * * SRAT: |_____| |_____| |_________| |_________| ......
* node id: 0 1 1 2 * user specified: |__| |___| * movablemem_map: |___| |_________| |______| ...... * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. * * NOTE: In this case, SRAT info will be ingored. */ [akpm@linux-foundation.org: clean up code, fix build warning] Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/srat.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/mm.h | 5 +++++ mm/page_alloc.c | 34 +++++++++++++++++++++++++++-- 3 files changed, 98 insertions(+), 5 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd530..3e90039e52e0 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -141,11 +141,65 @@ static inline int save_add_info(void) {return 1;} static inline int save_add_info(void) {return 0;} #endif +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static void __init handle_movablemem(int node, u64 start, u64 end) +{ + int overlap; + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = PFN_UP(end); + + /* + * For movablecore_map=nn[KMG]@ss[KMG]: + * + * SRAT: |_____| |_____| |_________| |_________| ...... + * node id: 0 1 1 2 + * user specified: |__| |___| + * movablemem_map: |___| |_________| |______| ...... + * + * Using movablemem_map, we can prevent memblock from allocating memory + * on ZONE_MOVABLE at boot time. + */ + overlap = movablemem_map_overlap(start_pfn, end_pfn); + if (overlap >= 0) { + /* + * If part of this range is in movablemem_map, we need to + * add the range after it to extend the range to the end + * of the node, because from the min address specified to + * the end of the node will be ZONE_MOVABLE. + */ + start_pfn = max(start_pfn, + movablemem_map.map[overlap].start_pfn); + insert_movablemem_map(start_pfn, end_pfn); + + /* + * Set the nodemask, so that if the address range on one node + * is not continuse, we can add the subsequent ranges on the + * same node into movablemem_map. + */ + node_set(node, movablemem_map.numa_nodes_hotplug); + } else { + if (node_isset(node, movablemem_map.numa_nodes_hotplug)) + /* + * Insert the range if we already have movable ranges + * on the same node. 
+ */ + insert_movablemem_map(start_pfn, end_pfn); + } +} +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +static inline void handle_movablemem(int node, u64 start, u64 end) +{ +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) @@ -154,7 +208,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; @@ -174,9 +229,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", node, pxm, - (unsigned long long) start, (unsigned long long) end - 1); + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? "Hot Pluggable": ""); + + handle_movablemem(node, start, end); return 0; out_err_bad_srat: diff --git a/include/linux/mm.h b/include/linux/mm.h index ce9bd3049836..4d7377a1d084 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1368,8 +1368,13 @@ struct movablemem_entry { struct movablemem_map { int nr_map; struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; + nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ }; +extern void __init insert_movablemem_map(unsigned long start_pfn, + unsigned long end_pfn); +extern int __init movablemem_map_overlap(unsigned long start_pfn, + unsigned long end_pfn); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88b9962c99b3..7ea9a003ad57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5175,6 +5175,36 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); +/** + * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. + * @start_pfn: start pfn of the range to be checked + * @end_pfn: end pfn of the range to be checked (exclusive) + * + * This function checks if a given memory range [start_pfn, end_pfn) overlaps + * the movablemem_map.map[] array. + * + * Return: index of the first overlapped element in movablemem_map.map[] + * or -1 if they don't overlap each other. + */ +int __init movablemem_map_overlap(unsigned long start_pfn, + unsigned long end_pfn) +{ + int overlap; + + if (!movablemem_map.nr_map) + return -1; + + for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) + if (start_pfn < movablemem_map.map[overlap].end_pfn) + break; + + if (overlap == movablemem_map.nr_map || + end_pfn <= movablemem_map.map[overlap].start_pfn) + return -1; + + return overlap; +} + /** * insert_movablemem_map - Insert a memory range in to movablemem_map.map. * @start_pfn: start pfn of the range @@ -5183,8 +5213,8 @@ early_param("movablecore", cmdline_parse_movablecore); * This function will also merge the overlapped ranges, and sort the array * by start_pfn in monotonic increasing order. 
*/ -static void __init insert_movablemem_map(unsigned long start_pfn, - unsigned long end_pfn) +void __init insert_movablemem_map(unsigned long start_pfn, + unsigned long end_pfn) { int pos, overlap; -- cgit v1.2.3-55-g7522 From 01a178a94e8eaec351b29ee49fbb3d1c124cb7fb Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 22 Feb 2013 16:33:49 -0800 Subject: acpi, memory-hotplug: support getting hotplug info from SRAT We now provide an option for users who don't want to specify a physical memory address on the kernel command line. /* * For movablemem_map=acpi: * * SRAT: |_____| |_____| |_________| |_________| ...... * node id: 0 1 1 2 * hotpluggable: n y y n * movablemem_map: |_____| |_________| * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. */ So the user just specifies movablemem_map=acpi, and the kernel will use the hotpluggable info in SRAT to determine which memory ranges should be set as ZONE_MOVABLE. If all the memory ranges in SRAT are hotpluggable, then no memory can be used by the kernel. But before parsing SRAT, memblock has already reserved some memory ranges for other purposes, such as the kernel image. We cannot prevent the kernel from using this memory, so we need to exclude these ranges even if the memory is hotpluggable. Furthermore, there could be several memory ranges in the single node which the kernel resides in. We may skip one range that has memory reserved by memblock, but if the rest of the memory is too small, then the kernel will fail to boot. So, make the whole node the kernel resides in un-hotpluggable; then the kernel has enough memory to use. NOTE: Using this option will degrade NUMA performance because the whole node will be set as ZONE_MOVABLE, and the kernel cannot use memory on it. Users who don't want to lose NUMA performance should simply not use it. [akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: use strcmp()] Signed-off-by: Tang Chen Cc: KOSAKI Motohiro Cc: Jiang Liu Cc: Jianguo Wu Cc: Kamezawa Hiroyuki Cc: Lai Jiangshan Cc: Wu Jianguo Cc: Yasuaki Ishimatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 29 ++++++++++++--- arch/x86/mm/srat.c | 71 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/mm.h | 2 ++ mm/page_alloc.c | 22 +++++++++++++- 4 files changed, 113 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 722a74161246..766087781ecd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1640,15 +1640,30 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. + movablemem_map=acpi + [KNL,X86,IA-64,PPC] This parameter is similar to + memmap except it specifies the memory map of + ZONE_MOVABLE. + This option inform the kernel to use Hot Pluggable bit + in flags from SRAT from ACPI BIOS to determine which + memory devices could be hotplugged. The corresponding + memory ranges will be set as ZONE_MOVABLE. + NOTE: Whatever node the kernel resides in will always + be un-hotpluggable. + movablemem_map=nn[KMG]@ss[KMG] [KNL,X86,IA-64,PPC] This parameter is similar to memmap except it specifies the memory map of ZONE_MOVABLE. - If more areas are all within one node, then from - lowest ss to the end of the node will be ZONE_MOVABLE.
- If an area covers two or more nodes, the area from - ss to the end of the 1st node will be ZONE_MOVABLE, - and all the rest nodes will only have ZONE_MOVABLE. + If user specifies memory ranges, the info in SRAT will + be ingored. And it works like the following: + - If more ranges are all within one node, then from + lowest ss to the end of the node will be ZONE_MOVABLE. + - If a range is within a node, then from ss to the end + of the node will be ZONE_MOVABLE. + - If a range covers two or more nodes, then from ss to + the end of the 1st node will be ZONE_MOVABLE, and all + the rest nodes will only have ZONE_MOVABLE. If memmap is specified at the same time, the movablemem_map will be limited within the memmap areas. If kernelcore or movablecore is also specified, @@ -1656,6 +1671,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. satisfied. So the administrator should be careful that the amount of movablemem_map areas are not too large. Otherwise kernel won't have enough memory to start. + NOTE: We don't stop users specifying the node the + kernel resides in as hotpluggable so that this + option can be used as a workaround of firmware + bugs. MTD_Partition= [MTD] Format: ,,, diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 3e90039e52e0..79836d01f789 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -142,16 +142,72 @@ static inline int save_add_info(void) {return 0;} #endif #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static void __init handle_movablemem(int node, u64 start, u64 end) +static void __init +handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) { - int overlap; + int overlap, i; unsigned long start_pfn, end_pfn; start_pfn = PFN_DOWN(start); end_pfn = PFN_UP(end); /* - * For movablecore_map=nn[KMG]@ss[KMG]: + * For movablemem_map=acpi: + * + * SRAT: |_____| |_____| |_________| |_________| ...... + * node id: 0 1 1 2 + * hotpluggable: n y y n + * movablemem_map: |_____| |_________| + * + * Using movablemem_map, we can prevent memblock from allocating memory + * on ZONE_MOVABLE at boot time. + * + * Before parsing SRAT, memblock has already reserve some memory ranges + * for other purposes, such as for kernel image. We cannot prevent + * kernel from using these memory, so we need to exclude these memory + * even if it is hotpluggable. + * Furthermore, to ensure the kernel has enough memory to boot, we make + * all the memory on the node which the kernel resides in + * un-hotpluggable. + */ + if (hotpluggable && movablemem_map.acpi) { + /* Exclude ranges reserved by memblock. */ + struct memblock_type *rgn = &memblock.reserved; + + for (i = 0; i < rgn->cnt; i++) { + if (end <= rgn->regions[i].base || + start >= rgn->regions[i].base + + rgn->regions[i].size) + continue; + + /* + * If the memory range overlaps the memory reserved by + * memblock, then the kernel resides in this node. + */ + node_set(node, movablemem_map.numa_nodes_kernel); + + goto out; + } + + /* + * If the kernel resides in this node, then the whole node + * should not be hotpluggable. + */ + if (node_isset(node, movablemem_map.numa_nodes_kernel)) + goto out; + + insert_movablemem_map(start_pfn, end_pfn); + + /* + * numa_nodes_hotplug nodemask represents which nodes are put + * into movablemem_map.map[]. + */ + node_set(node, movablemem_map.numa_nodes_hotplug); + goto out; + } + + /* + * For movablemem_map=nn[KMG]@ss[KMG]: * * SRAT: |_____| |_____| |_________| |_________| ...... 
* node id: 0 1 1 2 @@ -160,6 +216,8 @@ static void __init handle_movablemem(int node, u64 start, u64 end) * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. + * + * NOTE: In this case, SRAT info will be ingored. */ overlap = movablemem_map_overlap(start_pfn, end_pfn); if (overlap >= 0) { @@ -187,9 +245,12 @@ static void __init handle_movablemem(int node, u64 start, u64 end) */ insert_movablemem_map(start_pfn, end_pfn); } +out: + return; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline void handle_movablemem(int node, u64 start, u64 end) +static inline void +handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) { } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -234,7 +295,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) (unsigned long long) start, (unsigned long long) end - 1, hotpluggable ? "Hot Pluggable": ""); - handle_movablemem(node, start, end); + handle_movablemem(node, start, end, hotpluggable); return 0; out_err_bad_srat: diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7377a1d084..72a42c0fa633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1366,9 +1366,11 @@ struct movablemem_entry { }; struct movablemem_map { + bool acpi; /* true if using SRAT info */ int nr_map; struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ + nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */ }; extern void __init insert_movablemem_map(unsigned long start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ea9a003ad57..a7381be21320 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -203,7 +203,10 @@ static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* Movable memory ranges, will also be used by memblock subsystem. */ -struct movablemem_map movablemem_map; +struct movablemem_map movablemem_map = { + .acpi = false, + .nr_map = 0, +}; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; @@ -5314,6 +5317,23 @@ static int __init cmdline_parse_movablemem_map(char *p) if (!p) goto err; + if (!strcmp(p, "acpi")) + movablemem_map.acpi = true; + + /* + * If user decide to use info from BIOS, all the other user specified + * ranges will be ingored. + */ + if (movablemem_map.acpi) { + if (movablemem_map.nr_map) { + memset(movablemem_map.map, 0, + sizeof(struct movablemem_entry) + * movablemem_map.nr_map); + movablemem_map.nr_map = 0; + } + return 0; + } + oldp = p; mem_size = memparse(p, &p); if (p == oldp) -- cgit v1.2.3-55-g7522 From 293c07e31ab5a0b8df8c19b2a9e5c6fa30308849 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 22 Feb 2013 16:34:02 -0800 Subject: memory-failure: use num_poisoned_pages instead of mce_bad_pages Since MCE is an x86 concept, and this code is in mm/, it would be better to use the name num_poisoned_pages instead of mce_bad_pages. 
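The counter is what /proc/meminfo reports as HardwareCorrupted; the meminfo hunk below prints it in kilobytes. A minimal standalone sketch of that conversion follows — the 4 KiB page size and the sample count are assumptions for illustration, not values from this patch:

    #include <stdio.h>

    #define PAGE_SHIFT 12                   /* assumed 4 KiB pages */

    int main(void)
    {
            long num_poisoned_pages = 3;    /* illustrative sample count */

            /* pages -> kB, as in the meminfo hunk: count << (PAGE_SHIFT - 10) */
            printf("HardwareCorrupted: %8ld kB\n",
                   num_poisoned_pages << (PAGE_SHIFT - 10));   /* prints 12 */
            return 0;
    }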
[akpm@linux-foundation.org: fix mm/sparse.c] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Suggested-by: Borislav Petkov Reviewed-by: Wanpeng Li Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/mm.h | 2 +- mm/memory-failure.c | 16 ++++++++-------- mm/sparse.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux/mm.h') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645f7990..c3dac611c3c0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) + ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * diff --git a/include/linux/mm.h b/include/linux/mm.h index 72a42c0fa633..a114b8eb7676 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1753,7 +1753,7 @@ extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -extern atomic_long_t mce_bad_pages; +extern atomic_long_t num_poisoned_pages; extern int soft_offline_page(struct page *page, int flags); extern void dump_page(struct page *page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f4c9fa1149e2..c95e19af510b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -1040,7 +1040,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } nr_pages = 1 << compound_trans_order(hpage); - atomic_long_add(nr_pages, &mce_bad_pages); + atomic_long_add(nr_pages, &num_poisoned_pages); /* * We need/can do nothing about count=0 pages. 
@@ -1070,7 +1070,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); return 0; } set_page_hwpoison_huge_page(hpage); @@ -1128,7 +1128,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); unlock_page(hpage); put_page(hpage); return 0; @@ -1323,7 +1323,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1337,7 +1337,7 @@ int unpoison_memory(unsigned long pfn) */ if (TestClearPageHWPoison(page)) { pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); freeit = 1; if (PageHuge(page)) clear_page_hwpoison_huge_page(page); @@ -1442,7 +1442,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: /* keep elevated page count for bad page */ - atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &num_poisoned_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); out: @@ -1585,7 +1585,7 @@ int soft_offline_page(struct page *page, int flags) done: /* keep elevated page count for bad page */ - atomic_long_inc(&mce_bad_pages); + atomic_long_inc(&num_poisoned_pages); SetPageHWPoison(page); out: return ret; diff --git a/mm/sparse.c b/mm/sparse.c index cff97960f1d7..7ca6dc847947 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -783,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < PAGES_PER_SECTION; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &mce_bad_pages); + atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); } } -- cgit v1.2.3-55-g7522 From bbeae5b05ef6e40bf54db05ceb8635824153b9e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Feb 2013 16:34:30 -0800 Subject: mm: move page flags layout to separate header This is a preparation patch for moving page->_last_nid into page->flags that moves page flag layout information to a separate header. This patch is necessary because otherwise there would be a circular dependency between mm_types.h and mm.h. Signed-off-by: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 40 ---------------------- include/linux/mm_types.h | 1 + include/linux/mmzone.h | 22 +----------- include/linux/page-flags-layout.h | 71 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 61 deletions(-) create mode 100644 include/linux/page-flags-layout.h (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index a114b8eb7676..c2d7d5993b14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,51 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. 
*/ - -/* - * page->flags layout: - * - * There are three possibilities for how page->flags get - * laid out. The first is for the normal case, without - * sparsemem. The second is for sparsemem when there is - * plenty of space for node and section. The last is when - * we have run out of space and have to fall back to an - * alternate (slower) way of determining the node. - * - * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | - */ -#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -#define SECTIONS_WIDTH SECTIONS_SHIFT -#else -#define SECTIONS_WIDTH 0 -#endif - -#define ZONES_WIDTH ZONES_SHIFT - -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define NODES_WIDTH NODES_SHIFT -#else -#ifdef CONFIG_SPARSEMEM_VMEMMAP -#error "Vmemmap: No space for nodes field in page flags" -#endif -#define NODES_WIDTH 0 -#endif - /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -/* - * We are going to use the flags for the page to node mapping if its in - * there. This includes the case where there is no node, so it is implicit. - */ -#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) -#define NODE_NOT_IN_PAGE_FLAGS -#endif - /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 47047cb4d8e5..d05d632c716e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4f4c8c26fa9d..6c80d0ac14dd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -310,24 +310,6 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H -/* - * When a memory allocation must conform to specific limitations (such - * as being suitable for DMA) the caller will pass in hints to the - * allocator in the gfp_mask, in the zone modifier bits. These bits - * are used to select a priority ordered list of memory zones which - * match the requested limits. 
See gfp_zone() in include/linux/gfp.h - */ - -#if MAX_NR_ZONES < 2 -#define ZONES_SHIFT 0 -#elif MAX_NR_ZONES <= 2 -#define ZONES_SHIFT 1 -#elif MAX_NR_ZONES <= 4 -#define ZONES_SHIFT 2 -#else -#error ZONES_SHIFT -- too many zones configured adjust calculation -#endif - struct zone { /* Fields commonly accessed by the page allocator */ @@ -1055,8 +1037,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ -#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) - #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h new file mode 100644 index 000000000000..316805d6ba1b --- /dev/null +++ b/include/linux/page-flags-layout.h @@ -0,0 +1,71 @@ +#ifndef PAGE_FLAGS_LAYOUT_H +#define PAGE_FLAGS_LAYOUT_H + +#include +#include + +/* + * When a memory allocation must conform to specific limitations (such + * as being suitable for DMA) the caller will pass in hints to the + * allocator in the gfp_mask, in the zone modifier bits. These bits + * are used to select a priority ordered list of memory zones which + * match the requested limits. See gfp_zone() in include/linux/gfp.h + */ +#if MAX_NR_ZONES < 2 +#define ZONES_SHIFT 0 +#elif MAX_NR_ZONES <= 2 +#define ZONES_SHIFT 1 +#elif MAX_NR_ZONES <= 4 +#define ZONES_SHIFT 2 +#else +#error ZONES_SHIFT -- too many zones configured adjust calculation +#endif + +#ifdef CONFIG_SPARSEMEM +#include + +/* SECTION_SHIFT #bits space required to store a section # */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#endif /* CONFIG_SPARSEMEM */ + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | + * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define NODES_WIDTH NODES_SHIFT +#else +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#error "Vmemmap: No space for nodes field in page flags" +#endif +#define NODES_WIDTH 0 +#endif + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) +#define NODE_NOT_IN_PAGE_FLAGS +#endif + +#endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3-55-g7522 From 75980e97daccfc6babbac7e180ff118537955f5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Feb 2013 16:34:32 -0800 Subject: mm: fold page->_last_nid into page->flags where possible page->_last_nid fits into page->flags on 64-bit. The unlikely 32-bit NUMA configuration with NUMA Balancing will still need an extra page field. 
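As a rough bit-budget check of why it fits (a sketch only — every width below is an assumption for a typical 64-bit sparsemem-vmemmap config, not a value from this patch; the authoritative test is the BITS_PER_LONG - NR_PAGEFLAGS comparison in page-flags-layout.h):

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative widths; all of these are config-dependent assumptions. */
            int bits_per_long = 64, nr_pageflags = 22;
            int sections = 0;               /* sparsemem-vmemmap needs no section bits */
            int nodes = 10, zones = 2;
            int last_nid = nodes;           /* LAST_NID_SHIFT defaults to NODES_SHIFT */

            int used = sections + zones + nodes + last_nid;
            printf("%d field bits vs %d spare bits: last_nid %s in page->flags\n",
                   used, bits_per_long - nr_pageflags,
                   used <= bits_per_long - nr_pageflags ? "fits" : "does not fit");
            return 0;
    }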
As Peter notes "Completely dropping 32bit support for CONFIG_NUMA_BALANCING would simplify things, but it would also remove the warning if we grow enough 64bit only page-flags to push the last-cpu out." [mgorman@suse.de: minor modifications] Signed-off-by: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 33 ++++++++++++++++++++++++++++++++- include/linux/mm_types.h | 2 +- include/linux/page-flags-layout.h | 33 +++++++++++++++++++++++++-------- mm/memory.c | 4 ++++ 4 files changed, 62 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c2d7d5993b14..473abbda942e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,10 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) +#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -594,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) +#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -615,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) +#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -654,6 +657,7 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS static inline int page_xchg_last_nid(struct page *page, int nid) { return xchg(&page->_last_nid, nid); @@ -668,6 +672,33 @@ static inline void reset_page_last_nid(struct page *page) page->_last_nid = -1; } #else +static inline int page_last_nid(struct page *page) +{ + return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; +} + +static inline int page_xchg_last_nid(struct page *page, int nid) +{ + unsigned long old_flags, flags; + int last_nid; + + do { + old_flags = flags = page->flags; + last_nid = page_last_nid(page); + + flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); + flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_nid; +} + +static inline void reset_page_last_nid(struct page *page) +{ + page_xchg_last_nid(page, (1 << LAST_NID_SHIFT) - 1); +} +#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ +#else static inline int page_xchg_last_nid(struct page *page, int nid) { return page_to_nid(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d05d632c716e..ace9a5f01c64 100644 --- 
a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,7 +174,7 @@ struct page { void *shadow; #endif -#ifdef CONFIG_NUMA_BALANCING +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS int _last_nid; #endif } diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 316805d6ba1b..93506a114034 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -32,15 +32,16 @@ /* * page->flags layout: * - * There are three possibilities for how page->flags get - * laid out. The first is for the normal case, without - * sparsemem. The second is for sparsemem when there is - * plenty of space for node and section. The last is when - * we have run out of space and have to fall back to an - * alternate (slower) way of determining the node. + * There are five possibilities for how page->flags get laid out. The first + * pair is for the normal case without sparsemem. The second pair is for + * sparsemem when there is plenty of space for node and section information. + * The last is when there is insufficient space in page->flags and a separate + * lookup is necessary. * - * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | + * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | + * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -60,6 +61,18 @@ #define NODES_WIDTH 0 #endif +#ifdef CONFIG_NUMA_BALANCING +#define LAST_NID_SHIFT NODES_SHIFT +#else +#define LAST_NID_SHIFT 0 +#endif + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_NID_WIDTH LAST_NID_SHIFT +#else +#define LAST_NID_WIDTH 0 +#endif + /* * We are going to use the flags for the page to node mapping if its in * there. This includes the case where there is no node, so it is implicit. @@ -68,4 +81,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif +#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 +#define LAST_NID_NOT_IN_PAGE_FLAGS +#endif + #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/mm/memory.c b/mm/memory.c index 7837ceacf090..054250ee4a68 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,10 @@ #include "internal.h" +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; -- cgit v1.2.3-55-g7522 From 9800339b5e0f0e24ab3dac349e0de80d2018832e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 22 Feb 2013 16:34:35 -0800 Subject: mm: don't inline page_mapping() According to akpm, this saves 1/2k text and makes things simple for the next patch. 
Numbers from Minchan:

add/remove: 1/0 grow/shrink: 6/22 up/down: 92/-516 (-424)
function                       old     new   delta
page_mapping                     -      48     +48
do_task_stat                  2292    2308     +16
page_remove_rmap               240     248      +8
load_elf_binary               4500    4508      +8
update_queue                   532     536      +4
scsi_probe_and_add_lun        2892    2896      +4
lookup_fast                    644     648      +4
vcs_read                      1040    1036      -4
__ip_route_output_key         1904    1900      -4
ip_route_input_noref          2508    2500      -8
shmem_file_aio_read            784     772     -12
__isolate_lru_page             272     256     -16
shmem_replace_page             708     688     -20
mark_buffer_dirty              228     208     -20
__set_page_dirty_buffers       240     220     -20
__remove_mapping               276     256     -20
update_mmu_cache               500     476     -24
set_page_dirty_balance          92      68     -24
set_page_dirty                 172     148     -24
page_evictable                  88      64     -24
page_cache_pipe_buf_steal      248     224     -24
clear_page_dirty_for_io        340     316     -24
test_set_page_writeback        400     372     -28
test_clear_page_writeback      516     488     -28
invalidate_inode_page          156     128     -28
page_mkclean                   432     400     -32
flush_dcache_page              360     328     -32
__set_page_dirty_nobuffers     324     280     -44
shrink_page_list              2412    2356     -56

Signed-off-by: Shaohua Li Suggested-by: Andrew Morton Cc: Hugh Dickins Acked-by: Rik van Riel Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +------------ mm/util.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 473abbda942e..1d4122bf6f27 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -809,18 +809,7 @@ void page_address_init(void); #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) -extern struct address_space swapper_space; -static inline struct address_space *page_mapping(struct page *page) -{ - struct address_space *mapping = page->mapping; - - VM_BUG_ON(PageSlab(page)); - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; -} +extern struct address_space *page_mapping(struct page *page); /* Neutral page->mapping pointer to address_space or anon_vma or other */ static inline void *page_rmapping(struct page *page) diff --git a/mm/util.c b/mm/util.c index 3704bf1bef94..16a73195a37b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "internal.h" @@ -382,6 +383,21 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON(PageSlab(page)); +#ifdef CONFIG_SWAP + if (unlikely(PageSwapCache(page))) + mapping = &swapper_space; + else +#endif + if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = NULL; + return mapping; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -- cgit v1.2.3-55-g7522 From 75f7ad8e043d9383337d917584297f7737154bbf Mon Sep 17 00:00:00 2001 From: Paul Szabo Date: Fri, 22 Feb 2013 16:34:42 -0800 Subject: page-writeback.c: subtract min_free_kbytes from dirtyable memory When calculating the amount of dirtyable memory, min_free_kbytes should be subtracted because it is not intended for dirty pages.
Addresses http://bugs.debian.org/695182 [akpm@linux-foundation.org: fix up min_free_kbytes extern declarations] [akpm@linux-foundation.org: fix min() warning] Signed-off-by: Paul Szabo Acked-by: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ kernel/sysctl.c | 1 - mm/huge_memory.c | 1 - mm/page-writeback.c | 3 +++ 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d4122bf6f27..437da0ce78c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1393,6 +1393,9 @@ extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +/* page_alloc.c */ +extern int min_free_kbytes; + /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 467d8b923fcd..95e9e55602a8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -105,7 +105,6 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; -extern int min_free_kbytes; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b1cc6591ed83..c63a21d0e991 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void) struct zone *zone; int nr_zones = 0; unsigned long recommended_min; - extern int min_free_kbytes; if (!khugepaged_enabled()) return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7300c9d5e1d9..cdc377c456c0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void) if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); + /* Subtract min_free_kbytes */ + x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); + return x + 1; /* Ensure that we never return 0 */ } -- cgit v1.2.3-55-g7522 From 4468b8f1e2d32ce79ef4bcb8e00d7e88627f1c3a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 22 Feb 2013 16:34:46 -0800 Subject: mm: uninline page_xchg_last_nid() Andrew Morton pointed out that page_xchg_last_nid() and reset_page_last_nid() were "getting nuttily large" and asked that it be investigated. reset_page_last_nid() is on the page free path and it would be unfortunate to make that path more expensive than it needs to be. Due to its internal use of page_xchg_last_nid() it is already too expensive; fortunately, it should also be impossible for page->flags to be updated in parallel when we call reset_page_last_nid(). Instead of uninlining that function, this patch switches it to a simpler implementation that assumes no parallel updates, which should now be sufficiently short for inlining. page_xchg_last_nid() is called in paths that are already quite expensive (splitting a huge page, fault handling, migration), so it is reasonable to uninline it. There was not really a good place to put the function, but mm/mmzone.c was the closest fit IMO. This patch saved 128 bytes of text in the vmlinux file for the kernel configuration I used for testing automatic NUMA balancing.
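The out-of-line page_xchg_last_nid() keeps its cmpxchg retry loop. A minimal userspace sketch of that lock-free pattern follows — the field placement is made up for illustration, and GCC's __sync_val_compare_and_swap builtin stands in for the kernel's cmpxchg():

    #include <stdio.h>

    #define LAST_NID_SHIFT 2                        /* illustrative bit position */
    #define LAST_NID_MASK  ((1UL << 4) - 1)         /* illustrative 4-bit field */

    static unsigned long flags_word;                /* stands in for page->flags */

    /* Swap in a new last_nid, retrying if another update raced with us. */
    static int xchg_last_nid(int nid)
    {
            unsigned long old, new;
            int last;

            do {
                    old = flags_word;
                    last = (old >> LAST_NID_SHIFT) & LAST_NID_MASK;
                    new = old & ~(LAST_NID_MASK << LAST_NID_SHIFT);
                    new |= ((unsigned long)nid & LAST_NID_MASK) << LAST_NID_SHIFT;
            } while (__sync_val_compare_and_swap(&flags_word, old, new) != old);

            return last;
    }

    int main(void)
    {
            printf("previous nid: %d\n", xchg_last_nid(3));  /* 0 */
            printf("previous nid: %d\n", xchg_last_nid(1));  /* 3 */
            return 0;
    }

The loop re-reads the whole flags word on each pass, so a racing update to any other field simply costs one more iteration rather than being lost.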
Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++---------------- mm/mmzone.c | 20 +++++++++++++++++++- 2 files changed, 24 insertions(+), 17 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 437da0ce78c7..8a5bbe3b9e56 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -677,25 +677,14 @@ static inline int page_last_nid(struct page *page) return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; } -static inline int page_xchg_last_nid(struct page *page, int nid) -{ - unsigned long old_flags, flags; - int last_nid; - - do { - old_flags = flags = page->flags; - last_nid = page_last_nid(page); - - flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; - } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - - return last_nid; -} +extern int page_xchg_last_nid(struct page *page, int nid); static inline void reset_page_last_nid(struct page *page) { - page_xchg_last_nid(page, (1 << LAST_NID_SHIFT) - 1); + int nid = (1 << LAST_NID_SHIFT) - 1; + + page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); + page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; } #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ #else diff --git a/mm/mmzone.c b/mm/mmzone.c index 4596d81b89b1..bce796e8487f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -1,7 +1,7 @@ /* * linux/mm/mmzone.c * - * management codes for pgdats and zones. + * management codes for pgdats, zones and page flags */ @@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); } + +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) +int page_xchg_last_nid(struct page *page, int nid) +{ + unsigned long old_flags, flags; + int last_nid; + + do { + old_flags = flags = page->flags; + last_nid = page_last_nid(page); + + flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); + flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_nid; +} +#endif -- cgit v1.2.3-55-g7522 From 22b751c3d0376e86a377e3a0aa2ddbbe9d2eefc1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 22 Feb 2013 16:34:59 -0800 Subject: mm: rename page struct field helpers The function names page_xchg_last_nid(), page_last_nid() and reset_page_last_nid() were judged to be inconsistent so rename them to a struct_field_op style pattern. As it looked jarring to have reset_page_mapcount() and page_nid_reset_last() beside each other in memmap_init_zone(), this patch also renames reset_page_mapcount() to page_mapcount_reset(). There are others like init_page_count() but as it is used throughout the arch code a rename would likely cause more conflicts than it is worth. 
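For quick reference, the renames this patch performs (signatures unchanged):

    page_xchg_last_nid()   -> page_nid_xchg_last()
    page_last_nid()        -> page_nid_last()
    reset_page_last_nid()  -> page_nid_reset_last()
    reset_page_mapcount()  -> page_mapcount_reset()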
[akpm@linux-foundation.org: fix zcache] Signed-off-by: Mel Gorman Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/zcache/zbud.c | 2 +- drivers/staging/zsmalloc/zsmalloc-main.c | 2 +- include/linux/mm.h | 20 ++++++++++---------- mm/huge_memory.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 4 ++-- mm/mmzone.c | 4 ++-- mm/page_alloc.c | 10 +++++----- mm/slob.c | 2 +- mm/slub.c | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux/mm.h') diff --git a/drivers/staging/zcache/zbud.c b/drivers/staging/zcache/zbud.c index 328c397ea5dc..fdff5c6a0239 100644 --- a/drivers/staging/zcache/zbud.c +++ b/drivers/staging/zcache/zbud.c @@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, else zbud_pers_pageframes--; zbudpage_spin_unlock(zbudpage); - reset_page_mapcount(page); + page_mapcount_reset(page); init_page_count(page); page->index = 0; return page; diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c index 06f73a93a44d..e78d262c5249 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/drivers/staging/zsmalloc/zsmalloc-main.c @@ -472,7 +472,7 @@ static void reset_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; page->freelist = NULL; - reset_page_mapcount(page); + page_mapcount_reset(page); } static void free_zspage(struct page *first_page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a5bbe3b9e56..fe039bdba4ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,7 @@ static inline struct page *compound_head(struct page *page) * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ -static inline void reset_page_mapcount(struct page *page) +static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } @@ -658,28 +658,28 @@ static inline int page_to_nid(const struct page *page) #ifdef CONFIG_NUMA_BALANCING #ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_xchg_last_nid(struct page *page, int nid) +static inline int page_nid_xchg_last(struct page *page, int nid) { return xchg(&page->_last_nid, nid); } -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return page->_last_nid; } -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { page->_last_nid = -1; } #else -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; } -extern int page_xchg_last_nid(struct page *page, int nid); +extern int page_nid_xchg_last(struct page *page, int nid); -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { int nid = (1 << LAST_NID_SHIFT) - 1; @@ -688,17 +688,17 @@ static inline void reset_page_last_nid(struct page *page) } #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_xchg_last_nid(struct page *page, int nid) +static inline int page_nid_xchg_last(struct page *page, int nid) { return page_to_nid(page); } -static inline int page_last_nid(struct page *page) +static inline int page_nid_last(struct page *page) { return page_to_nid(page); } -static inline void reset_page_last_nid(struct page *page) +static inline void page_nid_reset_last(struct page *page) { } #endif diff --git 
a/mm/huge_memory.c b/mm/huge_memory.c index c63a21d0e991..6049376c7226 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1639,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_xchg_last_nid(page_tail, page_last_nid(page)); + page_nid_xchg_last(page_tail, page_nid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6f7979c566d9..2ae78e255e08 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2316,7 +2316,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_xchg_last_nid(page, polnid); + last_nid = page_nid_xchg_last(page, polnid); if (last_nid != polnid) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index f560071e89c5..de5c371a7969 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1497,7 +1497,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_xchg_last_nid(newpage, page_last_nid(page)); + page_nid_xchg_last(newpage, page_nid_last(page)); return newpage; } @@ -1681,7 +1681,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_xchg_last_nid(new_page, page_last_nid(page)); + page_nid_xchg_last(new_page, page_nid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { diff --git a/mm/mmzone.c b/mm/mmzone.c index bce796e8487f..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -98,14 +98,14 @@ void lruvec_init(struct lruvec *lruvec) } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) -int page_xchg_last_nid(struct page *page, int nid) +int page_nid_xchg_last(struct page *page, int nid) { unsigned long old_flags, flags; int last_nid; do { old_flags = flags = page->flags; - last_nid = page_last_nid(page); + last_nid = page_nid_last(page); flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3ede25e6686e..445718b328b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -327,7 +327,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -613,7 +613,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - reset_page_last_nid(page); + page_nid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3894,8 +3894,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); - reset_page_last_nid(page); + page_mapcount_reset(page); + page_nid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for diff --git a/mm/slob.c b/mm/slob.c index a99fdf7a0907..eeed4a05a2ef 100644 --- a/mm/slob.c +++ 
b/mm/slob.c @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); __ClearPageSlab(sp); - reset_page_mapcount(sp); + page_mapcount_reset(sp); slob_free_pages(b, 0); return; } diff --git a/mm/slub.c b/mm/slub.c index ba2ca53f6c3a..ebcc44eb43b9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); memcg_release_pages(s, order); - reset_page_mapcount(page); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_memcg_kmem_pages(page, order); -- cgit v1.2.3-55-g7522 From 9127ab4ff92f0ecd7b4671efa9d0edb21c691e9f Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 22 Feb 2013 16:35:21 -0800 Subject: mm: add SECTION_IN_PAGE_FLAGS Instead of directly utilizing a combination of config options to determine this, add a macro to specifically address it. Signed-off-by: Cody P Schafer Cc: David Hansen Cc: Catalin Marinas Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index fe039bdba4ed..97da0302cf51 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -625,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +#define SECTION_IN_PAGE_FLAGS +#endif + /* * The identification function is only used by the buddy allocator for * determining if two pages could be buddies. 
We are not really @@ -708,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page) return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } -#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); @@ -738,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, { set_page_zone(page, zone); set_page_node(page, node); -#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +#ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } -- cgit v1.2.3-55-g7522 From 28a35716d317980ae9bc2ff2f84c33a3cda9e884 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:35:55 -0800 Subject: mm: use long type for page counts in mm_populate() and get_user_pages() Use long type for page counts in mm_populate() so as to avoid integer overflow when running the following test code: int main(void) { void *p = mmap(NULL, 0x100000000000, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); printf("p: %p\n", p); mlockall(MCL_CURRENT); printf("done\n"); return 0; } Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- include/linux/mm.h | 15 ++++++++------- mm/hugetlb.c | 12 ++++++------ mm/memory.c | 18 +++++++++--------- mm/mlock.c | 4 ++-- mm/nommu.c | 15 ++++++++------- 6 files changed, 36 insertions(+), 34 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c80d3f57a5b..eedc334fb6f5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, #endif int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); -int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, struct vm_area_struct **, - unsigned long *, int *, int, unsigned int flags); +long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, + struct page **, struct vm_area_struct **, + unsigned long *, unsigned long *, long, unsigned int); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range_final(struct mmu_gather *tlb, diff --git a/include/linux/mm.h b/include/linux/mm.h index 97da0302cf51..87b0ef253607 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1013,13 +1013,14 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, int write); -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas); +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking); +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long 
start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct kvec; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e14a8c79a1eb..cdb64e4d238a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2920,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i, - unsigned int flags) +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; - int remainder = *length; + unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); @@ -2997,7 +2997,7 @@ same_page: } } spin_unlock(&mm->page_table_lock); - *length = remainder; + *nr_pages = remainder; *position = vaddr; return i ? i : -EFAULT; diff --git a/mm/memory.c b/mm/memory.c index 7bd22a621817..bc929dbad215 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1677,15 +1677,15 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { - int i; + long i; unsigned long vm_flags; - if (nr_pages <= 0) + if (!nr_pages) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); @@ -1981,9 +1981,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * * See also get_user_pages_fast, for performance critical applications. 
*/ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) { int flags = FOLL_TOUCH; diff --git a/mm/mlock.c b/mm/mlock.c index 38db3b094105..e6638f565d42 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -160,7 +160,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - int nr_pages = (end - start) / PAGE_SIZE; + unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -382,7 +382,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; - int ret = 0; + long ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); diff --git a/mm/nommu.c b/mm/nommu.c index 87854a55829d..6ab706608492 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *retry) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -190,9 +190,10 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { int flags = 0; -- cgit v1.2.3-55-g7522 From 240aadeedc4a89fc44623f8ce4ca46bda73db07e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:35:56 -0800 Subject: mm: accelerate mm_populate() treatment of THP pages This change adds a follow_page_mask function which is equivalent to follow_page, but with an extra page_mask argument. follow_page_mask sets *page_mask to HPAGE_PMD_NR - 1 when it encounters a THP page, and to 0 in other cases. __get_user_pages() makes use of this in order to accelerate populating THP ranges - that is, when both the pages and vmas arrays are NULL, we don't need to iterate HPAGE_PMD_NR times to cover a single THP page (and we also avoid taking mm->page_table_lock that many times). 
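The stride arithmetic that makes this work is terse, so here is a standalone walk-through with an assumed 2 MiB THP (HPAGE_PMD_NR = 512 with 4 KiB pages; the address is an arbitrary example):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define HPAGE_PMD_NR 512   /* assumed: 2 MiB THP / 4 KiB base pages */

    int main(void)
    {
            unsigned long start = (2UL << 20) + (5UL << PAGE_SHIFT); /* 5 pages into a THP */
            unsigned int page_mask = HPAGE_PMD_NR - 1;

            /* pages from 'start' up to and including the end of the huge page */
            unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

            printf("page_increm = %lu\n", page_increm); /* prints 507 = 512 - 5 */
            return 0;
    }

Starting 5 pages into the huge page, a single iteration advances by the remaining 507 pages instead of looping 507 times.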
Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++-- mm/memory.c | 31 +++++++++++++++++++++++-------- mm/nommu.c | 6 ++++-- 3 files changed, 38 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 87b0ef253607..6124f1db50fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1629,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -struct page *follow_page(struct vm_area_struct *, unsigned long address, - unsigned int foll_flags); +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int foll_flags, + unsigned int *page_mask); + +static inline struct page *follow_page(struct vm_area_struct *vma, + unsigned long address, unsigned int foll_flags) +{ + unsigned int unused_page_mask; + return follow_page_mask(vma, address, foll_flags, &unused_page_mask); +} + #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ #define FOLL_GET 0x04 /* do get_page on page */ diff --git a/mm/memory.c b/mm/memory.c index bc929dbad215..5d2ef1217d0c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1462,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, EXPORT_SYMBOL_GPL(zap_vma_ptes); /** - * follow_page - look up a page descriptor from a user-virtual address + * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page * * @flags can have FOLL_ flags set, defined in * @@ -1473,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). 
*/ -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { pgd_t *pgd; pud_t *pud; @@ -1484,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct page *page; struct mm_struct *mm = vma->vm_mm; + *page_mask = 0; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); @@ -1530,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); + *page_mask = HPAGE_PMD_NR - 1; goto out; } } else @@ -1684,6 +1689,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, { long i; unsigned long vm_flags; + unsigned int page_mask; if (!nr_pages) return 0; @@ -1761,6 +1767,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); + page_mask = 0; goto next_page; } @@ -1778,6 +1785,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *page; unsigned int foll_flags = gup_flags; + unsigned int page_increm; /* * If we have a pending SIGKILL, don't keep faulting @@ -1787,7 +1795,8 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? i : -ERESTARTSYS; cond_resched(); - while (!(page = follow_page(vma, start, foll_flags))) { + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { int ret; unsigned int fault_flags = 0; @@ -1861,13 +1870,19 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); + page_mask = 0; } next_page: - if (vmas) + if (vmas) { vmas[i] = vma; - i++; - start += PAGE_SIZE; - nr_pages--; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; diff --git a/mm/nommu.c b/mm/nommu.c index 6ab706608492..da0d210fd403 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1819,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { + *page_mask = 0; return NULL; } -- cgit v1.2.3-55-g7522 From 5117b3b835f288314a2d4e5512bc1747e3a7c8ed Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:36:07 -0800 Subject: mm,ksm: FOLL_MIGRATION do migration_entry_wait In "ksm: remove old stable nodes more thoroughly" I said that I'd never seen its WARN_ON_ONCE(page_mapped(page)). True at the time of writing, but it soon appeared once I tried fuller tests on the whole series. It turned out to be due to the KSM page migration itself: unmerge_and_remove_all_rmap_items() failed to locate and replace all the KSM pages, because of that hiatus in page migration when old pte has been replaced by migration entry, but not yet by new pte. follow_page() finds no page at that instant, but a KSM page reappears shortly after, without a fault.
Add FOLL_MIGRATION flag, so follow_page() can do migration_entry_wait() for KSM's break_cow(). I'd have preferred to avoid another flag, and do it every time, in case someone else makes the same easy mistake; but did not find another transgressor (the common get_user_pages() is of course safe), and cannot be sure that every follow_page() caller is prepared to sleep - ia64's xencomm_vtop()? Now, THP's wait_split_huge_page() can already sleep there, since anon_vma locking was changed to mutex, but maybe that's somehow excluded. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/ksm.c | 2 +- mm/memory.c | 20 ++++++++++++++++++-- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6124f1db50fe..e7c3f9a0111a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1651,6 +1651,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ +#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/ksm.c b/mm/ksm.c index 0320327b8a6c..d61cba6fa1dc 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -364,7 +364,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET); + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) diff --git a/mm/memory.c b/mm/memory.c index 5d2ef1217d0c..ec8ba011fa7d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1548,8 +1548,24 @@ split_fallthrough: ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) - goto no_page; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) -- cgit v1.2.3-55-g7522
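The control flow added above reduces to: on a migration entry, either give up (the default) or wait and retry (FOLL_MIGRATION). A simplified, self-contained sketch — every type and helper below is a stub standing in for the kernel's pte machinery, not a real kernel API:

    #include <stdio.h>
    #include <stdbool.h>

    #define FOLL_MIGRATION 0x400                /* value taken from the hunk above */

    /* Stub pte states standing in for the kernel's pte machinery (assumptions). */
    enum pte_kind { PTE_PRESENT, PTE_MIGRATION };

    static enum pte_kind lookups[] = { PTE_MIGRATION, PTE_PRESENT };
    static int attempt;

    static enum pte_kind read_pte(void) { return lookups[attempt++]; }
    static void migration_entry_wait(void) { printf("waiting for migration\n"); }

    /* Mirrors the follow_page() change: optionally wait out a migration entry. */
    static bool find_page(unsigned int flags)
    {
    retry:
            switch (read_pte()) {
            case PTE_PRESENT:
                    return true;
            case PTE_MIGRATION:
                    if (!(flags & FOLL_MIGRATION))
                            return false;       /* old behaviour: no page found */
                    migration_entry_wait();     /* sleep until the new pte exists */
                    goto retry;
            }
            return false;
    }

    int main(void)
    {
            printf("page found: %d\n", find_page(FOLL_MIGRATION)); /* 1 */
            return 0;
    }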