diff options
author | Frederic Weisbecker | 2009-10-18 01:09:09 +0200 |
---|---|---|
committer | Frederic Weisbecker | 2009-10-18 01:12:33 +0200 |
commit | 0f8f86c7bdd1c954fbe153af437a0d91a6c5721a (patch) | |
tree | 94a8d419a470a4f9852ca397bb9bbe48db92ff5c /mm | |
parent | Merge branch 'linus' into tracing/hw-breakpoints (diff) | |
parent | perf tools: Move dereference after NULL test (diff) | |
download | kernel-qcow2-linux-0f8f86c7bdd1c954fbe153af437a0d91a6c5721a.tar.gz kernel-qcow2-linux-0f8f86c7bdd1c954fbe153af437a0d91a6c5721a.tar.xz kernel-qcow2-linux-0f8f86c7bdd1c954fbe153af437a0d91a6c5721a.zip |
Merge commit 'perf/core' into perf/hw-breakpoint
Conflicts:
kernel/Makefile
kernel/trace/Makefile
kernel/trace/trace.h
samples/Makefile
Merge reason: We need to be uptodate with the perf events development
branch because we plan to rewrite the breakpoints API on top of
perf events.
Diffstat (limited to 'mm')
46 files changed, 6563 insertions, 1462 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index fe5f674d7a7d..57963c6063d1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE # config PAGEFLAGS_EXTENDED def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address @@ -214,6 +214,20 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single resident page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). + config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" default 4096 @@ -232,6 +246,22 @@ config DEFAULT_MMAP_MIN_ADDR This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "Poison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index aa99fd1f7109..af7cfb43d2f0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types - of memory corruptions. + of memory corruption. config WANT_PAGE_DEBUG_FLAGS bool @@ -17,11 +17,11 @@ config PAGE_POISONING depends on !HIBERNATION select DEBUG_PAGEALLOC select WANT_PAGE_DEBUG_FLAGS - help + ---help--- Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruptions. + but helps to find certain types of memory corruption. - This option cannot enalbe with hibernation. Otherwise, it will get - wrong messages for memory corruption because the free pages are not - saved to the suspend image. + This option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 147a7a7873c4..ebf849042ed3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,16 +5,16 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o \ + $(mmu-y) obj-y += init-mm.o -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HAS_DMA) += dmapool.o @@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o @@ -33,12 +34,14 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a47359..df34ceae0c67 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include <linux/mm.h> #include <linux/module.h> +#include <linux/bootmem.h> +#include <asm/sections.h> #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d3ca0dac1111..5a37e2055717 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = { EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; + +/* + * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as + * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * locking. + */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); LIST_HEAD(bdi_pending_list); @@ -86,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WriteBack threads:%8lu\n" + "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" @@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr) /* * Add us to the active bdi_list */ - spin_lock(&bdi_lock); - list_add(&bdi->bdi_list, &bdi_list); - spin_unlock(&bdi_lock); + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); bdi_task_init(bdi, wb); @@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr) if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) wb_do_writeback(me, 0); - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); /* * Check if any existing bdi's have dirty data without @@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr) if (list_empty(&bdi_pending_list)) { unsigned long wait; - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); wait = msecs_to_jiffies(dirty_writeback_interval * 10); schedule_timeout(wait); try_to_freeze(); @@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr) bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, bdi_list); list_del_init(&bdi->bdi_list); - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); wb = &bdi->wb; wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", @@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr) * a chance to flush other bdi's to free * memory. */ - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); bdi_flush_io(bdi); } @@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr) return 0; } +static void bdi_add_to_pending(struct rcu_head *head) +{ + struct backing_dev_info *bdi; + + bdi = container_of(head, struct backing_dev_info, rcu_head); + INIT_LIST_HEAD(&bdi->bdi_list); + + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); +} + /* * Add the default flusher task that gets created for any bdi * that has dirty data pending writeout @@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) * waiting for previous additions to finish. */ if (!test_and_set_bit(BDI_pending, &bdi->state)) { - list_move_tail(&bdi->bdi_list, &bdi_pending_list); + list_del_rcu(&bdi->bdi_list); /* - * We are now on the pending list, wake up bdi_forker_task() - * to finish the job and add us back to the active bdi_list + * We must wait for the current RCU period to end before + * moving to the pending list. So schedule that operation + * from an RCU callback. */ - wake_up_process(default_backing_dev_info.wb.task); + call_rcu(&bdi->rcu_head, bdi_add_to_pending); } } +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu(); +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } - spin_lock(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_list); - spin_unlock(&bdi_lock); + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); bdi->dev = dev; @@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, wb->task = NULL; ret = -ENOMEM; - spin_lock(&bdi_lock); - list_del(&bdi->bdi_list); - spin_unlock(&bdi_lock); + bdi_remove_from_list(bdi); goto exit; } } @@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) /* * Make sure nobody finds us on the bdi_list anymore */ - spin_lock(&bdi_lock); - list_del(&bdi->bdi_list); - spin_unlock(&bdi_lock); + bdi_remove_from_list(bdi); /* * Finally, kill the kernel threads. We don't need to be RCU @@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); + INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); @@ -634,7 +668,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - WARN_ON(bdi_has_dirty_io(bdi)); + /* + * Splice our entries to the default_backing_dev_info, if this + * bdi disappears + */ + if (bdi_has_dirty_io(bdi)) { + struct bdi_writeback *dst = &default_backing_dev_info.wb; + + spin_lock(&inode_lock); + list_splice(&bdi->wb.b_dirty, &dst->b_dirty); + list_splice(&bdi->wb.b_io, &dst->b_io); + list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + spin_unlock(&inode_lock); + } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index dd51c68e2b86..ef169f37156d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -104,6 +104,10 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* @@ -119,6 +123,8 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -431,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -1603,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3ce78c..1888b2d71bb8 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -296,7 +296,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b16d63634777..5d7601b02874 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) -{ - int nid; - struct page *page = NULL; - - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } - return page; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) @@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) /* * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: + * copy it back to next_nid_to_alloc afterwards: * otherwise there's a window in which a racer might * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really @@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ -static int hstate_next_node(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h) { int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); + next_nid = next_node(h->next_nid_to_alloc, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + h->next_nid_to_alloc = next_nid; return next_nid; } @@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = h->next_nid_to_alloc; + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, next_nid); if (page) ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + next_nid = hstate_next_node_to_alloc(h); + } while (!page && next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } +/* + * helper for free_pool_huge_page() - find next node + * from which to free a huge page + */ +static int hstate_next_node_to_free(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->next_nid_to_free, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->next_nid_to_free = next_nid; + return next_nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + } + next_nid = hstate_next_node_to_free(h); + } while (!ret && next_nid != start_nid); + + return ret; +} + static struct page *alloc_buddy_huge_page(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -855,22 +893,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = nr_online_nodes; - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = nr_online_nodes; - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the frees across the + * on-line nodes for us and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, 1)) + break; } } @@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(h->next_nid_to_alloc), huge_page_size(h), huge_page_size(h), 0); + hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) */ static int adjust_pool_surplus(struct hstate *h, int delta) { - static int prev_nid; - int nid = prev_nid; + int start_nid, next_nid; int ret = 0; VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= + if (delta < 0) + start_nid = h->next_nid_to_alloc; + else + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + next_nid = hstate_next_node_to_alloc(h); + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) + continue; + } + if (delta > 0) { + next_nid = hstate_next_node_to_free(h); + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= h->nr_huge_pages_node[nid]) - continue; + continue; + } h->surplus_huge_pages += delta; h->surplus_huge_pages_node[nid] += delta; ret = 1; break; - } while (nid != prev_nid); + } while (next_nid != start_nid); - prev_nid = nid; return ret; } @@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) min_count = max(count, min_count); try_to_free_low(h, min_count); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, 1)) @@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_online_map); + h->next_nid_to_free = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1506,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1517,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1526,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1538,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1549,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1690,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, @@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags) { @@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2262,7 +2311,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 000000000000..e1d85137f086 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,41 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +static struct dentry *hwpoison_dir, *corrupt_pfn; + +static int hwpoison_inject(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); + return __memory_failure(val, 18, 0); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (corrupt_pfn == NULL) { + pfn_inject_exit(); + return -ENOMEM; + } + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/internal.h b/mm/internal.h index f290c4db528b..22ec8d2b0fb8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,8 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +extern unsigned long highest_memmap_pfn; + /* * in mm/vmscan.c: */ @@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); @@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define GUP_FLAGS_WRITE 0x1 -#define GUP_FLAGS_FORCE 0x2 -#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 -#define GUP_FLAGS_IGNORE_SIGKILL 0x8 - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, + unsigned long start, int len, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas); #define ZONE_RECLAIM_NOSCAN -2 diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f523..177a5169bbde 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4ea4510e2996..8bf765c4f58d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log) */ rcu_read_lock(); object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_KERNEL); + log->min_count, GFP_ATOMIC); + if (!object) + goto out; spin_lock_irqsave(&object->lock, flags); for (i = 0; i < log->trace_len; i++) object->trace[i] = log->trace[i]; object->trace_len = log->trace_len; spin_unlock_irqrestore(&object->lock, flags); +out: rcu_read_unlock(); } diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 000000000000..bef1af4f77e3 --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,1709 @@ +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/mmu_notifier.h> +#include <linux/swap.h> +#include <linux/ksm.h> + +#include <asm/tlbflush.h> + +/* + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + */ + +/** + * struct mm_slot - ksm information per mm that is being scanned + * @link: link to the mm_slots hash list + * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @rmap_list: head for this mm_slot's list of rmap_items + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node link; + struct list_head mm_list; + struct list_head rmap_list; + struct mm_struct *mm; +}; + +/** + * struct ksm_scan - cursor for scanning + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * @rmap_item: the current rmap that we are scanning inside the rmap_list + * @seqnr: count of completed full scans (needed when removing unstable node) + * + * There is only the one ksm_scan instance of this cursor structure. + */ +struct ksm_scan { + struct mm_slot *mm_slot; + unsigned long address; + struct rmap_item *rmap_item; + unsigned long seqnr; +}; + +/** + * struct rmap_item - reverse mapping item for virtual addresses + * @link: link into mm_slot's rmap_list (rmap_list is per mm) + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @oldchecksum: previous checksum of the page at that virtual address + * @node: rb_node of this rmap_item in either unstable or stable tree + * @next: next rmap_item hanging off the same node of the stable tree + * @prev: previous rmap_item hanging off the same node of the stable tree + */ +struct rmap_item { + struct list_head link; + struct mm_struct *mm; + unsigned long address; /* + low bits used for flags below */ + union { + unsigned int oldchecksum; /* when unstable */ + struct rmap_item *next; /* when stable */ + }; + union { + struct rb_node node; /* when tree node */ + struct rmap_item *prev; /* in stable list */ + }; +}; + +#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ +#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ + +/* The stable and unstable tree heads */ +static struct rb_root root_stable_tree = RB_ROOT; +static struct rb_root root_unstable_tree = RB_ROOT; + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash; + +static struct mm_slot ksm_mm_head = { + .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), +}; +static struct ksm_scan ksm_scan = { + .mm_slot = &ksm_mm_head, +}; + +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *mm_slot_cache; + +/* The number of nodes in the stable tree */ +static unsigned long ksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long ksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + +/* Limit on the number of unswappable pages used */ +static unsigned long ksm_max_kernel_pages; + +/* Number of pages ksmd should scan in one batch */ +static unsigned int ksm_thread_pages_to_scan = 100; + +/* Milliseconds ksmd should sleep between batches */ +static unsigned int ksm_thread_sleep_millisecs = 20; + +#define KSM_RUN_STOP 0 +#define KSM_RUN_MERGE 1 +#define KSM_RUN_UNMERGE 2 +static unsigned int ksm_run = KSM_RUN_STOP; + +static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DEFINE_MUTEX(ksm_thread_mutex); +static DEFINE_SPINLOCK(ksm_mmlist_lock); + +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +static int __init ksm_slab_init(void) +{ + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out; + + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + if (!mm_slot_cache) + goto out_free; + + return 0; + +out_free: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init ksm_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(rmap_item_cache); + mm_slot_cache = NULL; +} + +static inline struct rmap_item *alloc_rmap_item(void) +{ + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + ksm_rmap_items--; + rmap_item->mm = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, link) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + INIT_LIST_HEAD(&mm_slot->rmap_list); + hlist_add_head(&mm_slot->link, bucket); +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +/* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. + */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (!page) + break; + if (PageKsm(page)) + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. + * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + break_ksm(vma, addr); +out: + up_read(&mm->mmap_sem); +} + +static struct page *get_mergeable_page(struct rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + struct page *page; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + + page = follow_page(vma, addr, FOLL_GET); + if (!page) + goto out; + if (PageAnon(page)) { + flush_anon_page(vma, page, addr); + flush_dcache_page(page); + } else { + put_page(page); +out: page = NULL; + } + up_read(&mm->mmap_sem); + return page; +} + +/* + * get_ksm_page: checks if the page at the virtual address in rmap_item + * is still PageKsm, in which case we can trust the content of the page, + * and it returns the gotten page; but NULL if the page has been zapped. + */ +static struct page *get_ksm_page(struct rmap_item *rmap_item) +{ + struct page *page; + + page = get_mergeable_page(rmap_item); + if (page && !PageKsm(page)) { + put_page(page); + page = NULL; + } + return page; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. + */ +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + if (in_stable_tree(rmap_item)) { + struct rmap_item *next_item = rmap_item->next; + + if (rmap_item->address & NODE_FLAG) { + if (next_item) { + rb_replace_node(&rmap_item->node, + &next_item->node, + &root_stable_tree); + next_item->address |= NODE_FLAG; + ksm_pages_sharing--; + } else { + rb_erase(&rmap_item->node, &root_stable_tree); + ksm_pages_shared--; + } + } else { + struct rmap_item *prev_item = rmap_item->prev; + + BUG_ON(prev_item->next != rmap_item); + prev_item->next = next_item; + if (next_item) { + BUG_ON(next_item->prev != rmap_item); + next_item->prev = rmap_item->prev; + } + ksm_pages_sharing--; + } + + rmap_item->next = NULL; + + } else if (rmap_item->address & NODE_FLAG) { + unsigned char age; + /* + * Usually ksmd can and must skip the rb_erase, because + * root_unstable_tree was already reset to RB_ROOT. + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. + */ + age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); + BUG_ON(age > 1); + if (!age) + rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; + } + + rmap_item->address &= PAGE_MASK; + + cond_resched(); /* we're called from many long loops */ +} + +static void remove_trailing_rmap_items(struct mm_slot *mm_slot, + struct list_head *cur) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + */ +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int unmerge_and_remove_all_rmap_items(void) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int err = 0; + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, + struct mm_slot, mm_list); + spin_unlock(&ksm_mmlist_lock); + + for (mm_slot = ksm_scan.mm_slot; + mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; + } + + remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_test_exit(mm)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + } + + ksm_scan.seqnr = 0; + return 0; + +error: + up_read(&mm->mmap_sem); + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = &ksm_mm_head; + spin_unlock(&ksm_mmlist_lock); + return err; +} +#endif /* CONFIG_SYSFS */ + +static u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page, KM_USER0); + checksum = jhash2(addr, PAGE_SIZE / 4, 17); + kunmap_atomic(addr, KM_USER0); + return checksum; +} + +static int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1, KM_USER0); + addr2 = kmap_atomic(page2, KM_USER1); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2, KM_USER1); + kunmap_atomic(addr1, KM_USER0); + return ret; +} + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr; + pte_t *ptep; + spinlock_t *ptl; + int swapped; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + ptep = page_check_address(page, mm, addr, &ptl, 0); + if (!ptep) + goto out; + + if (pte_write(*ptep)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, addr, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() run it doesnt + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racey and + * O_DIRECT can happen right after the check. + * So we clear the pte and flush the tlb before the check + * this assure us that no O_DIRECT can happen after the check + * or in the middle of the check. + */ + entry = ptep_clear_flush(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { + set_pte_at_notify(mm, addr, ptep, entry); + goto out_unlock; + } + entry = pte_wrprotect(entry); + set_pte_at_notify(mm, addr, ptep, entry); + } + *orig_pte = *ptep; + err = 0; + +out_unlock: + pte_unmap_unlock(ptep, ptl); +out: + return err; +} + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to oldpage + * @oldpage: the page we are replacing by newpage + * @newpage: the ksm page we replace oldpage by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, -EFAULT on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *oldpage, + struct page *newpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + pgprot_t prot; + int err = -EFAULT; + + prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); + + addr = page_address_in_vma(oldpage, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + get_page(newpage); + page_add_ksm_rmap(newpage); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); + + page_remove_rmap(oldpage); + put_page(oldpage); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * @vma: the vma that hold the pte pointing into oldpage + * @oldpage: the page that we want to replace with newpage + * @newpage: the page that we want to map instead of oldpage + * + * Note: + * oldpage should be a PageAnon page, while newpage should be a PageKsm page, + * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_one_page(struct vm_area_struct *vma, + struct page *oldpage, + struct page *newpage) +{ + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (!(vma->vm_flags & VM_MERGEABLE)) + goto out; + + if (!PageAnon(oldpage)) + goto out; + + get_page(newpage); + get_page(oldpage); + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. + */ + if (!trylock_page(oldpage)) + goto out_putpage; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, oldpage, &orig_pte)) { + unlock_page(oldpage); + goto out_putpage; + } + unlock_page(oldpage); + + if (pages_identical(oldpage, newpage)) + err = replace_page(vma, oldpage, newpage, orig_pte); + +out_putpage: + put_page(oldpage); + put_page(newpage); +out: + return err; +} + +/* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage must already be a ksm page. + */ +static int try_to_merge_with_ksm_page(struct mm_struct *mm1, + unsigned long addr1, + struct page *page1, + struct page *kpage) +{ + struct vm_area_struct *vma; + int err = -EFAULT; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) + goto out; + + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) + goto out; + + err = try_to_merge_one_page(vma, page1, kpage); +out: + up_read(&mm1->mmap_sem); + return err; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them + * to be merged into one page. + * + * This function returns 0 if we successfully mapped two identical pages + * into one page, -EFAULT otherwise. + * + * Note that this function allocates a new kernel page: if one of the pages + * is already a ksm page, try_to_merge_with_ksm_page should be used. + */ +static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, + struct page *page1, struct mm_struct *mm2, + unsigned long addr2, struct page *page2) +{ + struct vm_area_struct *vma; + struct page *kpage; + int err = -EFAULT; + + /* + * The number of nodes in the stable tree + * is the number of kernel pages that we hold. + */ + if (ksm_max_kernel_pages && + ksm_max_kernel_pages <= ksm_pages_shared) + return err; + + kpage = alloc_page(GFP_HIGHUSER); + if (!kpage) + return err; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) { + up_read(&mm1->mmap_sem); + goto out; + } + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) { + up_read(&mm1->mmap_sem); + goto out; + } + + copy_user_highpage(kpage, page1, addr1, vma); + err = try_to_merge_one_page(vma, page1, kpage); + up_read(&mm1->mmap_sem); + + if (!err) { + err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); + /* + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. + */ + if (err) + break_cow(mm1, addr1); + } +out: + put_page(kpage); + return err; +} + +/* + * stable_tree_search - search page inside the stable tree + * @page: the page that we are searching identical pages to. + * @page2: pointer into identical page that we are holding inside the stable + * tree that we have found. + * @rmap_item: the reverse mapping item + * + * This function checks if there is a page inside the stable tree + * with identical content to the page that we are scanning right now. + * + * This function return rmap_item pointer to the identical item if found, + * NULL otherwise. + */ +static struct rmap_item *stable_tree_search(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node *node = root_stable_tree.rb_node; + + while (node) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + int ret; + + tree_rmap_item = rb_entry(node, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + page2[0] = get_ksm_page(tree_rmap_item); + if (page2[0]) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, page2[0]); + + if (ret < 0) { + put_page(page2[0]); + node = node->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + node = node->rb_right; + } else { + return tree_rmap_item; + } + } + + return NULL; +} + +/* + * stable_tree_insert - insert rmap_item pointing to new ksm page + * into the stable tree. + * + * @page: the page that we are searching identical page to inside the stable + * tree. + * @rmap_item: pointer to the reverse mapping item. + * + * This function returns rmap_item if success, NULL otherwise. + */ +static struct rmap_item *stable_tree_insert(struct page *page, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_stable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + struct page *tree_page; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + tree_page = get_ksm_page(tree_rmap_item); + if (tree_page) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * It is not a bug that stable_tree_search() didn't + * find this node: because at that time our page was + * not yet write-protected, so may have changed since. + */ + return NULL; + } + } + + rmap_item->address |= NODE_FLAG | STABLE_FLAG; + rmap_item->next = NULL; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_stable_tree); + + ksm_pages_shared++; + return rmap_item; +} + +/* + * unstable_tree_search_insert - search and insert items into the unstable tree. + * + * @page: the page that we are going to search for identical page or to insert + * into the unstable tree + * @page2: pointer into identical page that was found inside the unstable tree + * @rmap_item: the reverse mapping item of page + * + * This function searches for a page in the unstable tree identical to the + * page currently being scanned; and if no identical page is found in the + * tree, we insert rmap_item as a new object into the unstable tree. + * + * This function returns pointer to rmap_item found to be identical + * to the currently scanned page, NULL otherwise. + * + * This function does both searching and inserting, because they share + * the same walking algorithm in an rbtree. + */ +static struct rmap_item *unstable_tree_search_insert(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + page2[0] = get_mergeable_page(tree_rmap_item); + if (!page2[0]) + return NULL; + + /* + * Don't substitute an unswappable ksm page + * just for one good swappable forked page. + */ + if (page == page2[0]) { + put_page(page2[0]); + return NULL; + } + + ret = memcmp_pages(page, page2[0]); + + parent = *new; + if (ret < 0) { + put_page(page2[0]); + new = &parent->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + new = &parent->rb_right; + } else { + return tree_rmap_item; + } + } + + rmap_item->address |= NODE_FLAG; + rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_unstable_tree); + + ksm_pages_unshared++; + return NULL; +} + +/* + * stable_tree_append - add another rmap_item to the linked list of + * rmap_items hanging off a given node of the stable tree, all sharing + * the same ksm page. + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item) +{ + rmap_item->next = tree_rmap_item->next; + rmap_item->prev = tree_rmap_item; + + if (tree_rmap_item->next) + tree_rmap_item->next->prev = rmap_item; + + tree_rmap_item->next = rmap_item; + rmap_item->address |= STABLE_FLAG; + + ksm_pages_sharing++; +} + +/* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can + * be inserted into the unstable tree, or merged with a page already there and + * both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +{ + struct page *page2[1]; + struct rmap_item *tree_rmap_item; + unsigned int checksum; + int err; + + if (in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + + /* We first start with searching the page inside the stable tree */ + tree_rmap_item = stable_tree_search(page, page2, rmap_item); + if (tree_rmap_item) { + if (page == page2[0]) /* forked */ + err = 0; + else + err = try_to_merge_with_ksm_page(rmap_item->mm, + rmap_item->address, + page, page2[0]); + put_page(page2[0]); + + if (!err) { + /* + * The page was successfully merged: + * add its rmap_item to the stable tree. + */ + stable_tree_append(rmap_item, tree_rmap_item); + } + return; + } + + /* + * A ksm page might have got here by fork, but its other + * references have already been removed from the stable tree. + * Or it might be left over from a break_ksm which failed + * when the mem_cgroup had reached its limit: try again now. + */ + if (PageKsm(page)) + break_cow(rmap_item->mm, rmap_item->address); + + /* + * In case the hash value of the page was changed from the last time we + * have calculated it, this page to be changed frequely, therefore we + * don't want to insert it to the unstable tree, and we don't want to + * waste our time to search if there is something identical to it there. + */ + checksum = calc_checksum(page); + if (rmap_item->oldchecksum != checksum) { + rmap_item->oldchecksum = checksum; + return; + } + + tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); + if (tree_rmap_item) { + err = try_to_merge_two_pages(rmap_item->mm, + rmap_item->address, page, + tree_rmap_item->mm, + tree_rmap_item->address, page2[0]); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (!err) { + rb_erase(&tree_rmap_item->node, &root_unstable_tree); + tree_rmap_item->address &= ~NODE_FLAG; + ksm_pages_unshared--; + + /* + * If we fail to insert the page into the stable tree, + * we will have 2 virtual addresses that are pointing + * to a ksm page left outside the stable tree, + * in which case we need to break_cow on both. + */ + if (stable_tree_insert(page2[0], tree_rmap_item)) + stable_tree_append(rmap_item, tree_rmap_item); + else { + break_cow(tree_rmap_item->mm, + tree_rmap_item->address); + break_cow(rmap_item->mm, rmap_item->address); + } + } + + put_page(page2[0]); + } +} + +static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, + struct list_head *cur, + unsigned long addr) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + if ((rmap_item->address & PAGE_MASK) == addr) { + if (!in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + return rmap_item; + } + if (rmap_item->address > addr) + break; + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } + + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ + rmap_item->mm = mm_slot->mm; + rmap_item->address = addr; + list_add_tail(&rmap_item->link, cur); + } + return rmap_item; +} + +static struct rmap_item *scan_get_next_rmap_item(struct page **page) +{ + struct mm_struct *mm; + struct mm_slot *slot; + struct vm_area_struct *vma; + struct rmap_item *rmap_item; + + if (list_empty(&ksm_mm_head.mm_list)) + return NULL; + + slot = ksm_scan.mm_slot; + if (slot == &ksm_mm_head) { + root_unstable_tree = RB_ROOT; + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + ksm_scan.mm_slot = slot; + spin_unlock(&ksm_mmlist_lock); +next_mm: + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + + mm = slot->mm; + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + vma = NULL; + else + vma = find_vma(mm, ksm_scan.address); + + for (; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_MERGEABLE)) + continue; + if (ksm_scan.address < vma->vm_start) + ksm_scan.address = vma->vm_start; + if (!vma->anon_vma) + ksm_scan.address = vma->vm_end; + + while (ksm_scan.address < vma->vm_end) { + if (ksm_test_exit(mm)) + break; + *page = follow_page(vma, ksm_scan.address, FOLL_GET); + if (*page && PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); + rmap_item = get_next_rmap_item(slot, + ksm_scan.rmap_item->link.next, + ksm_scan.address); + if (rmap_item) { + ksm_scan.rmap_item = rmap_item; + ksm_scan.address += PAGE_SIZE; + } else + put_page(*page); + up_read(&mm->mmap_sem); + return rmap_item; + } + if (*page) + put_page(*page); + ksm_scan.address += PAGE_SIZE; + cond_resched(); + } + } + + if (ksm_test_exit(mm)) { + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + /* + * Nuke all the rmap_items that are above this current rmap: + * because there were no VM_MERGEABLE vmas with such addresses. + */ + remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_scan.address == 0) { + /* + * We've completed a full scan of all vmas, holding mmap_sem + * throughout, and found no VM_MERGEABLE: so do the same as + * __ksm_exit does to remove this mm from all our lists now. + * This applies either when cleaning up after __ksm_exit + * (but beware: we can reach here even before __ksm_exit), + * or when all VM_MERGEABLE areas have been unmapped (and + * mmap_sem then protects against race with MADV_MERGEABLE). + */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + + /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; + if (slot != &ksm_mm_head) + goto next_mm; + + ksm_scan.seqnr++; + return NULL; +} + +/** + * ksm_do_scan - the ksm scanner main worker function. + * @scan_npages - number of pages we want to scan before we return. + */ +static void ksm_do_scan(unsigned int scan_npages) +{ + struct rmap_item *rmap_item; + struct page *page; + + while (scan_npages--) { + cond_resched(); + rmap_item = scan_get_next_rmap_item(&page); + if (!rmap_item) + return; + if (!PageKsm(page) || !in_stable_tree(rmap_item)) + cmp_and_merge_page(page, rmap_item); + else if (page_mapcount(page) == 1) { + /* + * Replace now-unshared ksm page by ordinary page. + */ + break_cow(rmap_item->mm, rmap_item->address); + remove_rmap_item_from_tree(rmap_item); + rmap_item->oldchecksum = calc_checksum(page); + } + put_page(page); + } +} + +static int ksmd_should_run(void) +{ + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); +} + +static int ksm_scan_thread(void *nothing) +{ + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&ksm_thread_mutex); + if (ksmd_should_run()) + ksm_do_scan(ksm_thread_pages_to_scan); + mutex_unlock(&ksm_thread_mutex); + + if (ksmd_should_run()) { + schedule_timeout_interruptible( + msecs_to_jiffies(ksm_thread_sleep_millisecs)); + } else { + wait_event_interruptible(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + /* + * Be somewhat over-protective for now! + */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return 0; /* just ignore the advice */ + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } + + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +int __ksm_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int needs_wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* Check ksm_run too? Would need tighter locking */ + needs_wakeup = list_empty(&ksm_mm_head.mm_list); + + spin_lock(&ksm_mmlist_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little; when fork is followed by immediate exec, we don't + * want ksmd to waste time setting up and tearing down an rmap_list. + */ + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); + + if (needs_wakeup) + wake_up_interruptible(&ksm_thread_wait); + + return 0; +} + +void __ksm_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int easy_to_free = 0; + + /* + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. + */ + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (list_empty(&mm_slot->rmap_list)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + +#define KSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define KSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_thread_sleep_millisecs = msecs; + + return count; +} +KSM_ATTR(sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); +} + +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err || nr_pages > UINT_MAX) + return -EINVAL; + + ksm_thread_pages_to_scan = nr_pages; + + return count; +} +KSM_ATTR(pages_to_scan); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = strict_strtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > KSM_RUN_UNMERGE) + return -EINVAL; + + /* + * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. + * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, + * breaking COW to free the unswappable pages_shared (but leaves + * mm_slots on the list for when ksmd may be set running again). + */ + + mutex_lock(&ksm_thread_mutex); + if (ksm_run != flags) { + ksm_run = flags; + if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; + err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } + } + mutex_unlock(&ksm_thread_mutex); + + if (flags & KSM_RUN_MERGE) + wake_up_interruptible(&ksm_thread_wait); + + return count; +} +KSM_ATTR(run); + +static ssize_t max_kernel_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err) + return -EINVAL; + + ksm_max_kernel_pages = nr_pages; + + return count; +} + +static ssize_t max_kernel_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_max_kernel_pages); +} +KSM_ATTR(max_kernel_pages); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_shared); +} +KSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_sharing); +} +KSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &max_kernel_pages_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, + NULL, +}; + +static struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + ksm_max_kernel_pages = totalram_pages / 4; + + err = ksm_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) + goto out_free1; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + printk(KERN_ERR "ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free2; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + printk(KERN_ERR "ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free2; + } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + + return 0; + +out_free2: + mm_slots_hash_free(); +out_free1: + ksm_slab_free(); +out: + return err; +} +module_init(ksm_init) diff --git a/mm/madvise.c b/mm/madvise.c index 76eb4193acdd..35b1479b7c9d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include <linux/mempolicy.h> #include <linux/hugetlb.h> #include <linux/sched.h> +#include <linux/ksm.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags; + unsigned long new_flags = vma->vm_flags; switch (behavior) { case MADV_NORMAL: @@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } new_flags &= ~VM_DONTCOPY; break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -207,41 +218,46 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; - switch (behavior) { - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - break; - } - case MADV_DONTFORK: - case MADV_NORMAL: - case MADV_SEQUENTIAL: - case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; case MADV_REMOVE: - error = madvise_remove(vma, prev, start, end); - break; - + return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - + return madvise_willneed(vma, prev, start, end); case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; - + return madvise_dontneed(vma, prev, start, end); default: - BUG(); - break; + return madvise_behavior(vma, prev, start, end, behavior); } - return error; } static int @@ -256,12 +272,17 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif return 1; default: return 0; } } + /* * The madvise(2) system call. * @@ -286,6 +307,12 @@ madvise_behavior_valid(int behavior) * so the kernel can free resources associated with it. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success @@ -307,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd4529d86de5..f99f5991d6bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/mutex.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/spinlock.h> @@ -43,6 +44,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ @@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #endif static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ +#define SOFTLIMIT_EVENTS_THRESH (1000) /* * Statistics for memory cgroup. @@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; @@ -78,6 +83,20 @@ struct mem_cgroup_stat { struct mem_cgroup_stat_cpu cpustat[0]; }; +static inline void +__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + stat->count[idx] = 0; +} + +static inline s64 +__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + return stat->count[idx]; +} + /* * For accounting under irq disable, no need for increment preempt count. */ @@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { unsigned long count[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *mem; /* Back pointer, we cannot */ + /* use container_of */ }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { }; /* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +/* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, @@ -186,6 +231,13 @@ struct mem_cgroup { struct mem_cgroup_stat stat; }; +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. + */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) + enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, @@ -200,13 +252,8 @@ enum charge_type { #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) #define PCGF_LOCK (1UL << PCG_LOCK) -static const unsigned long -pcg_default_flags[NR_CHARGE_TYPE] = { - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ - PCGF_USED | PCGF_LOCK, /* Anon */ - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ - 0, /* FORCE */ -}; +/* Not used, but added here for completeness */ +#define PCGF_ACCT (1UL << PCG_ACCT) /* for encoding cft->private value on file */ #define _MEM (0) @@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = { #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* + * Reclaim flags for mem_cgroup_hierarchical_reclaim + */ +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) +#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 +#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) + static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); +static struct mem_cgroup_per_zone * +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +{ + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +} + +static struct mem_cgroup_per_zone * +page_cgroup_zoneinfo(struct page_cgroup *pc) +{ + struct mem_cgroup *mem = pc->mem_cgroup; + int nid = page_cgroup_nid(pc); + int zid = page_cgroup_zid(pc); + + if (!mem) + return NULL; + + return mem_cgroup_zoneinfo(mem, nid, zid); +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(mem, mz, mctz); + spin_unlock(&mctz->lock); +} + +static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) +{ + bool ret = false; + int cpu; + s64 val; + struct mem_cgroup_stat_cpu *cpustat; + + cpu = get_cpu(); + cpustat = &mem->stat.cpustat[cpu]; + val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); + if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { + __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); + ret = true; + } + put_cpu(); + return ret; +} + +static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; mem; mem = parent_mem_cgroup(mem)) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + excess = res_counter_soft_limit_excess(&mem->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mem, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node_state(node, N_POSSIBLE) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(mem, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(mem, mz, mctz); + } + } +} + +static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) +{ + return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->mem->res) || + !css_tryget(&mz->mem->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, + bool charge) +{ + int val = (charge) ? 1 : -1; + struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu = get_cpu(); + + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); + put_cpu(); +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, bool charge) { - int val = (charge)? 1 : -1; + int val = (charge) ? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat_cpu *cpustat; int cpu = get_cpu(); @@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, else __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); put_cpu(); } -static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) -{ - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; -} - -static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) -{ - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; - - return mem_cgroup_zoneinfo(mem, nid, zid); -} - static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { @@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, return ret; } +static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +{ + return (mem == root_mem_cgroup); +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; - struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ - if (list_empty(&pc->lru) || !pc->mem_cgroup) + if (!TestClearPageCgroupAcctLRU(pc)) return; + VM_BUG_ON(!pc->mem_cgroup); /* * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); return; } @@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) * For making pc->mem_cgroup visible, insert smp_rmb() here. */ smp_rmb(); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * Used bit is set without atomic ops but after smp_wmb(). * For making pc->mem_cgroup visible, insert smp_rmb() here. @@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; + SetPageCgroupAcctLRU(pc); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; list_add(&pc->lru, &mz->lists[lru]); } @@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && list_empty(&pc->lru)) + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) mem_cgroup_add_lru_list(page, page_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * !!file + !!active; + int lru = LRU_FILE * file + active; int ret; BUG_ON(!mem_cont); @@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap, bool shrink) + struct zone *zone, + gfp_t gfp_mask, + unsigned long reclaim_options) { struct mem_cgroup *victim; int ret, total = 0; int loop = 0; + bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; + bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; + bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + unsigned long excess = mem_cgroup_get_excess(root_mem); /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) noswap = true; - while (loop < 2) { + while (1) { victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) + if (victim == root_mem) { loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!check_soft || !total) { + css_put(&victim->css); + break; + } + /* + * We want to do more targetted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { + css_put(&victim->css); + break; + } + } + } if (!mem_cgroup_local_usage(&victim->stat)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, - get_swappiness(victim)); + if (check_soft) + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, + noswap, get_swappiness(victim), zone, + zone->zone_pgdat->node_id); + else + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, + noswap, get_swappiness(victim)); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (shrink) return ret; total += ret; - if (mem_cgroup_check_under_limit(root_mem)) + if (check_soft) { + if (res_counter_check_under_soft_limit(&root_mem->res)) + return total; + } else if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; } return total; @@ -965,7 +1264,7 @@ done: */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom) + bool oom, struct page *page) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, VM_BUG_ON(css_is_removed(&mem->css)); while (1) { - int ret; - bool noswap = false; + int ret = 0; + unsigned long flags = 0; + if (mem_cgroup_is_root(mem)) + goto done; ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) @@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, break; /* mem+swap counter fails */ res_counter_uncharge(&mem->res, PAGE_SIZE); - noswap = true; + flags |= MEM_CGROUP_RECLAIM_NOSWAP; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); } else @@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) goto nomem; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap, false); + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); if (ret) continue; @@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; } } + /* + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); +done: return 0; nomem: css_put(&mem->css); return -ENOMEM; } - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); return; } + pc->mem_cgroup = mem; + /* + * We access a page_cgroup asynchronously without lock_page_cgroup(). + * Especially when a page_cgroup is taken from a page, pc->mem_cgroup + * is accessed after testing USED bit. To make pc->mem_cgroup visible + * before USED bit, we need memory barrier here. + * See mem_cgroup_add_lru_list(), etc. + */ smp_wmb(); - pc->flags = pcg_default_flags[ctype]; + switch (ctype) { + case MEM_CGROUP_CHARGE_TYPE_CACHE: + case MEM_CGROUP_CHARGE_TYPE_SHMEM: + SetPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + case MEM_CGROUP_CHARGE_TYPE_MAPPED: + ClearPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + default: + break; + } mem_cgroup_charge_statistics(mem, pc, true); @@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - res_counter_uncharge(&from->res, PAGE_SIZE); + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); page = pc->page; @@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, 1); } - if (do_swap_account) + if (do_swap_account && !mem_cgroup_is_root(from)) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); if (ret || !parent) return ret; @@ -1268,9 +1598,11 @@ uncharge: /* drop extra refcnt by try_charge() */ css_put(&parent->css); /* uncharge if move fails */ - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(parent)) { + res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + } return ret; } @@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); if (ret || !mem) return ret; @@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, page); } static void @@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); } @@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) css_put(&mem->css); @@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. * This memcg can be obsolete one. We avoid calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + page); css_put(&mem->css); } *ptr = mem; @@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false, true); + progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, + GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, int nid, + int zid) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(nid, zid); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, + gfp_mask, + MEM_CGROUP_RECLAIM_SOFT); + nr_reclaimed += reclaimed; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) { + css_put(&next_mz->mem->css); + next_mz = NULL; + } else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->mem->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->mem->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->mem->css); + return nr_reclaimed; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -2046,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } +struct mem_cgroup_idx_data { + s64 val; + enum mem_cgroup_stat_index idx; +}; + +static int +mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_idx_data *d = data; + d->val += mem_cgroup_read_stat(&mem->stat, d->idx); + return 0; +} + +static void +mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx, s64 *val) +{ + struct mem_cgroup_idx_data d; + d.idx = idx; + d.val = 0; + mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); + *val = d.val; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 val = 0; + u64 idx_val, val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - val = res_counter_read_u64(&mem->res, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - val = res_counter_read_u64(&mem->memsw, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, name = MEMFILE_ATTR(cft->private); switch (name) { case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); if (ret) @@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else ret = mem_cgroup_resize_memsw_limit(memcg, val); break; + case RES_SOFT_LIMIT: + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + /* + * For memsw, soft limits are hard to implement in terms + * of semantics, for now, we support soft limits for + * control without swap + */ + if (type == _MEM) + ret = res_counter_set_soft_limit(&memcg->res, val); + else + ret = -EINVAL; + break; default: ret = -EINVAL; /* should be BUG() ? */ break; @@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) res_counter_reset_failcnt(&mem->memsw); break; } + return 0; } @@ -2160,6 +2663,7 @@ enum { MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, + MCS_SWAP, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -2181,6 +2685,7 @@ struct { {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, + {"swap", "total_swap"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; + if (do_swap_account) { + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); + s->stat[MCS_SWAP] += val * PAGE_SIZE; + } /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + } /* Hierarchical information */ { @@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_total_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); - + } #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read, }, { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, @@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); + mz->usage_in_excess = 0; + mz->on_tree = false; + mz->mem = mem; } return 0; } @@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + mem_cgroup_remove_from_trees(mem); free_css_id(&mem_cgroup_subsys, &mem->css); for_each_node_state(node, N_POSSIBLE) @@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void) } #endif +static int mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node_state(node, N_POSSIBLE) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + if (!rtpn) + return 1; + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } + return 0; +} + static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* root ? */ if (cont->parent == NULL) { enable_swap_cgroup(); parent = NULL; + root_mem_cgroup = mem; + if (mem_cgroup_soft_limit_tree_init()) + goto free_out; + } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); + root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p) + struct task_struct *p, + bool threadgroup) { mutex_lock(&memcg_tasklist); /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 000000000000..729d4b15b645 --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,832 @@ +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * The operation to map back from RMAP chains to processes has to walk + * the complete process list and has non linear complexity with the number + * mappings. In short it can be quite slow. But since memory corruptions + * are rare we hope to get away with this. + */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#define DEBUG 1 /* remove me in 2.6.34 */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/sched.h> +#include <linux/rmap.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/backing-dev.h> +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + +/* + * Send all the processes who have the page mapped an ``action optional'' + * signal. + */ +static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn) +{ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_MCEERR_AO; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + si.si_addr_lsb = PAGE_SHIFT; + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully noone will do that? + */ + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + unsigned addr_valid:1; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? + */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmaped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + pr_debug("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only do anything when DOIT is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when FAIL is set do a force kill because something went + * wrong earlier. + */ +static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, + int fail, unsigned long pfn) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (doit) { + /* + * In case something went wrong with munmaping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + * the signal handlers + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. + */ + else if (kill_proc_ao(tk->tsk, tk->addr, trapno, + pfn) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +static int task_early_kill(struct task_struct *tsk) +{ + if (!tsk->mm) + return 0; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +} + +/* + * Collect processes when the error hit an anonymous page. + */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + + read_lock(&tasklist_lock); + av = page_lock_anon_vma(page); + if (av == NULL) /* Not actually mapped anymore */ + goto out; + for_each_process (tsk) { + if (!task_early_kill(tsk)) + continue; + list_for_each_entry (vma, &av->head, anon_vma_node) { + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + page_unlock_anon_vma(av); +out: + read_unlock(&tasklist_lock); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct prio_tree_iter iter; + struct address_space *mapping = page->mapping; + + /* + * A note on the locking order between the two locks. + * We don't rely on this particular order. + * If you have some other code that needs a different order + * feel free to switch them around. Or add a reverse link + * from mm_struct to task_struct, then this could be all + * done without taking tasklist_lock and looping over all tasks. + */ + + read_lock(&tasklist_lock); + spin_lock(&mapping->i_mmap_lock); + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!task_early_kill(tsk)) + continue; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped it in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + spin_unlock(&mapping->i_mmap_lock); + read_unlock(&tasklist_lock); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk); + else + collect_procs_file(page, tokill, &tk); + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + FAILED, /* Error handling failed */ + DELAYED, /* Will be handled later */ + IGNORED, /* Error safely ignored */ + RECOVERED, /* Successfully recovered */ +}; + +static const char *action_name[] = { + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [IGNORED] = "Ignored", + [RECOVERED] = "Recovered", +}; + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Already poisoned page. + */ +static int me_ignore(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Free memory + */ +static int me_free(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + if (!isolate_lru_page(p)) + page_cache_release(p); + + /* + * For anonymous pages we're done the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been teared down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_debug("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty cache page page + * Issues: when the error hit a hole page the error is not properly + * propagated. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. + * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_AIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped inbetween then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. + */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = DELAYED; + } + + return ret; +} + +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = RECOVERED; + } + delete_from_swap_cache(p); + return ret; +} + +/* + * Huge pages. Needs work. + * Issues: + * No rmap support so we cannot find the original mapper. In theory could walk + * all MMs and look for the mappings, but that would be non atomic and racy. + * Need rmap for hugepages for this. Alternatively we could employ a heuristic, + * like just walking the current process and hoping it has it mapped (that + * should be usually true for the common "shared database cache" case) + * Should handle free huge pages and dequeue them too, but this needs to + * handle huge page accounting correctly. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + return FAILED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremly careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. + */ + +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define buddy (1UL << PG_buddy) +#define reserved (1UL << PG_reserved) + +static struct page_state { + unsigned long mask; + unsigned long res; + char *msg; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, "reserved kernel", me_ignore }, + { buddy, buddy, "free kernel", me_free }, + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, "kernel slab", me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, "huge", me_huge_page }, + { tail, tail, "huge", me_huge_page }, +#else + { compound, compound, "huge", me_huge_page }, +#endif + + { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "swapcache", me_swapcache_clean }, + + { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, + { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT + { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "mlocked LRU", me_pagecache_clean }, +#endif + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. + */ + { 0, 0, "unknown page state", me_unknown }, +}; + +#undef lru + +static void action_result(unsigned long pfn, char *msg, int result) +{ + struct page *page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", + pfn, + page && PageDirty(page) ? "dirty " : "", + msg, action_name[result]); +} + +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn, int ref) +{ + int result; + + result = ps->action(p, pfn); + action_result(pfn, ps->msg, result); + if (page_count(p) != 1 + ref) + printk(KERN_ERR + "MCE %#lx: %s page still referenced by %d users\n", + pfn, ps->msg, page_count(p) - 1); + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return result == RECOVERED ? 0 : -EBUSY; +} + +#define N_UNMAP_TRIES 5 + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static void hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int i; + int kill = 1; + + if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + return; + + if (!PageLRU(p)) + lru_add_drain_all(); + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(p)) + return; + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + */ + mapping = page_mapping(p); + if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(p)) { + SetPageDirty(p); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(p, &tokill); + + /* + * try_to_unmap can fail temporarily due to races. + * Try a few times (RED-PEN better strategy?) + */ + for (i = 0; i < N_UNMAP_TRIES; i++) { + ret = try_to_unmap(p, ttu); + if (ret == SWAP_SUCCESS) + break; + pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); + } + + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(p)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty, otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. + */ + kill_procs_ao(&tokill, !!PageDirty(p), trapno, + ret != SWAP_SUCCESS, pfn); +} + +int __memory_failure(unsigned long pfn, int trapno, int ref) +{ + struct page_state *ps; + struct page *p; + int res; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + action_result(pfn, "memory outside kernel control", IGNORED); + return -EIO; + } + + p = pfn_to_page(pfn); + if (TestSetPageHWPoison(p)) { + action_result(pfn, "already hardware poisoned", IGNORED); + return 0; + } + + atomic_long_add(1, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!get_page_unless_zero(compound_head(p))) { + action_result(pfn, "free or high order kernel", IGNORED); + return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + } + + /* + * Lock the page and wait for writeback to finish. + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + lock_page_nosync(p); + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + */ + hwpoison_user_mappings(p, pfn, trapno); + + /* + * Torn down by someone else? + */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, "already truncated LRU", IGNORED); + res = 0; + goto out; + } + + res = -EBUSY; + for (ps = error_states;; ps++) { + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn, ref); + break; + } + } +out: + unlock_page(p); + return res; +} +EXPORT_SYMBOL_GPL(__memory_failure); + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +void memory_failure(unsigned long pfn, int trapno) +{ + __memory_failure(pfn, trapno, 0); +} diff --git a/mm/memory.c b/mm/memory.c index aede2ce3aba4..7e91b5f9f690 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -45,6 +45,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/module.h> #include <linux/delayacct.h> @@ -56,6 +57,7 @@ #include <linux/swapops.h> #include <linux/elf.h> +#include <asm/io.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); +unsigned long zero_pfn __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +core_initcall(init_zero_pfn); /* * If a p?d_bad entry is found while walking page tables, report @@ -283,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); @@ -442,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef is_zero_pfn +static inline int is_zero_pfn(unsigned long pfn) +{ + return pfn == zero_pfn; +} +#endif + +#ifndef my_zero_pfn +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return zero_pfn; +} +#endif + /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -497,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; - if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -519,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -596,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page, vma, addr); - rss[!!PageAnon(page)]++; + page_dup_rmap(page); + rss[PageAnon(page)]++; } out_set_pte: @@ -1142,9 +1175,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; + page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) - goto bad_page; + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } if (flags & FOLL_GET) get_page(page); @@ -1172,65 +1210,46 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; - /* Fall through to ZERO_PAGE handling */ + no_page_table: /* * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(0); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); return page; } -/* Can we do the FOLL_ANON optimization? */ -static inline int use_zero_page(struct vm_area_struct *vma) -{ - /* - * We don't want to optimize FOLL_ANON for make_pages_present() - * when it tries to page in a VM_LOCKED region. As to VM_SHARED, - * we want to get the page from the page tables to make sure - * that we serialize and update with any other user of that - * mapping. - */ - if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) - return 0; - /* - * And if we have a fault routine, it's not an anonymous region. - */ - return !vma->vm_ops || !vma->vm_ops->fault; -} - - - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int flags, + unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags = 0; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); - int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); + unsigned long vm_flags; if (nr_pages <= 0) return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + /* * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (gup_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (gup_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; - unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -1242,7 +1261,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_t *pte; /* user gate pages are read-only */ - if (!ignore && write) + if (gup_flags & FOLL_WRITE) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1276,38 +1295,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) + !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, write); + &start, &nr_pages, i, gup_flags); continue; } - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && use_zero_page(vma)) - foll_flags |= FOLL_ANON; - do { struct page *page; + unsigned int foll_flags = gup_flags; /* * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory, unless - * current is handling munlock--e.g., on exit. In - * that case, we are not allocating memory. Rather, - * we're only unlocking already resident/mapped pages. + * pages and potentially allocating memory. */ - if (unlikely(!ignore_sigkill && - fatal_signal_pending(current))) + if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; - if (write) - foll_flags |= FOLL_WRITE; - cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; @@ -1319,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) + if (ret & + (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -1418,18 +1426,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; + int flags = FOLL_TOUCH; + if (pages) + flags |= FOLL_GET; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } - EXPORT_SYMBOL(get_user_pages); +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { @@ -1607,7 +1644,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather - * than insert_pfn). + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. */ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { struct page *page; @@ -1973,7 +2011,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page)) { + if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -2074,10 +2112,19 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - VM_BUG_ON(old_page == ZERO_PAGE(0)); - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; + + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. @@ -2087,8 +2134,6 @@ gotten: clear_page_mlock(old_page); unlock_page(old_page); } - cow_user_page(new_page, old_page, address, vma); - __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2114,9 +2159,14 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, page_table, entry); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. + */ + set_pte_at_notify(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { /* @@ -2359,7 +2409,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2410,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; @@ -2511,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_OOM; + } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); @@ -2536,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + } else if (PageHWPoison(page)) { + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; } lock_page(page); @@ -2624,6 +2628,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + if (!(flags & FAULT_FLAG_WRITE)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + vma->vm_page_prot)); + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (!pte_none(*page_table)) + goto unlock; + goto setpte; + } + /* Allocate our own private page. */ pte_unmap(page_table); @@ -2638,13 +2652,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; + inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); +setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ @@ -2699,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + return VM_FAULT_HWPOISON; + } + /* * For consistency in subsequent calls, make the faulted page always * locked. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4412a676c88..821dee596377 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages); void online_page(struct page *page) { + unsigned long pfn = page_to_pfn(page); + totalram_pages++; - num_physpages++; + if (pfn >= num_physpages) + num_physpages = pfn + 1; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -410,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, + ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { printk(KERN_DEBUG "online_pages %lx at %lx failed\n", @@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + zone_pcp_update(zone); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -701,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, static void offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, + walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, offline_isolated_pages_cb); } @@ -727,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) long offlined = 0; int ret; - ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, check_pages_isolated_cb); if (ret < 0) offlined = (long)ret; @@ -831,7 +835,6 @@ repeat: zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; - num_physpages -= offlined_pages; setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); diff --git a/mm/mempool.c b/mm/mempool.c index 32e75d400503..1a3bc3d4d554 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) } EXPORT_SYMBOL(mempool_kmalloc); -void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kzalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kzalloc); - void mempool_kfree(void *element, void *pool_data) { kfree(element); diff --git a/mm/migrate.c b/mm/migrate.c index 939888f9ddab..1a4bf4813780 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); count++; } @@ -147,7 +149,7 @@ out: static void remove_file_migration_ptes(struct page *old, struct page *new) { struct vm_area_struct *vma; - struct address_space *mapping = page_mapping(new); + struct address_space *mapping = new->mapping; struct prio_tree_iter iter; pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!page_has_private(page); + expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - + if (PageSwapBacked(page)) { + __dec_zone_page_state(page, NR_SHMEM); + __inc_zone_page_state(newpage, NR_SHMEM); + } spin_unlock_irq(&mapping->tree_lock); return 0; @@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * needs to be effective. */ try_to_free_buffers(page); + goto rcu_unlock; } - goto rcu_unlock; + goto skip_unmap; } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, 1); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); +skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page); @@ -693,6 +700,8 @@ unlock: * restored. */ list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); } @@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc; + unsigned long flags; + + local_irq_save(flags); + list_for_each_entry(page, from, lru) + __inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + local_irq_restore(flags); if (!swapwrite) current->flags |= PF_SWAPWRITE; diff --git a/mm/mlock.c b/mm/mlock.c index 45eb650b9654..bd6f0e466f6c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page) } /** - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address - * @mlock: 0 indicate munlock, otherwise mlock. * - * If @mlock == 0, unlock an mlocked range; - * else mlock the range of pages. This takes care of making the pages present , - * too. + * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; int ret = 0; - int gup_flags = 0; + int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON(start < vma->vm_start); VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && - (atomic_read(&mm->mm_users) != 0)); - - /* - * mlock: don't page populate if vma has PROT_NONE permission. - * munlock: always do munlock although the vma has PROT_NONE - * permission, or SIGKILL is pending. - */ - if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | - GUP_FLAGS_IGNORE_SIGKILL; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + gup_flags = FOLL_TOUCH | FOLL_GET; if (vma->vm_flags & VM_WRITE) - gup_flags |= GUP_FLAGS_WRITE; + gup_flags |= FOLL_WRITE; while (nr_pages > 0) { int i; @@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, * This can happen for, e.g., VM_NONLINEAR regions before * a page has been allocated and mapped at a given offset, * or for addresses that map beyond end of a file. - * We'll mlock the the pages if/when they get faulted in. + * We'll mlock the pages if/when they get faulted in. */ if (ret < 0) break; - if (ret == 0) { - /* - * We know the vma is there, so the only time - * we cannot get a single page should be an - * error (ret < 0) case. - */ - WARN_ON(1); - break; - } lru_add_drain(); /* push cached pages to LRU */ for (i = 0; i < ret; i++) { struct page *page = pages[i]; - lock_page(page); - /* - * Because we lock page here and migration is blocked - * by the elevated reference, we need only check for - * page truncation (file-cache only). - */ if (page->mapping) { - if (mlock) + /* + * That preliminary check is mainly to avoid + * the pointless overhead of lock_page on the + * ZERO_PAGE: which might bounce very badly if + * there is contention. However, we're still + * dirtying its cacheline with get/put_page: + * we'll add another __get_user_pages flag to + * avoid it if that case turns out to matter. + */ + lock_page(page); + /* + * Because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) mlock_vma_page(page); - else - munlock_vma_page(page); + unlock_page(page); } - unlock_page(page); - put_page(page); /* ref from get_user_pages() */ - - /* - * here we assume that get_user_pages() has given us - * a list of virtually contiguous pages. - */ - addr += PAGE_SIZE; /* for next get_user_pages() */ - nr_pages--; + put_page(page); /* ref from get_user_pages() */ } + + addr += ret * PAGE_SIZE; + nr_pages -= ret; ret = 0; } - return ret; /* count entire vma as locked_vm */ + return ret; /* 0 or negative error code */ } /* @@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end); /* Hide errors from mmap() and other callers */ return 0; @@ -310,7 +291,6 @@ no_mlock: return nr_pages; /* error or pages NOT mlocked */ } - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -330,10 +310,38 @@ no_mlock: * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end) { + unsigned long addr; + + lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - __mlock_vma_pages_range(vma, start, end, 0); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page; + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (page && !IS_ERR(page)) { + lock_page(page); + /* + * Like in __mlock_vma_pages_range(), + * because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } + cond_resched(); + } } /* @@ -400,18 +408,14 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - vma->vm_flags = newflags; if (lock) { - ret = __mlock_vma_pages_range(vma, start, end, 1); - - if (ret > 0) { - mm->locked_vm -= ret; - ret = 0; - } else - ret = __mlock_posix_error_return(ret); /* translate if needed */ + vma->vm_flags = newflags; + ret = __mlock_vma_pages_range(vma, start, end); + if (ret < 0) + ret = __mlock_posix_error_return(ret); } else { - __mlock_vma_pages_range(vma, start, end, 0); + munlock_vma_pages_range(vma, start, end); } out: diff --git a/mm/mmap.c b/mm/mmap.c index 8101de490c73..73f5e4b64010 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,7 +28,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end); /* * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? + * anon_vma lock. */ - if (vma->anon_vma) + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) anon_vma = vma->anon_vma; if (anon_vma) { spin_lock(&anon_vma->lock); @@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } -/* Flags that can be inherited from an existing mapping when merging */ -#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) + /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ + if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) return 0; if (vma->vm_file != file) return 0; @@ -905,7 +903,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, #endif /* CONFIG_PROC_FS */ /* - * The caller must hold down_write(current->mm->mmap_sem). + * The caller must hold down_write(¤t->mm->mmap_sem). */ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, @@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + if (file) + return -EINVAL; + + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ @@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (flags & MAP_LOCKED) { + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; - vm_flags |= VM_LOCKED; - } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { @@ -1195,21 +1209,21 @@ munmap_back: goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - addr = vma->vm_start; - pgoff = vma->vm_pgoff; - vm_flags = vma->vm_flags; - if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); @@ -1220,7 +1234,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); @@ -2267,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; @@ -2308,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; } diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 000000000000..ded9081f4021 --- /dev/null +++ b/mm/mmu_context.c @@ -0,0 +1,58 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/sched.h> + +#include <asm/mmu_context.h> + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * Called by the retry thread execute retries within the + * iocb issuer's mm context, so that copy_from/to_user + * operations work seamlessly for aio. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); + + if (active_mm != mm) + mmdrop(active_mm); +} + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5f4ef0250bee..7e33f2cb3c77 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, + pte_t pte) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->change_pte) + mn->ops->change_pte(mn, mm, address, pte); + /* + * Some drivers don't have change_pte, + * so we must call invalidate_page in that case. + */ + else if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311baeb2d..8bc969d8112d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,7 +23,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); + perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b91be46..97bff2547719 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -11,6 +11,7 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/shm.h> +#include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> @@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; @@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; + int err; /* * We'd prefer to avoid failure later on in do_munmap: @@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * Advise KSM to break any KSM pages in the area to be moved: + * it would be confusing if they were to turn up at the new + * location, where they happen to coincide with different KSM + * pages recently unmapped. But leave vma->vm_flags as it was, + * so KSM can come around to merge on vma and new_vma afterwards. + */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) diff --git a/mm/nommu.c b/mm/nommu.c index 66e81e7e9fe9..5189b5aed8c0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,6 +33,7 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include "internal.h" static inline __attribute__((format(printf, 1, 2))) @@ -56,12 +57,11 @@ void no_printk(const char *fmt, ...) no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif -#include "internal.h" - void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; +unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ @@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - -/* * Return the total memory allocated for this pointer, not * just what the caller asked for. * @@ -170,21 +130,20 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int flags, + unsigned long start, int nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. - * - if 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); @@ -192,8 +151,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto finish_or_fault; /* protect what we can, including chardevs */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - (!ignore && !(vm_flags & vma->vm_flags))) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) goto finish_or_fault; if (pages) { @@ -212,7 +171,6 @@ finish_or_fault: return i ? : -EFAULT; } - /* * get a list of pages in an address range belonging to the specified process * and indicate the VMA that covers each page @@ -227,9 +185,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int flags = 0; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } @@ -627,6 +585,22 @@ static void put_nommu_region(struct vm_region *region) } /* + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous * page @@ -645,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; + protect_vma(vma, vma->vm_flags); + /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -707,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) kenter("%p", vma); + protect_vma(vma, 0); + mm->map_count--; if (mm->mmap_cache == vma) mm->mmap_cache = NULL; @@ -848,7 +826,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -1056,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1073,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1084,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1203,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1215,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1328,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1352,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; - add_nommu_region(region); - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; + goto error_just_free; + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1378,19 +1359,6 @@ share: kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922b..ea2147dabba6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks; static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ +/* + * Is all threads of the target process nodes overlap ours? + */ +static int has_intersects_mems_allowed(struct task_struct *tsk) +{ + struct task_struct *t; + + t = tsk; + do { + if (cpuset_mems_allowed_intersects(current, t)) + return 1; + t = next_thread(t); + } while (t != tsk); + + return 0; +} + /** * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate @@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj = p->signal->oom_adj; + struct task_cputime task_time; + unsigned long utime; + unsigned long stime; + + if (oom_adj == OOM_DISABLE) + return 0; task_lock(p); mm = p->mm; @@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * swapoff can easily use up all memory, so kill those first. */ - if (p->flags & PF_SWAPOFF) + if (p->flags & PF_OOM_ORIGIN) return ULONG_MAX; /* @@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * of seconds. There is no particular reason for this other than * that it turned out to work very well in practice. */ - cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) - >> (SHIFT_HZ + 3); + thread_group_cputime(p, &task_time); + utime = cputime_to_jiffies(task_time.utime); + stime = cputime_to_jiffies(task_time.stime); + cpu_time = (utime + stime) >> (SHIFT_HZ + 3); + if (uptime >= p->start_time.tv_sec) run_time = (uptime - p->start_time.tv_sec) >> 10; @@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. */ - if (!cpuset_mems_allowed_intersects(current, p)) + if (!has_intersects_mems_allowed(p)) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, static struct task_struct *select_bad_process(unsigned long *ppoints, struct mem_cgroup *mem) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; struct timespec uptime; *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + for_each_process(p) { unsigned long points; /* @@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + if (p->signal->oom_adj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); @@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, chosen = p; *ppoints = points; } - } while_each_thread(g, p); + } return chosen; } @@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); @@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose) static int oom_kill_task(struct task_struct *p) { - struct mm_struct *mm; - struct task_struct *g, *q; - - mm = p->mm; - /* WARNING: mm may not be dereferenced since we did not obtain its * value from get_task_mm(p). This is OK since all we need to do is * compare mm to q->mm below. @@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p) * change to NULL at any time since we do not hold task_lock(p). * However, this is of no concern to us. */ - - if (mm == NULL) + if (!p->mm || p->signal->oom_adj == OOM_DISABLE) return 1; - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) - return 1; - } while_each_thread(g, q); - __oom_kill_task(p, 1); - /* - * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group. Don't let them have access - * to memory reserves though, otherwise we might deplete all memory. - */ - do_each_thread(g, q) { - if (q->mm == mm && !same_thread_group(q, p)) - force_sig(SIGKILL, q); - } while_each_thread(g, q); - return 0; } @@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, if (printk_ratelimit()) { printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->signal->oom_adj); task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 25e7770309b8..2c5d79236ead 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,18 +44,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -155,37 +158,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -195,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -315,7 +318,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -327,7 +330,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); return ret; } @@ -339,14 +342,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) if (max_ratio > 100) return -EINVAL; - spin_lock(&bdi_lock); + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock(&bdi_lock); + spin_unlock_bh(&bdi_lock); return ret; } @@ -380,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -404,7 +408,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -473,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -484,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); + unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -561,7 +566,16 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - schedule_timeout(1); + __set_current_state(TASK_INTERRUPTIBLE); + io_schedule_timeout(pause); + + /* + * Increase the delay for each loop, up to our previous + * default of taking a 100ms nap. + */ + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -569,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -580,18 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) - > background_thresh))) { - struct writeback_control wbc = { - .bdi = bdi, - .sync_mode = WB_SYNC_NONE, - .nr_to_write = nr_writeback, - }; - - - bdi_start_writeback(&wbc); - } + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -604,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -621,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -634,12 +641,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); @@ -683,9 +691,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); return 0; } @@ -1019,12 +1027,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) return 0; - wbc->for_writepages = 1; if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); - wbc->for_writepages = 0; return ret; } @@ -1148,6 +1154,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. + * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0de15f46987..bf720550b44d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <trace/events/kmem.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = { int min_free_kbytes = 1024; -unsigned long __meminitdata nr_kernel_pages; -unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP @@ -234,6 +234,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -510,7 +516,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a list of pages. + * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -520,22 +526,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pages_bulk(struct zone *zone, int count, - struct list_head *list, int order) +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp) { + int migratetype = 0; + int batch_free = 0; + spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); - while (count--) { + __mod_zone_page_state(zone, NR_FREE_PAGES, count); + while (count) { struct page *page; + struct list_head *list; - VM_BUG_ON(list_empty(list)); - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_one_page list manipulates */ - list_del(&page->lru); - __free_one_page(page, zone, order, page_private(page)); + /* + * Remove pages from lists in a round-robin fashion. A + * batch_free count is maintained that is incremented when an + * empty list is encountered. This is so more pages are freed + * off fuller lists instead of spinning excessively around empty + * lists + */ + do { + batch_free++; + if (++migratetype == MIGRATE_PCPTYPES) + migratetype = 0; + list = &pcp->lists[migratetype]; + } while (list_empty(list)); + + do { + page = list_entry(list->prev, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, 0, migratetype); + trace_mm_page_pcpu_drain(page, 0, migratetype); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -557,7 +583,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int wasMlocked = TestClearPageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -646,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -655,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); @@ -783,6 +821,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -836,11 +885,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } @@ -874,6 +928,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -934,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; local_irq_restore(flags); } @@ -960,7 +1015,7 @@ static void drain_pages(unsigned int cpu) pcp = &pset->pcp; local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); } @@ -1026,7 +1081,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); + int migratetype; + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1043,35 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; - set_page_private(page, get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_page_private(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(migratetype == MIGRATE_ISOLATE)) { + free_one_page(zone, page, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + if (cold) - list_add_tail(&page->lru, &pcp->list); + list_add_tail(&page->lru, &pcp->lists[migratetype]); else - list_add(&page->lru, &pcp->list); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->batch, pcp); pcp->count -= pcp->batch; } + +out: local_irq_restore(flags); put_cpu(); } void free_hot_page(struct page *page) { + trace_mm_page_free_direct(page, 0); free_hot_cold_page(page, 0); } -void free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -1119,35 +1189,23 @@ again: cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; + struct list_head *list; pcp = &zone_pcp(zone, cpu)->pcp; + list = &pcp->lists[migratetype]; local_irq_save(flags); - if (!pcp->count) { - pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, migratetype, cold); - if (unlikely(!pcp->count)) + if (unlikely(list_empty(list))) goto failed; } - /* Find a page of the appropriate migrate type */ - if (cold) { - list_for_each_entry_reverse(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } else { - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } - - /* Allocate more to the pcp list if necessary */ - if (unlikely(&page->lru == &pcp->list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, - migratetype, cold); - page = list_entry(pcp->list.next, struct page, lru); - } + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); list_del(&page->lru); pcp->count--; @@ -1627,10 +1685,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -1765,6 +1819,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, wake_all_kswapd(order, zonelist, high_zoneidx); +restart: /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according @@ -1772,7 +1827,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -1907,6 +1961,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1916,44 +1971,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { + trace_mm_page_free_direct(page, order); if (order == 0) free_hot_page(page); else @@ -2128,23 +2180,28 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), + nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2162,7 +2219,21 @@ void show_free_areas(void) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " shmem:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -2176,7 +2247,22 @@ void show_free_areas(void) K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") ); @@ -2305,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2314,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) return ret; if (write) { @@ -2783,7 +2869,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn; struct page *page; - unsigned long reserve, block_migratetype; + unsigned long block_migratetype; + int reserve; /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; @@ -2791,6 +2878,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; + /* + * Reserve blocks are generally in place to help high-order atomic + * allocations that are short-lived. A min_free_kbytes value that + * would result in more than 2 reserve blocks for atomic allocations + * is assumed to be in place to help anti-fragmentation for the + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; @@ -2961,6 +3057,7 @@ static int zone_batchsize(struct zone *zone) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; + int migratetype; memset(p, 0, sizeof(*p)); @@ -2968,7 +3065,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&pcp->lists[migratetype]); } /* @@ -3146,6 +3244,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } +static int __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = zone_pcp(zone, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + static __meminit void zone_pcp_init(struct zone *zone) { int cpu; @@ -3720,7 +3844,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_saved_scan = 0; + zone->reclaim_stat.nr_saved_scan[l] = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -4509,7 +4633,7 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } -/** +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4600,9 +4724,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4610,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4626,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4652,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4666,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { @@ -4732,7 +4856,14 @@ void *__init alloc_large_system_hash(const char *tablename, numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -4874,13 +5005,16 @@ int set_migratetype_isolate(struct page *page) struct zone *zone; unsigned long flags; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); /* * In future, more migrate types will be able to be isolation target. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && + zone_idx != ZONE_MOVABLE) goto out; set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4ebbd8dc..3d535d594826 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; VM_BUG_ON(!slab_is_available()); - base = kmalloc_node(table_size, + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); + } } else { /* * We don't have to allocate page_cgroup again, but diff --git a/mm/percpu.c b/mm/percpu.c index 3311c8919f37..6af78c1ee704 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of nr_cpu_ids units and the first chunk is used - * for static percpu variables in the kernel image (special boot time - * alloc/init handling necessary as these areas need to be brought up - * before allocation services are running). Unit grows as necessary - * and all units grow or shrink in unison. When a chunk is filled up, - * another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -43,7 +46,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -55,7 +58,9 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> +#include <linux/err.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> @@ -89,25 +94,38 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ + void *base_addr; /* base address of this chunk */ int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct vm_struct **vms; /* mapped vmalloc regions */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_units __read_mostly; +static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ + +/* group information, used for vm allocation */ +static int pcpu_nr_groups __read_mostly; +static const unsigned long *pcpu_group_offsets __read_mostly; +static const size_t *pcpu_group_sizes __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; -} - -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - /* - * Any possible cpu id can be used here, so there's no need to - * worry about preemption or cpu hotplug. - */ - return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), - page_idx) != NULL; + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - void *first_start = pcpu_first_chunk->vm->addr; + void *first_start = pcpu_first_chunk->base_addr; /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + if (addr >= first_start && addr < first_start + pcpu_unit_size) { /* is it in the reserved area? */ if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; @@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += raw_smp_processor_id() * pcpu_unit_size; + addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush_tlb: whether to flush tlb or not + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush_tlb) +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) { - unsigned int last = nr_cpu_ids - 1; - unsigned int cpu; + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + *bitmapp = bitmap; + return pages; +} - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush_tlb) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } } /** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. - * - * CONTEXT: - * pcpu_alloc_mutex. + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu; int i; - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} - if (!*pagep) - continue; +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} - __free_page(*pagep); +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); } - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + +/** + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = nr_cpu_ids - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); if (err < 0) - return err; + goto err; + } + + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); } - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; + } + + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; + } - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); - + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - if (chunk->vm) - free_vm_area(chunk->vm); + if (chunk->vms) + pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); - if (!chunk->vm) { + chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, + GFP_KERNEL); + if (!chunk->vms) { free_pcpu_chunk(chunk); return NULL; } @@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; + chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; return chunk; } @@ -791,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -807,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -828,6 +1085,7 @@ restart: case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -841,8 +1099,10 @@ restart: spin_unlock_irq(&pcpu_lock); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -855,17 +1115,26 @@ area_found: if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } mutex_unlock(&pcpu_alloc_mutex); - return __addr_to_pcpu_ptr(chunk->vm->addr + off); + /* return address relative to base address */ + return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: spin_unlock_irq(&pcpu_lock); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -938,12 +1207,13 @@ static void pcpu_reclaim(struct work_struct *work) } spin_unlock_irq(&pcpu_lock); - mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } + + mutex_unlock(&pcpu_alloc_mutex); } /** @@ -968,7 +1238,7 @@ void free_percpu(void *ptr) spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; + off = addr - chunk->base_addr; pcpu_free_area(chunk, off); @@ -987,30 +1257,299 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /** - * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} + +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) +{ + int group_width = 1, cpu_width = 1, width; + char empty_str[] = "--------"; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; + + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); + + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { + printk("\n"); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); + } + } + printk("\n"); +} + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @ai: pcpu_alloc_info describing how to percpu area is shaped + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @reserved_size, if non-zero, specifies the amount of bytes to + * setup path. + * + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1018,22 +1557,29 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. + * + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1043,49 +1589,98 @@ EXPORT_SYMBOL_GPL(free_percpu); * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { - static struct vm_struct first_vm; + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *group_offsets; + size_t *group_sizes; + unsigned long *unit_off; unsigned int cpu; - int nr_pages; - int err, i; + int *unit_map; + int group, unit, i; + + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) - /* santiy checks */ + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + /* process group information and build config tables accordingly */ + group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); + group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = UINT_MAX; + pcpu_first_unit_cpu = NR_CPUS; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; + unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } + } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); + + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; + pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; + + /* determine basic parameters */ + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_atom_size = ai->atom_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); /* * Allocate chunk slots. The additional last slot is for @@ -1105,189 +1700,368 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); - schunk->vm = &first_vm; + schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; + schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); - dchunk->vm = &first_vm; + dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); - return pcpu_unit_size; + pcpu_base_addr = base_addr; + return 0; } -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", +}; -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; - if (off >= pcpue_size) - return NULL; +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); + return 0; } +early_param("percpu_alloc", percpu_alloc_setup); +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { - size_t chunk_size; - unsigned int cpu; + void *base = (void *)ULONG_MAX; + void **areas = NULL; + struct pcpu_alloc_info *ai; + size_t size_sum, areas_size, max_distance; + int group, i, rc; + + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); + if (IS_ERR(ai)) + return PTR_ERR(ai); + + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); + + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { + rc = -ENOMEM; + goto out_free; + } - /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; - - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - - chunk_size = pcpue_unit_size * nr_cpu_ids; - - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); - return -ENOMEM; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; + + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + /* base address is now known, determine group base offsets */ + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { + ai->groups[group].base_offset = areas[group] - base; + max_distance = max_t(size_t, max_distance, + ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } + + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); + + rc = pcpu_setup_first_chunk(ai, base); + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: + pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +/** + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct vm; + struct pcpu_alloc_info *ai; + char psize_str[16]; + int unit_pages; + size_t pages_size; + struct page **pages; + int unit, i, j, rc; + + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); + + /* allocate pages */ + j = 0; + for (unit = 0; unit < num_possible_cpus(); unit++) + for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); + goto enomem; + } + pages[j++] = virt_to_page(ptr); + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * ai->unit_size; + vm_area_register_early(&vm, PAGE_SIZE); + + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned long unit_addr = + (unsigned long)vm.addr + unit * ai->unit_size; + + for (i = 0; i < unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), pcpue_unit_size); + /* pte already populated, the following shouldn't fail */ + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); + + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); + + rc = pcpu_setup_first_chunk(ai, vm.addr); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pages[j]), PAGE_SIZE); + rc = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + if (rc < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4ff..6633965bb27b 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 @@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages) max = node_free_pages / FRACTION_OF_NODE_MEM; - num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); max /= num_cpus_on_node; return max(max, min_pages); diff --git a/mm/rmap.c b/mm/rmap.c index 0895b5c7cbff..dd43373a483f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -36,6 +36,11 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * + * (code doesn't rely on that order so it could be switched around) + * ->tasklist_lock + * anon_vma->lock (memory_failure, collect_procs_anon) + * pte map lock */ #include <linux/mm.h> @@ -191,7 +196,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +216,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used on anon pages, by unuse_vma; + * At what user virtual address is page expected in vma? + * checking that the page matches the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -710,27 +715,6 @@ void page_add_file_rmap(struct page *page) } } -#ifdef CONFIG_DEBUG_VM -/** - * page_dup_rmap - duplicate pte mapping to a page - * @page: the page to add the mapping to - * @vma: the vm area being duplicated - * @address: the user virtual address mapped - * - * For copy_page_range only: minimal extract from page_add_file_rmap / - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's - * quicker. - * - * The caller needs to hold the pte lock. - */ -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) -{ - if (PageAnon(page)) - __page_check_anon_rmap(page, vma, address); - atomic_inc(&page->_mapcount); -} -#endif - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -739,34 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long */ void page_remove_rmap(struct page *page) { - if (atomic_add_negative(-1, &page->_mapcount)) { - /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - */ - if ((!PageAnon(page) || PageSwapCache(page)) && - page_test_dirty(page)) { - page_clear_dirty(page); - set_page_dirty(page); - } - if (PageAnon(page)) - mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, -1); - /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. - */ + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; + + /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { + page_clear_dirty(page); + set_page_dirty(page); + } + if (PageAnon(page)) { + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, NR_ANON_PAGES); + } else { + __dec_zone_page_state(page, NR_FILE_MAPPED); } + mem_cgroup_update_mapped_file_stat(page, -1); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ } /* @@ -774,7 +761,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) + enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -796,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration) { + if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -818,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageAnon(page)) { + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageAnon(page)) + dec_mm_counter(mm, anon_rss); + else + dec_mm_counter(mm, file_rss); + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -840,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(!migration); + BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && migration) { + } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1014,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, int unlock, int migration) +static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1035,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -1059,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1072,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int unlock, int migration) +static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1084,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) unsigned long max_nl_size = 0; unsigned int mapcount; unsigned int mlocked = 0; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1096,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -1121,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) ret = SWAP_MLOCK; /* leave mlocked == 0 */ goto out; /* no need to look further */ } - if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -1155,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !migration && + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; @@ -1195,7 +1193,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @migration: migration flag + * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1206,16 +1204,16 @@ out: * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. */ -int try_to_unmap(struct page *page, int migration) +int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, 0, migration); + ret = try_to_unmap_anon(page, flags); else - ret = try_to_unmap_file(page, 0, migration); + ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1240,8 +1238,8 @@ int try_to_munlock(struct page *page) VM_BUG_ON(!PageLocked(page) || PageLRU(page)); if (PageAnon(page)) - return try_to_unmap_anon(page, 1, 0); + return try_to_unmap_anon(page, TTU_MUNLOCK); else - return try_to_unmap_file(page, 1, 0); + return try_to_unmap_file(page, TTU_MUNLOCK); } diff --git a/mm/shmem.c b/mm/shmem.c index 5a0b3d4055f3..356dd99566ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/security.h> #include <linux/swapops.h> @@ -219,7 +218,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -1047,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -1097,6 +1097,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1630,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - unlock_page(page); set_page_dirty(page); + unlock_page(page); page_cache_release(page); return copied; @@ -1968,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } - unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); + unlock_page(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) @@ -2298,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int shmem_fill_super(struct super_block *sb, - void *data, int silent) +int shmem_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root; @@ -2307,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb, int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ - sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return -ENOMEM; - sbinfo->max_blocks = 0; - sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); - sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -2421,6 +2421,7 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, + .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { @@ -2497,7 +2498,7 @@ static const struct super_operations shmem_ops = { .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, @@ -2519,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = { .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { int error; @@ -2576,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = { .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); @@ -2591,6 +2592,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + return 0; +} + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) @@ -2687,5 +2693,3 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_ops = &shmem_vm_ops; return 0; } - -module_init(init_tmpfs) diff --git a/mm/slab.c b/mm/slab.c index 7b5d4deacfcd..7dfa481c96ba 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (totalram_pages > (32 << 20) >> PAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/slub.c b/mm/slub.c index 417ed843b251..4996fc719552 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1071,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, } #define slub_debug 0 +#define disable_higher_order_debug 0 + static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) @@ -2111,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); @@ -3343,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *s; + if (WARN_ON(!name)) + return NULL; + down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a13ea6401ae7..d9714bdcb4a3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { - struct page *page = alloc_pages_node(node, + struct page *page; + + if (node_state(node, N_HIGH_MEMORY)) + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + else + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index da432d9f0ae8..6ce4aab69e99 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) - section = kmalloc_node(array_size, GFP_KERNEL, nid); - else + if (slab_is_available()) { + if (node_state(nid, N_HIGH_MEMORY)) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = kmalloc(array_size, GFP_KERNEL); + } else section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) diff --git a/mm/swap.c b/mm/swap.c index cb29ae5d33ab..308e57d8d7ed 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_is_file_cache(page); + int lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } @@ -181,7 +181,7 @@ void activate_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); - int lru = LRU_BASE + file; + int lru = page_lru_base_type(page); del_page_from_lru_list(zone, page, lru); SetPageActive(page); @@ -189,7 +189,7 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(zone, page, !!file, 1); + update_page_reclaim_stat(zone, page, file, 1); } spin_unlock_irq(&zone->lru_lock); } @@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP bdi_init(swapper_space.backing_dev_info); diff --git a/mm/swap_state.c b/mm/swap_state.c index 5ae6b8b78c80..6d1daeb1cb4a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -67,10 +67,10 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -78,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (likely(!error)) { + total_swapcache_pages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&swapper_space.tree_lock); + + if (unlikely(error)) { + /* + * Only the context which have set SWAP_HAS_CACHE flag + * would call add_to_swap_cache(). + * So add_to_swap_cache() doesn't returns -EEXIST. + */ + VM_BUG_ON(error == -EEXIST); + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + + return error; +} + + +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + error = radix_tree_preload(gfp_mask); if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); - - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, - entry.val, page); - if (likely(!error)) { - total_swapcache_pages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); - } - spin_unlock_irq(&swapper_space.tree_lock); + error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); - - if (unlikely(error)) { - set_page_private(page, 0UL); - ClearPageSwapCache(page); - page_cache_release(page); - } } return error; } @@ -137,38 +152,34 @@ int add_to_swap(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageUptodate(page)); - for (;;) { - entry = get_swap_page(); - if (!entry.val) - return 0; + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + + if (!err) { /* Success */ + SetPageDirty(page); + return 1; + } else { /* -ENOMEM radix-tree allocation failure */ /* - * Radix-tree node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache and mark it dirty + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. */ - err = add_to_swap_cache(page, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - switch (err) { - case 0: /* Success */ - SetPageDirty(page); - return 1; - case -EEXIST: - /* Raced with "speculative" read_swap_cache_async */ - swapcache_free(entry, NULL); - continue; - default: - /* -ENOMEM radix-tree allocation failure */ - swapcache_free(entry, NULL); - return 0; - } + swapcache_free(entry, NULL); + return 0; } } @@ -290,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(gfp_mask & GFP_KERNEL); + if (err) + break; + + /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) /* seems racy */ + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); continue; - if (err) /* swp entry is obsolete ? */ + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); break; + } - /* - * Associate the page with swap entry in the swap cache. - * May fail (-EEXIST) if there is already a page associated - * with this entry in the swap cache: added by a racing - * read_swap_cache_async, or add_to_swap or shmem_writepage - * re-using the just freed swap entry for an existing page. - * May fail (-ENOMEM) if radix-tree node allocation failed. - */ + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { + radix_tree_preload_end(); /* * Initiate read into locked page and return. */ @@ -317,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_readpage(new_page); return new_page; } + radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(entry, NULL); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index 74f1102e8749..a1bc6b9af9a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return 1; p = swap_info_get(entry); @@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - current->flags |= PF_SWAPOFF; + current->flags |= PF_OOM_ORIGIN; err = try_to_unuse(type); - current->flags &= ~PF_SWAPOFF; + current->flags &= ~PF_OOM_ORIGIN; if (err) { /* re-insert swap space back into swap_list */ @@ -1974,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + if (p->bdev) { + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; } - if (discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; mutex_lock(&swapon_mutex); spin_lock(&swap_lock); @@ -2085,7 +2087,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) int count; bool has_cache; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return -EINVAL; type = swp_type(entry); diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf7cb98..450cebdabfc0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void +static int truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return; + return -EIO; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ + return 0; } /* @@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. + */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } + truncate_inode_page(mapping, page); if (page->index > next) next = page->index; next++; - truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - if (PageDirty(page) || PageWriteback(page)) - goto unlock; - if (page_mapped(page)) - goto unlock; - ret += invalidate_complete_page(mapping, page); -unlock: + ret += invalidate_inode_page(page); + unlock_page(page); if (next > end) break; @@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping) return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + if (new < old) { + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + } +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + oldsize = inode->i_size; + i_size_write(inode, offset); + truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + + return error; +} +EXPORT_SYMBOL(vmtruncate); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f8189a4b3e13..0f551a4a44cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -25,10 +26,10 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> - #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> +#include <asm/shmparam.h> /*** Page table manipulation functions ***/ @@ -168,11 +169,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); - if (unlikely(err)) - return err; return nr; } @@ -186,7 +185,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, return ret; } -static inline int is_vmalloc_or_module_addr(const void *x) +int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, @@ -265,6 +264,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -431,6 +431,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -1038,6 +1047,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1122,14 +1134,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + struct vm_struct *tmp, **p; + + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->private = vm; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) + break; + } + vm->next = *p; + *p = vm; + write_unlock(&vmlist_lock); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long flags, unsigned long start, unsigned long end, - int node, gfp_t gfp_mask, void *caller) + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - struct vm_struct *tmp, **p; - unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1162,32 +1194,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - area->flags = flags; - area->addr = (void *)va->va_start; - area->size = size; - area->pages = NULL; - area->nr_pages = 0; - area->phys_addr = 0; - area->caller = caller; - va->private = area; - va->flags |= VM_VM_AREA; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= area->addr) - break; - } - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - + insert_vmalloc_vm(area, va, flags, caller); return area; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -1196,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); } @@ -1211,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask, __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + node, gfp_mask, __builtin_return_address(0)); } static struct vm_struct *find_vm_area(const void *addr) @@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; - - vmap_debug_free_range(va->va_start, va->va_end); - free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; - + /* + * remove from list and disallow access to this vm_struct + * before unmap. (address range confliction is maintained by + * vmap.) + */ write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) ; *p = tmp->next; write_unlock(&vmlist_lock); + vmap_debug_free_range(va->va_start, va->va_end); + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + return vm; } return NULL; @@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > num_physpages) + if (count > totalram_pages) return NULL; area = get_vm_area_caller((count << PAGE_SHIFT), flags, @@ -1385,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) @@ -1399,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { @@ -1458,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size + * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1467,19 +1487,20 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) { struct vm_struct *area; void *addr; unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, - node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, + VMALLOC_END, node, gfp_mask, caller); if (!area) return NULL; @@ -1498,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -1514,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -1531,7 +1552,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + ret = __vmalloc_node(size, SHMLBA, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1554,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1577,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -1, __builtin_return_address(0)); } @@ -1598,7 +1620,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1615,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1625,10 +1647,120 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user); +/* + * small helper routine , copy contents to buf from addr. + * If the page is not present, fill zero. + */ + +static int aligned_vread(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(buf, map + offset, length); + kunmap_atomic(map, KM_USER0); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +static int aligned_vwrite(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(map + offset, buf, length); + kunmap_atomic(map, KM_USER0); + } + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + */ + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; char *vaddr, *buf_start = buf; + unsigned long buflen = count; unsigned long n; /* Don't allow overflow */ @@ -1636,7 +1768,7 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1649,32 +1781,72 @@ long vread(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; } +/** + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be incresed. + * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * The caller should guarantee KM_USER1 is not used. + */ + long vwrite(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; + char *vaddr; + unsigned long n, buflen; + int copied = 0; /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; + buflen = count; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1686,18 +1858,21 @@ long vwrite(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) { + aligned_vwrite(buf, addr, n); + copied++; + } + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + if (!copied) + return 0; + return buflen; } /** @@ -1818,6 +1993,286 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index ba8228e0a806..64e438898832 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, return &zone->reclaim_stat; } -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, - enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); @@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!page_has_private(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -531,7 +535,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -659,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + continue; + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); @@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); @@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page) } /* + * Are there way too many processes in the direct reclaim path already? + */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int lumpy_reclaim = 0; + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(WRITE, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + /* * If we need a large contiguous chunk of memory, or have * trouble getting a small set of contiguous pages, we @@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + unsigned long nr_anon; + unsigned long nr_file; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); + + if (scanning_global_lru(sc)) { + zone->pages_scanned += nr_scan; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scan); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scan); + } + + if (nr_taken == 0) + goto done; + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scanning_global_lru(sc)) - zone->pages_scanned += nr_scan; + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; @@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; + local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scanning_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - if (nr_taken == 0) - goto done; - spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - if (PageActive(page)) { - int file = !!page_is_file_cache(page); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { @@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_lock_irq(&zone->lru_lock); } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); + done: - local_irq_enable(); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); return nr_reclaimed; } @@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone, while (!list_empty(list)) { page = lru_to_page(list); - prefetchw_prev_lru_page(page, list, flags); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - if (!is_active_lru(lru)) - ClearPageActive(page); /* we are de-activating */ - list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); pgmoved++; @@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long nr_taken; unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ @@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1, file); /* @@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; } - reclaim_stat->recent_scanned[!!file] += pgmoved; + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - pgmoved++; + nr_rotated++; /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } @@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * helps balance scan pressure between file and anonymous pages in * get_scan_ratio. */ - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); move_active_pages_to_lru(zone, &l_inactive, LRU_BASE + file * LRU_FILE); - + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); } @@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); @@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone, int file = is_file_lru(l); unsigned long scan; - scan = zone_nr_pages(zone, sc, l); + scan = zone_nr_lru_pages(zone, sc, l); if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) - nr[l] = nr_scan_try_batch(scan, - &zone->lru[l].nr_saved_scan, - swap_cluster_max); - else - nr[l] = scan; + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l], + swap_cluster_max); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1652,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } } @@ -1779,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone, int nid) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + .isolate_pages = mem_cgroup_isolate_pages, + }; + nodemask_t nm = nodemask_of_node(nid); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + sc.nodemask = &nm; + sc.nr_reclaimed = 0; + sc.nr_scanned = 0; + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { + struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1795,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1902,7 +1992,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1917,6 +2007,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + int nid, zid; if (!populated_zone(zone)) continue; @@ -1931,6 +2022,15 @@ loop_again: temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); + + nid = pgdat->node_id; + zid = zone_idx(zone); + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, + nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -1946,7 +2046,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) + (zone_reclaimable_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -2113,12 +2213,39 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) +/* + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable + */ +unsigned long global_reclaimable_pages(void) +{ + int nr; + + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); + + return nr; +} + +unsigned long zone_reclaimable_pages(struct zone *zone) { - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } #ifdef CONFIG_HIBERNATION @@ -2133,6 +2260,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat; for_each_populated_zone(zone) { enum lru_list l; @@ -2149,11 +2277,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { + reclaim_stat = get_reclaim_stat(zone, sc); + reclaim_stat->nr_saved_scan[l] += + (lru_pages >> prio) + 1; + if (reclaim_stat->nr_saved_scan[l] + >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_saved_scan = 0; + reclaim_stat->nr_saved_scan[l] = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); @@ -2190,7 +2321,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = global_lru_pages(); + lru_pages = global_reclaimable_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -2232,7 +2363,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; if (sc.nr_reclaimed >= nr_pages) goto out; @@ -2249,7 +2380,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; } while (sc.nr_reclaimed < nr_pages && reclaim_state.reclaimed_slab > 0); @@ -2569,7 +2701,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); @@ -2712,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 138bed53706e..c81321f9feec 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -639,11 +639,14 @@ static const char * const vmstat_text[] = { "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", + "nr_kernel_stack", "nr_unstable", "nr_bounce", "nr_vmscan_write", "nr_writeback_temp", - + "nr_isolated_anon", + "nr_isolated_file", + "nr_shmem", #ifdef CONFIG_NUMA "numa_hit", "numa_miss", |