diff options
author | Linus Torvalds | 2017-09-09 19:30:07 +0200 |
---|---|---|
committer | Linus Torvalds | 2017-09-09 19:30:07 +0200 |
commit | fbf4432ff71b7a25bef993a5312906946d27f446 (patch) | |
tree | cf3e0024af4b8f9376eff75743f1fa1526e40900 /mm/memcontrol.c | |
parent | remove gperf left-overs from build system (diff) | |
parent | ipc: optimize semget/shmget/msgget for lots of keys (diff) | |
download | kernel-qcow2-linux-fbf4432ff71b7a25bef993a5312906946d27f446.tar.gz kernel-qcow2-linux-fbf4432ff71b7a25bef993a5312906946d27f446.tar.xz kernel-qcow2-linux-fbf4432ff71b7a25bef993a5312906946d27f446.zip |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- most of the rest of MM
- a small number of misc things
- lib/ updates
- checkpatch
- autofs updates
- ipc/ updates
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (126 commits)
ipc: optimize semget/shmget/msgget for lots of keys
ipc/sem: play nicer with large nsops allocations
ipc/sem: drop sem_checkid helper
ipc: convert kern_ipc_perm.refcount from atomic_t to refcount_t
ipc: convert sem_undo_list.refcnt from atomic_t to refcount_t
ipc: convert ipc_namespace.count from atomic_t to refcount_t
kcov: support compat processes
sh: defconfig: cleanup from old Kconfig options
mn10300: defconfig: cleanup from old Kconfig options
m32r: defconfig: cleanup from old Kconfig options
drivers/pps: use surrounding "if PPS" to remove numerous dependency checks
drivers/pps: aesthetic tweaks to PPS-related content
cpumask: make cpumask_next() out-of-line
kmod: move #ifdef CONFIG_MODULES wrapper to Makefile
kmod: split off umh headers into its own file
MAINTAINERS: clarify kmod is just a kernel module loader
kmod: split out umh code into its own file
test_kmod: flip INT checks to be consistent
test_kmod: remove paranoid UINT_MAX check on uint range processing
vfat: deduplicate hex2bin()
...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 256 |
1 files changed, 169 insertions, 87 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6532b219b222..15af3da5af02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -119,6 +119,7 @@ static const char *const mem_cgroup_lru_names[] = { struct mem_cgroup_tree_per_node { struct rb_root rb_root; + struct rb_node *rb_rightmost; spinlock_t lock; }; @@ -386,6 +387,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; struct mem_cgroup_per_node *mz_node; + bool rightmost = true; if (mz->on_tree) return; @@ -397,8 +399,11 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, parent = *p; mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) + if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; + rightmost = false; + } + /* * We can't avoid mem cgroups that are over their soft * limit by the same amount @@ -406,6 +411,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, else if (mz->usage_in_excess >= mz_node->usage_in_excess) p = &(*p)->rb_right; } + + if (rightmost) + mctz->rb_rightmost = &mz->tree_node; + rb_link_node(&mz->tree_node, parent, p); rb_insert_color(&mz->tree_node, &mctz->rb_root); mz->on_tree = true; @@ -416,6 +425,10 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, { if (!mz->on_tree) return; + + if (&mz->tree_node == mctz->rb_rightmost) + mctz->rb_rightmost = rb_prev(&mz->tree_node); + rb_erase(&mz->tree_node, &mctz->rb_root); mz->on_tree = false; } @@ -496,16 +509,15 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) static struct mem_cgroup_per_node * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct rb_node *rightmost = NULL; struct mem_cgroup_per_node *mz; retry: mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) + if (!mctz->rb_rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); + mz = rb_entry(mctz->rb_rightmost, + struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -1792,6 +1804,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; + if (stock->nr_pages > CHARGE_BATCH) + drain_stock(stock); + local_irq_restore(flags); } @@ -4414,12 +4429,13 @@ enum mc_target_type { MC_TARGET_NONE = 0, MC_TARGET_PAGE, MC_TARGET_SWAP, + MC_TARGET_DEVICE, }; static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = vm_normal_page(vma, addr, ptent); + struct page *page = _vm_normal_page(vma, addr, ptent, true); if (!page || !page_mapped(page)) return NULL; @@ -4436,7 +4452,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } -#ifdef CONFIG_SWAP +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, pte_t ptent, swp_entry_t *entry) { @@ -4445,6 +4461,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) return NULL; + + /* + * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to + * a device and because they are not accessible by CPU they are store + * as special swap entry in the CPU page table. + */ + if (is_device_private_entry(ent)) { + page = device_private_entry_to_page(ent); + /* + * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have + * a refcount of 1 when free (unlike normal page) + */ + if (!page_ref_add_unless(page, 1, 1)) + return NULL; + return page; + } + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. @@ -4605,6 +4638,13 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. + * + * See Documentations/vm/hmm.txt and include/linux/hmm.h * * Called with pte lock held. */ @@ -4633,6 +4673,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; + if (is_device_private_page(page) || + is_device_public_page(page)) + ret = MC_TARGET_DEVICE; if (target) target->page = page; } @@ -4664,6 +4707,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) @@ -4695,6 +4743,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { + /* + * Note their can not be MC_TARGET_DEVICE for now as we do not + * support transparent huge page with MEMORY_DEVICE_PUBLIC or + * MEMORY_DEVICE_PRIVATE but this might change. + */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4910,6 +4963,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, putback_lru_page(page); } put_page(page); + } else if (target_type == MC_TARGET_DEVICE) { + page = target.page; + if (!mem_cgroup_move_account(page, true, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + put_page(page); } spin_unlock(ptl); return 0; @@ -4921,12 +4982,16 @@ retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); + bool device = false; swp_entry_t ent; if (!mc.precharge) break; switch (get_mctgt_type(vma, addr, ptent, &target)) { + case MC_TARGET_DEVICE: + device = true; + /* fall through */ case MC_TARGET_PAGE: page = target.page; /* @@ -4937,7 +5002,7 @@ retry: */ if (PageTransCompound(page)) goto put; - if (isolate_lru_page(page)) + if (!device && isolate_lru_page(page)) goto put; if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) { @@ -4945,7 +5010,8 @@ retry: /* we uncharge from mc.from later. */ mc.moved_charge++; } - putback_lru_page(page); + if (!device) + putback_lru_page(page); put: /* get_mctgt_type() gets the page */ put_page(page); break; @@ -5535,48 +5601,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, cancel_charge(memcg, nr_pages); } -static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_kmem, unsigned long nr_huge, - unsigned long nr_shmem, struct page *dummy_page) +struct uncharge_gather { + struct mem_cgroup *memcg; + unsigned long pgpgout; + unsigned long nr_anon; + unsigned long nr_file; + unsigned long nr_kmem; + unsigned long nr_huge; + unsigned long nr_shmem; + struct page *dummy_page; +}; + +static inline void uncharge_gather_clear(struct uncharge_gather *ug) +{ + memset(ug, 0, sizeof(*ug)); +} + +static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = nr_anon + nr_file + nr_kmem; + unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; - if (!mem_cgroup_is_root(memcg)) { - page_counter_uncharge(&memcg->memory, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) { + page_counter_uncharge(&ug->memcg->memory, nr_pages); if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) - page_counter_uncharge(&memcg->kmem, nr_kmem); - memcg_oom_recover(memcg); + page_counter_uncharge(&ug->memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) + page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); - __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); - __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - memcg_check_events(memcg, dummy_page); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); + __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); + __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); + __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); + __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); - if (!mem_cgroup_is_root(memcg)) - css_put_many(&memcg->css, nr_pages); + if (!mem_cgroup_is_root(ug->memcg)) + css_put_many(&ug->memcg->css, nr_pages); +} + +static void uncharge_page(struct page *page, struct uncharge_gather *ug) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); + + if (!page->mem_cgroup) + return; + + /* + * Nobody should be changing or seriously looking at + * page->mem_cgroup at this point, we have fully + * exclusive access to the page. + */ + + if (ug->memcg != page->mem_cgroup) { + if (ug->memcg) { + uncharge_batch(ug); + uncharge_gather_clear(ug); + } + ug->memcg = page->mem_cgroup; + } + + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + ug->nr_huge += nr_pages; + } + if (PageAnon(page)) + ug->nr_anon += nr_pages; + else { + ug->nr_file += nr_pages; + if (PageSwapBacked(page)) + ug->nr_shmem += nr_pages; + } + ug->pgpgout++; + } else { + ug->nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } + + ug->dummy_page = page; + page->mem_cgroup = NULL; } static void uncharge_list(struct list_head *page_list) { - struct mem_cgroup *memcg = NULL; - unsigned long nr_shmem = 0; - unsigned long nr_anon = 0; - unsigned long nr_file = 0; - unsigned long nr_huge = 0; - unsigned long nr_kmem = 0; - unsigned long pgpgout = 0; + struct uncharge_gather ug; struct list_head *next; - struct page *page; + + uncharge_gather_clear(&ug); /* * Note that the list can be a single page->lru; hence the @@ -5584,57 +5704,16 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { + struct page *page; + page = list_entry(next, struct page, lru); next = page->lru.next; - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); - - if (!page->mem_cgroup) - continue; - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully - * exclusive access to the page. - */ - - if (memcg != page->mem_cgroup) { - if (memcg) { - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); - pgpgout = nr_anon = nr_file = nr_kmem = 0; - nr_huge = nr_shmem = 0; - } - memcg = page->mem_cgroup; - } - - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - nr_huge += nr_pages; - } - if (PageAnon(page)) - nr_anon += nr_pages; - else { - nr_file += nr_pages; - if (PageSwapBacked(page)) - nr_shmem += nr_pages; - } - pgpgout++; - } else { - nr_kmem += 1 << compound_order(page); - __ClearPageKmemcg(page); - } - - page->mem_cgroup = NULL; + uncharge_page(page, &ug); } while (next != page_list); - if (memcg) - uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_kmem, nr_huge, nr_shmem, page); + if (ug.memcg) + uncharge_batch(&ug); } /** @@ -5646,6 +5725,8 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { + struct uncharge_gather ug; + if (mem_cgroup_disabled()) return; @@ -5653,8 +5734,9 @@ void mem_cgroup_uncharge(struct page *page) if (!page->mem_cgroup) return; - INIT_LIST_HEAD(&page->lru); - uncharge_list(&page->lru); + uncharge_gather_clear(&ug); + uncharge_page(page, &ug); + uncharge_batch(&ug); } /** @@ -5819,8 +5901,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); - page_counter_uncharge(&memcg->memory, nr_pages); - css_put_many(&memcg->css, nr_pages); + refill_stock(memcg, nr_pages); } static int __init cgroup_memory(char *s) @@ -5876,6 +5957,7 @@ static int __init mem_cgroup_init(void) node_online(node) ? node : NUMA_NO_NODE); rtpn->rb_root = RB_ROOT; + rtpn->rb_rightmost = NULL; spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } |