author     Linus Torvalds  2019-07-12 20:40:28 +0200
committer  Linus Torvalds  2019-07-12 20:40:28 +0200
commit     ef8f3d48afd6a17a0dae8c277c2f539c2f19fd16 (patch)
tree       5c28f9f287a552ad1a655b9e29e5330966652e89 /mm/memcontrol.c
parent     Merge tag 'tag-chrome-platform-for-v5.3' of git://git.kernel.org/pub/scm/linu... (diff)
parent     mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process() (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
"Am experimenting with splitting MM up into identifiable subsystems perhaps
with a view to gitifying it in complex ways. Also with more verbose
"incoming" emails.

Most of MM is here and a few other trees.

Subsystems affected by this patch series:
- hotfixes
- iommu
- scripts
- arch/sh
- ocfs2
- mm:slab-generic
- mm:slub
- mm:kmemleak
- mm:kasan
- mm:cleanups
- mm:debug
- mm:pagecache
- mm:swap
- mm:memcg
- mm:gup
- mm:pagemap
- mm:infrastructure
- mm:vmalloc
- mm:initialization
- mm:pagealloc
- mm:vmscan
- mm:tools
- mm:proc
- mm:ras
- mm:oom-kill

hotfixes:
- mm: vmscan: scan anonymous pages on file refaults
- mm/nvdimm: add is_ioremap_addr and use that to check ioremap address
- mm/memcontrol: fix wrong statistics in memory.stat
- mm/z3fold.c: lock z3fold page before __SetPageMovable()
- nilfs2: do not use unexported cpu_to_le32()/le32_to_cpu() in uapi header
- MAINTAINERS: nilfs2: update email address

iommu:
- include/linux/dmar.h: replace single-char identifiers in macros

scripts:
- scripts/decode_stacktrace: match basepath using shell prefix operator, not regex
- scripts/decode_stacktrace: look for modules with .ko.debug extension
- scripts/spelling.txt: drop "sepc" from the misspelling list
- scripts/spelling.txt: add spelling fix for prohibited
- scripts/decode_stacktrace: Accept dash/underscore in modules
- scripts/spelling.txt: add more spellings to spelling.txt

arch/sh:
- arch/sh/configs/sdk7786_defconfig: remove CONFIG_LOGFS
- sh: config: remove left-over BACKLIGHT_LCD_SUPPORT
- sh: prevent warnings when using iounmap

ocfs2:
- fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat"
- ocfs2/dlm: use struct_size() helper
- ocfs2: add last unlock times in locking_state
- ocfs2: add locking filter debugfs file
- ocfs2: add first lock wait time in locking_state
- ocfs: no need to check return value of debugfs_create functions
- fs/ocfs2/dlmglue.c: unneeded variable: "status"
- ocfs2: use kmemdup rather than duplicating its implementation

mm:slab-generic:
- Patch series "mm/slab: Improved sanity checking":
- mm/slab: validate cache membership under freelist hardening
- mm/slab: sanity-check page type when looking up cache
- lkdtm/heap: add tests for freelist hardening

mm:slub:
- mm/slub.c: avoid double string traverse in kmem_cache_flags()
- slub: don't panic for memcg kmem cache creation failure

mm:kmemleak:
- mm/kmemleak.c: fix check for softirq context
- mm/kmemleak.c: change error at _write when kmemleak is disabled
- docs: kmemleak: add more documentation details

mm:kasan:
- mm/kasan: print frame description for stack bugs
- Patch series "Bitops instrumentation for KASAN", v5:
- lib/test_kasan: add bitops tests
- x86: use static_cpu_has in uaccess region to avoid instrumentation
- asm-generic, x86: add bitops instrumentation for KASAN
- Patch series "mm/kasan: Add object validation in ksize()", v3:
- mm/kasan: introduce __kasan_check_{read,write}
- mm/kasan: change kasan_check_{read,write} to return boolean
- lib/test_kasan: Add test for double-kzfree detection
- mm/slab: refactor common ksize KASAN logic into slab_common.c
- mm/kasan: add object validation in ksize()

mm:cleanups:
- include/linux/pfn_t.h: remove pfn_t_to_virt()
- Patch series "remove ARCH_SELECT_MEMORY_MODEL where it has no effect":
- arm: remove ARCH_SELECT_MEMORY_MODEL
- s390: remove ARCH_SELECT_MEMORY_MODEL
- sparc: remove ARCH_SELECT_MEMORY_MODEL
- mm/gup.c: make follow_page_mask() static
- mm/memory.c: trivial clean up in insert_page()
- mm: make !CONFIG_HUGE_PAGE wrappers into static inlines
- include/linux/mm_types.h: ifdef struct vm_area_struct::swap_readahead_info
- mm: remove the account_page_dirtied export
- mm/page_isolation.c: change the prototype of undo_isolate_page_range()
- include/linux/vmpressure.h: use spinlock_t instead of struct spinlock
- mm: remove the exporting of totalram_pages
- include/linux/pagemap.h: document trylock_page() return value

mm:debug:
- mm/failslab.c: by default, do not fail allocations with direct reclaim only
- Patch series "debug_pagealloc improvements":
- mm, debug_pagelloc: use static keys to enable debugging
- mm, page_alloc: more extensive free page checking with debug_pagealloc
- mm, debug_pagealloc: use a page type instead of page_ext flag

mm:pagecache:
- Patch series "fix filler_t callback type mismatches", v2:
- mm/filemap.c: fix an overly long line in read_cache_page
- mm/filemap: don't cast ->readpage to filler_t for do_read_cache_page
- jffs2: pass the correct prototype to read_cache_page
- 9p: pass the correct prototype to read_cache_page
- mm/filemap.c: correct the comment about VM_FAULT_RETRY

mm:swap:
- mm, swap: fix race between swapoff and some swap operations
- mm/swap_state.c: simplify total_swapcache_pages() with get_swap_device()
- mm, swap: use rbtree for swap_extent
- mm/mincore.c: fix race between swapoff and mincore

mm:memcg:
- memcg, oom: no oom-kill for __GFP_RETRY_MAYFAIL
- memcg, fsnotify: no oom-kill for remote memcg charging
- mm, memcg: introduce memory.events.local
- mm: memcontrol: dump memory.stat during cgroup OOM
- Patch series "mm: reparent slab memory on cgroup removal", v7:
- mm: memcg/slab: postpone kmem_cache memcg pointer initialization to memcg_link_cache()
- mm: memcg/slab: rename slab delayed deactivation functions and fields
- mm: memcg/slab: generalize postponed non-root kmem_cache deactivation
- mm: memcg/slab: introduce __memcg_kmem_uncharge_memcg()
- mm: memcg/slab: unify SLAB and SLUB page accounting
- mm: memcg/slab: don't check the dying flag on kmem_cache creation
- mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock
- mm: memcg/slab: rework non-root kmem_cache lifecycle management
- mm: memcg/slab: stop setting page->mem_cgroup pointer for slab pages
- mm: memcg/slab: reparent memcg kmem_caches on cgroup removal
- mm, memcg: add a memcg_slabinfo debugfs file

mm:gup:
- Patch series "switch the remaining architectures to use generic GUP", v4:
- mm: use untagged_addr() for get_user_pages_fast addresses
- mm: simplify gup_fast_permitted
- mm: lift the x86_32 PAE version of gup_get_pte to common code
- MIPS: use the generic get_user_pages_fast code
- sh: add the missing pud_page definition
- sh: use the generic get_user_pages_fast code
- sparc64: add the missing pgd_page definition
- sparc64: define untagged_addr()
- sparc64: use the generic get_user_pages_fast code
- mm: rename CONFIG_HAVE_GENERIC_GUP to CONFIG_HAVE_FAST_GUP
- mm: reorder code blocks in gup.c
- mm: consolidate the get_user_pages* implementations
- mm: validate get_user_pages_fast flags
- mm: move the powerpc hugepd code to mm/gup.c
- mm: switch gup_hugepte to use try_get_compound_head
- mm: mark the page referenced in gup_hugepte
- mm/gup: speed up check_and_migrate_cma_pages() on huge page
- mm/gup.c: remove some BUG_ONs from get_gate_page()
- mm/gup.c: mark undo_dev_pagemap as __maybe_unused

mm:pagemap:
- asm-generic, x86: introduce generic pte_{alloc,free}_one[_kernel]
- alpha: switch to generic version of pte allocation
- arm: switch to generic version of pte allocation
- arm64: switch to generic version of pte allocation
- csky: switch to generic version of pte allocation
- m68k: sun3: switch to generic version of pte allocation
- mips: switch to generic version of pte allocation
- nds32: switch to generic version of pte allocation
- nios2: switch to generic version of pte allocation
- parisc: switch to generic version of pte allocation
- riscv: switch to generic version of pte allocation
- um: switch to generic version of pte allocation
- unicore32: switch to generic version of pte allocation
- mm/pgtable: drop pgtable_t variable from pte_fn_t functions
- mm/memory.c: fail when offset == num in first check of __vm_map_pages()

mm:infrastructure:
- mm/mmu_notifier: use hlist_add_head_rcu()

mm:vmalloc:
- Patch series "Some cleanups for the KVA/vmalloc", v5:
- mm/vmalloc.c: remove "node" argument
- mm/vmalloc.c: preload a CPU with one object for split purpose
- mm/vmalloc.c: get rid of one single unlink_va() when merge
- mm/vmalloc.c: switch to WARN_ON() and move it under unlink_va()
- mm/vmalloc.c: spelling> s/informaion/information/

mm:initialization:
- mm/large system hash: use vmalloc for size > MAX_ORDER when !hashdist
- mm/large system hash: clear hashdist when only one node with memory is booted

mm:pagealloc:
- arm64: move jump_label_init() before parse_early_param()
- Patch series "add init_on_alloc/init_on_free boot options", v10:
- mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options
- mm: init: report memory auto-initialization features at boot time

mm:vmscan:
- mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
- mm: vmscan: correct some vmscan counters for THP swapout

mm:tools:
- tools/vm/slabinfo: order command line options
- tools/vm/slabinfo: add partial slab listing to -X
- tools/vm/slabinfo: add option to sort by partial slabs
- tools/vm/slabinfo: add sorting info to help menu

mm:proc:
- proc: use down_read_killable mmap_sem for /proc/pid/maps
- proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
- proc: use down_read_killable mmap_sem for /proc/pid/pagemap
- proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
- proc: use down_read_killable mmap_sem for /proc/pid/map_files
- mm: use down_read_killable for locking mmap_sem in access_remote_vm
- mm: smaps: split PSS into components
- mm: vmalloc: show number of vmalloc pages in /proc/meminfo

mm:ras:
- mm/memory-failure.c: clarify error message

mm:oom-kill:
- mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
- mm, oom: refactor dump_tasks for memcg OOMs
- mm, oom: remove redundant task_in_mem_cgroup() check
- oom: decouple mems_allowed from oom_unkillable_task
- mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()"

* akpm: (147 commits)
  mm/oom_kill.c: remove redundant OOM score normalization in select_bad_process()
  oom: decouple mems_allowed from oom_unkillable_task
  mm, oom: remove redundant task_in_mem_cgroup() check
  mm, oom: refactor dump_tasks for memcg OOMs
  mm: memcontrol: use CSS_TASK_ITER_PROCS at mem_cgroup_scan_tasks()
  mm/memory-failure.c: clarify error message
  mm: vmalloc: show number of vmalloc pages in /proc/meminfo
  mm: smaps: split PSS into components
  mm: use down_read_killable for locking mmap_sem in access_remote_vm
  proc: use down_read_killable mmap_sem for /proc/pid/map_files
  proc: use down_read_killable mmap_sem for /proc/pid/clear_refs
  proc: use down_read_killable mmap_sem for /proc/pid/pagemap
  proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup
  proc: use down_read_killable mmap_sem for /proc/pid/maps
  tools/vm/slabinfo: add sorting info to help menu
  tools/vm/slabinfo: add option to sort by partial slabs
  tools/vm/slabinfo: add partial slab listing to -X
  tools/vm/slabinfo: order command line options
  mm: vmscan: correct some vmscan counters for THP swapout
  mm: vmscan: remove double slab pressure by inc'ing sc->nr_scanned
  ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  461
1 file changed, 256 insertions(+), 205 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..4f05735b02d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- memcg = READ_ONCE(page->mem_cgroup);
+ if (PageHead(page) && PageSlab(page))
+ memcg = memcg_from_slab_page(page);
+ else
+ memcg = READ_ONCE(page->mem_cgroup);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -1163,7 +1167,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, 0, &it);
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
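
For context, a self-contained sketch of the iteration pattern this hunk switches to. The helper name is invented; css_task_iter_start()/css_task_iter_next()/css_task_iter_end() are the real cgroup iterator API, and CSS_TASK_ITER_PROCS makes the walk visit one task per process (the thread-group leader) instead of every thread, which is what a per-process callback such as OOM victim evaluation wants:

/*
 * Illustrative only -- "walk_processes_in_css" is an invented name.
 * With flags == 0 the iterator would return every thread; with
 * CSS_TASK_ITER_PROCS it returns only thread-group leaders.
 */
#include <linux/cgroup.h>
#include <linux/sched.h>

static int walk_processes_in_css(struct cgroup_subsys_state *css,
				 int (*fn)(struct task_struct *task, void *arg),
				 void *arg)
{
	struct css_task_iter it;
	struct task_struct *task;
	int ret = 0;

	css_task_iter_start(css, CSS_TASK_ITER_PROCS, &it);
	while (!ret && (task = css_task_iter_next(&it)))
		ret = fn(task, arg);
	css_task_iter_end(&it);

	return ret;
}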
@@ -1255,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
*lru_size += nr_pages;
}
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1356,27 +1334,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- NR_SHMEM,
- NR_FILE_MAPPED,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
- MEMCG_SWAP,
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
-static const char *const memcg1_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "shmem",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
-};
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_buf_printf(&s, "anon %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "kernel_stack %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ 1024);
+ seq_buf_printf(&s, "slab %llu\n",
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "sock %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "shmem %llu\n",
+ (u64)memcg_page_state(memcg, NR_SHMEM) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_mapped %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_dirty %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_writeback %llu\n",
+ (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+ PAGE_SIZE);
+
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_buf_printf(&s, "anon_thp %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "slab_reclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+ PAGE_SIZE);
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+ seq_buf_printf(&s, "workingset_refault %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT));
+ seq_buf_printf(&s, "workingset_activate %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+ memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+ seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
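
The memory_stat_format() helper added above is built on the kernel's seq_buf API: format into a kmalloc()'d, page-sized buffer and hand the buffer back for the caller to print and free. A minimal sketch of that pattern, with invented names (my_stat_format and the nr_anon/nr_file parameters are placeholders, not the memcg counters):

#include <linux/seq_buf.h>
#include <linux/slab.h>
#include <linux/mm.h>

/* Sketch of the seq_buf pattern; not the memcg code itself. */
static char *my_stat_format(unsigned long nr_anon, unsigned long nr_file)
{
	struct seq_buf s;

	/* seq_buf_init() tolerates a NULL buffer; check s.buffer afterwards. */
	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!s.buffer)
		return NULL;

	seq_buf_printf(&s, "anon %llu\n", (u64)nr_anon * PAGE_SIZE);
	seq_buf_printf(&s, "file %llu\n", (u64)nr_file * PAGE_SIZE);

	/* seq_buf never writes past its buffer; overflow is only flagged. */
	WARN_ON_ONCE(seq_buf_has_overflowed(&s));

	return s.buffer;	/* caller prints the buffer and kfree()s it */
}

In the diff, the same formatted buffer serves two consumers: the OOM path prints it with pr_info() and kfree()s it in mem_cgroup_print_oom_meminfo(), and memory_stat_show() feeds it to seq_puts() for the memory.stat file.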
@@ -1411,39 +1476,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
- struct mem_cgroup *iter;
- unsigned int i;
+ char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
K((u64)memcg->memory.max), memcg->memory.failcnt);
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state_local(iter,
- memcg1_stats[i])));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(memcg_page_state_local(iter,
- NR_LRU_BASE + i)));
-
- pr_cont("\n");
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)memcg->swap.max), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
}
/*
@@ -2279,7 +2337,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
@@ -2366,7 +2423,7 @@ retry:
if (nr_retries--)
goto retry;
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
@@ -2385,7 +2442,6 @@ retry:
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- oomed = true;
goto retry;
case OOM_FAILED:
goto force;
@@ -2588,12 +2644,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
{
struct memcg_kmem_cache_create_work *cw;
+ if (!css_tryget_online(&memcg->css))
+ return;
+
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
- css_get(&memcg->css);
-
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
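
The reordering above, taking a reference with css_tryget_online() before allocating and queuing the work item, means deferred cache creation is never scheduled for a cgroup that has already gone offline. A hedged sketch of that pattern with invented names (struct my_create_work, my_create_fn, my_schedule_create are placeholders); note the hunk itself simply returns on allocation failure, while the sketch drops the reference explicitly to keep the get/put pairing visible:

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/memcontrol.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_create_work {
	struct mem_cgroup *memcg;
	struct work_struct work;
};

static void my_create_fn(struct work_struct *work)
{
	struct my_create_work *cw = container_of(work, struct my_create_work, work);

	/* ... do the deferred work for cw->memcg ... */

	css_put(&cw->memcg->css);	/* balance the tryget taken below */
	kfree(cw);
}

static void my_schedule_create(struct mem_cgroup *memcg)
{
	struct my_create_work *cw;

	/* Fails once the cgroup is being torn down: nothing to schedule. */
	if (!css_tryget_online(&memcg->css))
		return;

	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
	if (!cw) {
		css_put(&memcg->css);
		return;
	}

	cw->memcg = memcg;
	INIT_WORK(&cw->work, my_create_fn);
	queue_work(system_wq, &cw->work);
}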
@@ -2628,6 +2685,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ struct memcg_cache_array *arr;
int kmemcg_id;
VM_BUG_ON(!is_root_cache(cachep));
@@ -2635,14 +2693,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (memcg_kmem_bypass())
return cachep;
- memcg = get_mem_cgroup_from_current();
+ rcu_read_lock();
+
+ if (unlikely(current->active_memcg))
+ memcg = current->active_memcg;
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ if (!memcg || memcg == root_mem_cgroup)
+ goto out_unlock;
+
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
- goto out;
+ goto out_unlock;
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
- return memcg_cachep;
+ arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match the data dependency
+ * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+ */
+ memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2655,10 +2727,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
+ *
+ * If the memcg is dying or memcg_cache is about to be released,
+ * don't bother creating new kmem_caches. Because memcg_cachep
+ * is ZEROed as the fist step of kmem offlining, we don't need
+ * percpu_ref_tryget_live() here. css_tryget_online() check in
+ * memcg_schedule_kmem_cache_create() will prevent us from
+ * creation of a new kmem_cache.
*/
- memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
- css_put(&memcg->css);
+ if (unlikely(!memcg_cachep))
+ memcg_schedule_kmem_cache_create(memcg, cachep);
+ else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+ cachep = memcg_cachep;
+out_unlock:
+ rcu_read_unlock();
return cachep;
}
@@ -2669,7 +2751,7 @@ out:
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
+ percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
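
The two hunks above move the per-memcg kmem_cache lifetime from a css reference onto a percpu_ref: the lookup runs under rcu_read_lock(), the entry is pinned with percpu_ref_tryget() (which fails once the cache is being released), and memcg_kmem_put_cache() drops the pin with percpu_ref_put(). A condensed sketch of that lookup-and-pin pattern with invented names (my_cache, my_caches, my_cache_get/put/release):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_cache {
	/* set up with percpu_ref_init(&refcnt, my_cache_release, 0, GFP_KERNEL) */
	struct percpu_ref refcnt;
	/* ... payload ... */
};

struct my_cache_array {
	struct rcu_head rcu;
	struct my_cache *entries[];
};

/* The updater publishes a new array with rcu_assign_pointer(my_caches, new). */
static struct my_cache_array __rcu *my_caches;

static void my_cache_release(struct percpu_ref *ref)
{
	struct my_cache *c = container_of(ref, struct my_cache, refcnt);

	percpu_ref_exit(&c->refcnt);
	kfree(c);
}

static struct my_cache *my_cache_get(int idx)
{
	struct my_cache_array *arr;
	struct my_cache *c = NULL;

	rcu_read_lock();
	arr = rcu_dereference(my_caches);
	if (arr)
		c = READ_ONCE(arr->entries[idx]);
	/* Pin before leaving the RCU section; tryget fails once the ref is killed. */
	if (c && !percpu_ref_tryget(&c->refcnt))
		c = NULL;
	rcu_read_unlock();

	return c;
}

static void my_cache_put(struct my_cache *c)
{
	/* last put after percpu_ref_kill() invokes my_cache_release() */
	percpu_ref_put(&c->refcnt);
}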
@@ -2697,9 +2779,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
@@ -2722,12 +2801,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ }
}
css_put(&memcg->css);
return ret;
}
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
/**
* __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
@@ -2742,14 +2839,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
+ __memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -3168,15 +3258,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
*/
memcg->kmem_state = KMEM_ALLOCATED;
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
+ memcg_deactivate_kmem_caches(memcg, parent);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
/*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
@@ -3207,9 +3297,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
+ WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -3472,6 +3561,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
+static const unsigned int memcg1_stats[] = {
+ MEMCG_CACHE,
+ MEMCG_RSS,
+ MEMCG_RSS_HUGE,
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+ "rss_huge",
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
+
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
PGPGIN,
@@ -3530,12 +3641,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)memcg_events(memcg, i));
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -4634,6 +4746,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+ INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5625,112 +5740,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "low %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
- seq_printf(m, "high %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
- seq_printf(m, "max %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
- seq_printf(m, "oom %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
-
+ __memory_events_show(m, memcg->memory_events);
return 0;
}
-static int memory_stat_show(struct seq_file *m, void *v)
+static int memory_events_local_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- int i;
-
- /*
- * Provide statistics on the state of the memory subsystem as
- * well as cumulative event counters that show past behavior.
- *
- * This list is ordered following a combination of these gradients:
- * 1) generic big picture -> specifics and details
- * 2) reflecting userspace activity -> reflecting kernel heuristics
- *
- * Current memory state:
- */
-
- seq_printf(m, "anon %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
- seq_printf(m, "file %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
- seq_printf(m, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
- seq_printf(m, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
- PAGE_SIZE);
- seq_printf(m, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
-
- seq_printf(m, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
- seq_printf(m, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
- seq_printf(m, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
- seq_printf(m, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
- /*
- * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
- * with the NR_ANON_THP vm counter, but right now it's a pain in the
- * arse because it requires migrating the work out of rmap to a place
- * where the page->mem_cgroup is set up and stable.
- */
- seq_printf(m, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
-
- seq_printf(m, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
- PAGE_SIZE);
- seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
- PAGE_SIZE);
-
- /* Accumulated memory events */
-
- seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
- seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
-
- seq_printf(m, "workingset_refault %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT));
- seq_printf(m, "workingset_activate %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE));
- seq_printf(m, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
- seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
- seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
- memcg_events(memcg, PGSCAN_DIRECT));
- seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
- memcg_events(memcg, PGSTEAL_DIRECT));
- seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
- seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
- seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
- seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+ __memory_events_show(m, memcg->memory_events_local);
+ return 0;
+}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_printf(m, "thp_fault_alloc %lu\n",
- memcg_events(memcg, THP_FAULT_ALLOC));
- seq_printf(m, "thp_collapse_alloc %lu\n",
- memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
return 0;
}
@@ -5802,6 +5847,12 @@ static struct cftype memory_files[] = {
.seq_show = memory_events_show,
},
{
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,