summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c496
1 files changed, 277 insertions, 219 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba9138a4a1de..cdbb7a84cb6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -485,7 +486,10 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- memcg = READ_ONCE(page->mem_cgroup);
+ if (PageHead(page) && PageSlab(page))
+ memcg = memcg_from_slab_page(page);
+ else
+ memcg = READ_ONCE(page->mem_cgroup);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -691,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmstats[idx]);
x = 0;
@@ -745,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
- /* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -773,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->events[idx], count);
-
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmevents[idx]);
x = 0;
@@ -1163,7 +1175,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, 0, &it);
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1255,32 +1267,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
*lru_size += nr_pages;
}
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1356,27 +1342,114 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
-static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- NR_SHMEM,
- NR_FILE_MAPPED,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
- MEMCG_SWAP,
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
-static const char *const memcg1_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "shmem",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
-};
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_buf_printf(&s, "anon %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "kernel_stack %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ 1024);
+ seq_buf_printf(&s, "slab %llu\n",
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "sock %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "shmem %llu\n",
+ (u64)memcg_page_state(memcg, NR_SHMEM) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_mapped %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_dirty %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_writeback %llu\n",
+ (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+ PAGE_SIZE);
+
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_buf_printf(&s, "anon_thp %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "slab_reclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+ PAGE_SIZE);
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+ seq_buf_printf(&s, "workingset_refault %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT));
+ seq_buf_printf(&s, "workingset_activate %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+ memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+ seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
@@ -1411,39 +1484,32 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
- struct mem_cgroup *iter;
- unsigned int i;
+ char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
K((u64)memcg->memory.max), memcg->memory.failcnt);
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state_local(iter,
- memcg1_stats[i])));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(memcg_page_state_local(iter,
- NR_LRU_BASE + i)));
-
- pr_cont("\n");
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)memcg->swap.max), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
}
/*
@@ -2279,7 +2345,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
@@ -2366,7 +2431,7 @@ retry:
if (nr_retries--)
goto retry;
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
@@ -2385,7 +2450,6 @@ retry:
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- oomed = true;
goto retry;
case OOM_FAILED:
goto force;
@@ -2588,12 +2652,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
{
struct memcg_kmem_cache_create_work *cw;
+ if (!css_tryget_online(&memcg->css))
+ return;
+
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
- css_get(&memcg->css);
-
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2628,6 +2693,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ struct memcg_cache_array *arr;
int kmemcg_id;
VM_BUG_ON(!is_root_cache(cachep));
@@ -2635,14 +2701,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (memcg_kmem_bypass())
return cachep;
- memcg = get_mem_cgroup_from_current();
+ rcu_read_lock();
+
+ if (unlikely(current->active_memcg))
+ memcg = current->active_memcg;
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ if (!memcg || memcg == root_mem_cgroup)
+ goto out_unlock;
+
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
- goto out;
+ goto out_unlock;
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
- return memcg_cachep;
+ arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match the data dependency
+ * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+ */
+ memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2655,10 +2735,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
+ *
+ * If the memcg is dying or memcg_cache is about to be released,
+ * don't bother creating new kmem_caches. Because memcg_cachep
+ * is ZEROed as the fist step of kmem offlining, we don't need
+ * percpu_ref_tryget_live() here. css_tryget_online() check in
+ * memcg_schedule_kmem_cache_create() will prevent us from
+ * creation of a new kmem_cache.
*/
- memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
- css_put(&memcg->css);
+ if (unlikely(!memcg_cachep))
+ memcg_schedule_kmem_cache_create(memcg, cachep);
+ else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+ cachep = memcg_cachep;
+out_unlock:
+ rcu_read_unlock();
return cachep;
}
@@ -2669,7 +2759,7 @@ out:
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
+ percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
@@ -2697,9 +2787,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
@@ -2722,12 +2809,30 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ }
}
css_put(&memcg->css);
return ret;
}
+
+/**
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
/**
* __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
@@ -2742,14 +2847,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
+ __memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -3168,15 +3266,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
*/
memcg->kmem_state = KMEM_ALLOCATED;
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
+ memcg_deactivate_kmem_caches(memcg, parent);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
/*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
@@ -3207,9 +3305,8 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
+ WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -3472,6 +3569,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
+static const unsigned int memcg1_stats[] = {
+ MEMCG_CACHE,
+ MEMCG_RSS,
+ MEMCG_RSS_HUGE,
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+ "rss_huge",
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
+
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
PGPGIN,
@@ -3530,12 +3649,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)memcg_events(memcg, i));
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -4634,6 +4754,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+ INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -4793,7 +4916,7 @@ enum mc_target_type {
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
+ struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
@@ -4994,8 +5117,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5029,8 +5152,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_public_page(page))
+ if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -5101,8 +5223,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
- * MEMORY_DEVICE_PRIVATE but this might change.
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
@@ -5625,112 +5747,42 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "low %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
- seq_printf(m, "high %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
- seq_printf(m, "max %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
- seq_printf(m, "oom %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
-
+ __memory_events_show(m, memcg->memory_events);
return 0;
}
-static int memory_stat_show(struct seq_file *m, void *v)
+static int memory_events_local_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- int i;
-
- /*
- * Provide statistics on the state of the memory subsystem as
- * well as cumulative event counters that show past behavior.
- *
- * This list is ordered following a combination of these gradients:
- * 1) generic big picture -> specifics and details
- * 2) reflecting userspace activity -> reflecting kernel heuristics
- *
- * Current memory state:
- */
- seq_printf(m, "anon %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
- seq_printf(m, "file %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
- seq_printf(m, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
- seq_printf(m, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
- PAGE_SIZE);
- seq_printf(m, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
-
- seq_printf(m, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
- seq_printf(m, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
- seq_printf(m, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
- seq_printf(m, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
-
- /*
- * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
- * with the NR_ANON_THP vm counter, but right now it's a pain in the
- * arse because it requires migrating the work out of rmap to a place
- * where the page->mem_cgroup is set up and stable.
- */
- seq_printf(m, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
-
- seq_printf(m, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
- PAGE_SIZE);
- seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
- PAGE_SIZE);
-
- /* Accumulated memory events */
-
- seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
- seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
-
- seq_printf(m, "workingset_refault %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT));
- seq_printf(m, "workingset_activate %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE));
- seq_printf(m, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
- seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
- seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
- memcg_events(memcg, PGSCAN_DIRECT));
- seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
- memcg_events(memcg, PGSTEAL_DIRECT));
- seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
- seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
- seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
- seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+ __memory_events_show(m, memcg->memory_events_local);
+ return 0;
+}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_printf(m, "thp_fault_alloc %lu\n",
- memcg_events(memcg, THP_FAULT_ALLOC));
- seq_printf(m, "thp_collapse_alloc %lu\n",
- memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
return 0;
}
@@ -5802,6 +5854,12 @@ static struct cftype memory_files[] = {
.seq_show = memory_events_show,
},
{
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,