From c4843a7593a9df3ff5b1806084cefdfa81dd7c79 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 22 May 2015 17:13:16 -0400 Subject: memcg: add per cgroup dirty page accounting When modifying PG_Dirty on cached file pages, update the new MEM_CGROUP_STAT_DIRTY counter. This is done in the same places where global NR_FILE_DIRTY is managed. The new memcg stat is visible in the per memcg memory.stat cgroupfs file. The most recent past attempt at this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632 The new accounting supports future efforts to add per cgroup dirty page throttling and writeback. It also helps an administrator break down a container's memory usage and provides evidence to understand memcg oom kills (the new dirty count is included in memcg oom kill messages). The ability to move page accounting between memcg (memory.move_charge_at_immigrate) makes this accounting more complicated than the global counter. The existing mem_cgroup_{begin,end}_page_stat() lock is used to serialize move accounting with stat updates. Typical update operation: memcg = mem_cgroup_begin_page_stat(page) if (TestSetPageDirty()) { [...] mem_cgroup_update_page_stat(memcg) } mem_cgroup_end_page_stat(memcg) Summary of mem_cgroup_end_page_stat() overhead: - Without CONFIG_MEMCG it's a no-op - With CONFIG_MEMCG and no inter memcg task movement, it's just rcu_read_lock() - With CONFIG_MEMCG and inter memcg task movement, it's rcu_read_lock() + spin_lock_irqsave() A memcg parameter is added to several routines because their callers now grab mem_cgroup_begin_page_stat() which returns the memcg later needed by for mem_cgroup_update_page_stat(). Because mem_cgroup_begin_page_stat() may disable interrupts, some adjustments are needed: - move __mark_inode_dirty() from __set_page_dirty() to its caller. __mark_inode_dirty() locking does not want interrupts disabled. - use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in __delete_from_page_cache(), replace_page_cache_page(), invalidate_complete_page2(), and __remove_mapping(). text data bss dec hex filename 8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before 8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after +192 text bytes 8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before 8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after +773 text bytes Performance tests run on v4.0-rc1-36-g4f671fe2f952. Lower is better for all metrics, they're all wall clock or cycle counts. The read and write fault benchmarks just measure fault time, they do not include I/O time. * CONFIG_MEMCG not set: baseline patched kbuild 1m25.030000(+-0.088% 3 samples) 1m25.426667(+-0.120% 3 samples) dd write 100 MiB 0.859211561 +-15.10% 0.874162885 +-15.03% dd write 200 MiB 1.670653105 +-17.87% 1.669384764 +-11.99% dd write 1000 MiB 8.434691190 +-14.15% 8.474733215 +-14.77% read fault cycles 254.0(+-0.000% 10 samples) 253.0(+-0.000% 10 samples) write fault cycles 2021.2(+-3.070% 10 samples) 1984.5(+-1.036% 10 samples) * CONFIG_MEMCG=y root_memcg: baseline patched kbuild 1m25.716667(+-0.105% 3 samples) 1m25.686667(+-0.153% 3 samples) dd write 100 MiB 0.855650830 +-14.90% 0.887557919 +-14.90% dd write 200 MiB 1.688322953 +-12.72% 1.667682724 +-13.33% dd write 1000 MiB 8.418601605 +-14.30% 8.673532299 +-15.00% read fault cycles 266.0(+-0.000% 10 samples) 266.0(+-0.000% 10 samples) write fault cycles 2051.7(+-1.349% 10 samples) 2049.6(+-1.686% 10 samples) * CONFIG_MEMCG=y non-root_memcg: baseline patched kbuild 1m26.120000(+-0.273% 3 samples) 1m25.763333(+-0.127% 3 samples) dd write 100 MiB 0.861723964 +-15.25% 0.818129350 +-14.82% dd write 200 MiB 1.669887569 +-13.30% 1.698645885 +-13.27% dd write 1000 MiB 8.383191730 +-14.65% 8.351742280 +-14.52% read fault cycles 265.7(+-0.172% 10 samples) 267.0(+-0.000% 10 samples) write fault cycles 2070.6(+-1.512% 10 samples) 2084.4(+-2.148% 10 samples) As expected anon page faults are not affected by this patch. tj: Updated to apply on top of the recent cancel_dirty_page() changes. Signed-off-by: Sha Zhengju Signed-off-by: Greg Thelen Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/page-writeback.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 227b867598e1..bdeecad00489 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2090,15 +2090,20 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. + * + * Caller must hold mem_cgroup_begin_page_stat(). + * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping) +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg) { trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(bdi, BDI_RECLAIMABLE); @@ -2112,10 +2117,14 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. + * + * Caller must hold mem_cgroup_begin_page_stat(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping) +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2136,26 +2145,34 @@ void account_page_cleaned(struct page *page, struct address_space *mapping) */ int __set_page_dirty_nobuffers(struct page *page) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; - if (!mapping) + if (!mapping) { + mem_cgroup_end_page_stat(memcg); return 1; + } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } + mem_cgroup_end_page_stat(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2273,8 +2290,20 @@ EXPORT_SYMBOL(set_page_dirty_lock); */ void cancel_dirty_page(struct page *page) { - if (TestClearPageDirty(page)) - account_page_cleaned(page, page_mapping(page)); + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping, memcg); + + mem_cgroup_end_page_stat(memcg); + } else { + ClearPageDirty(page); + } } EXPORT_SYMBOL(cancel_dirty_page); @@ -2295,6 +2324,8 @@ EXPORT_SYMBOL(cancel_dirty_page); int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + int ret = 0; BUG_ON(!PageLocked(page)); @@ -2334,13 +2365,16 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ + memcg = mem_cgroup_begin_page_stat(page); if (TestClearPageDirty(page)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - return 1; + ret = 1; } - return 0; + mem_cgroup_end_page_stat(memcg); + return ret; } return TestClearPageDirty(page); } -- cgit v1.2.3-55-g7522