summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorTejun Heo2015-05-23 00:23:35 +0200
committerJens Axboe2015-06-02 16:38:13 +0200
commitc2aa723a6093633ae4ec15b08a4db276643cab3e (patch)
tree0dc3b9a2f2cc89aa149620c385446f9593b4e28b /mm/memcontrol.c
parentwriteback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes (diff)
downloadkernel-qcow2-linux-c2aa723a6093633ae4ec15b08a4db276643cab3e.tar.gz
kernel-qcow2-linux-c2aa723a6093633ae4ec15b08a4db276643cab3e.tar.xz
kernel-qcow2-linux-c2aa723a6093633ae4ec15b08a4db276643cab3e.zip
writeback: implement memcg writeback domain based throttling
While cgroup writeback support now connects memcg and blkcg so that writeback IOs are properly attributed and controlled, the IO back pressure propagation mechanism implemented in balance_dirty_pages() and its subroutines wasn't aware of cgroup writeback. Processes belonging to a memcg may have access to only subset of total memory available in the system and not factoring this into dirty throttling rendered it completely ineffective for processes under memcg limits and memcg ended up building a separate ad-hoc degenerate mechanism directly into vmscan code to limit page dirtying. The previous patches updated balance_dirty_pages() and its subroutines so that they can deal with multiple wb_domain's (writeback domains) and defined per-memcg wb_domain. Processes belonging to a non-root memcg are bound to two wb_domains, global wb_domain and memcg wb_domain, and should be throttled according to IO pressures from both domains. This patch updates dirty throttling code so that it repeats similar calculations for the two domains - the differences between the two are few and minor - and applies the lower of the two sets of resulting constraints. wb_over_bg_thresh(), which controls when background writeback terminates, is also updated to consider both global and memcg wb_domains. It returns true if dirty is over bg_thresh for either domain. This makes the dirty throttling mechanism operational for memcg domains including writeback-bandwidth-proportional dirty page distribution inside them but the ad-hoc memcg throttling mechanism in vmscan is still in place. The next patch will rip it out. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Jan Kara <jack@suse.cz> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Greg Thelen <gthelen@google.com> Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c43
1 files changed, 43 insertions, 0 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0b0406ae5ca..f816d91c643b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4020,6 +4020,49 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg. Dirty and writeback are self-explanatory. Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used". The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages. Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+ unsigned long *pdirty, unsigned long *pwriteback)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+ unsigned long head_room = PAGE_COUNTER_MAX;
+ unsigned long file_pages;
+
+ *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+ *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+ file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ while ((parent = parent_mem_cgroup(memcg))) {
+ unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long used = page_counter_read(&memcg->memory);
+
+ head_room = min(head_room, ceiling - min(ceiling, used));
+ memcg = parent;
+ }
+
+ *pavail = file_pages + head_room;
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)