Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  151
1 file changed, 99 insertions(+), 52 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7acd0afdfc2a..44df66a98f2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
unsigned int file_taken;
unsigned int taken;
} nr;
+
+ /* for recording the slab pages reclaimed so far */
+ struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
}
#endif /* CONFIG_MEMCG_KMEM */
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
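As a reading aid (not part of the commit): the helper above is meant to bracket every reclaim entry point, publishing the reclaim_state embedded in struct scan_control via task_struct and clearing it again on the way out. A minimal usage sketch with a hypothetical function name, wrapping do_try_to_free_pages() as the reclaim call:

static unsigned long reclaim_with_state(struct zonelist *zonelist,
					struct scan_control *sc)
{
	unsigned long nr_reclaimed;

	/* Publish sc->reclaim_state so slab reclaim can account into it. */
	set_task_reclaim_state(current, &sc->reclaim_state);

	nr_reclaimed = do_try_to_free_pages(zonelist, sc);

	/* Clear it before returning; the WARN_ON_ONCE() checks catch misuse. */
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}

The hunks in try_to_free_pages(), mem_cgroup_shrink_node(), try_to_free_mem_cgroup_pages(), balance_pgdat(), shrink_all_memory() and __node_reclaim() further down follow exactly this pattern.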
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -1118,6 +1133,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ unsigned int nr_pages;
cond_resched();
@@ -1129,7 +1145,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
VM_BUG_ON_PAGE(PageActive(page), page);
- sc->nr_scanned++;
+ nr_pages = 1 << compound_order(page);
+
+ /* Account the number of base pages, even for a THP */
+ sc->nr_scanned += nr_pages;
if (unlikely(!page_evictable(page)))
goto activate_locked;
@@ -1137,11 +1156,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
- /* Double the slab pressure for mapped and swapcache pages */
- if ((page_mapped(page) || PageSwapCache(page)) &&
- !(PageAnon(page) && !PageSwapBacked(page)))
- sc->nr_scanned++;
-
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -1255,7 +1269,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
- stat->nr_ref_keep++;
+ stat->nr_ref_keep += nr_pages;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1287,7 +1301,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
- goto activate_locked;
+ goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_huge_page_to_list(page,
page_list))
@@ -1296,7 +1310,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(page))
- goto activate_locked;
+ goto activate_locked_split;
}
may_enter_fs = 1;
@@ -1311,6 +1325,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
/*
+ * THP may have been split above; subtract the tail pages and update
+ * nr_pages so the tail pages are not accounted twice.
+ *
+ * Tail pages that were added to the swap cache successfully
+ * reach here.
+ */
+ if ((nr_pages > 1) && !PageTransHuge(page)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
+
+ /*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
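To make the accounting above concrete, a worked example with assumed numbers (a 2MB THP with 4KB base pages, so compound_order(page) == 9); this is an illustration, not code from the commit:

	nr_pages = 1 << compound_order(page);	/* 1 << 9 == 512 base pages */
	sc->nr_scanned += nr_pages;		/* all 512 counted up front */

	/*
	 * add_to_swap() failed and split_huge_page_to_list() split the THP:
	 * the tail pages went back onto page_list and will be scanned (and
	 * counted) on their own, so keep only the head page here.
	 */
	if ((nr_pages > 1) && !PageTransHuge(page)) {
		sc->nr_scanned -= (nr_pages - 1);	/* drop the 511 tail pages */
		nr_pages = 1;
	}

The same per-base-page accounting is applied to scan, total_scan and the nr_skipped[] counters in isolate_lru_pages() below.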
@@ -1320,7 +1346,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
- stat->nr_unmap_fail++;
+ stat->nr_unmap_fail += nr_pages;
goto activate_locked;
}
}
@@ -1447,7 +1473,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unlock_page(page);
free_it:
- nr_reclaimed++;
+ /*
+ * A THP may be swapped out as a whole, so account
+ * all of its base pages.
+ */
+ nr_reclaimed += nr_pages;
/*
* Is there need to periodically free_page_list? It would
@@ -1460,6 +1490,15 @@ free_it:
list_add(&page->lru, &free_pages);
continue;
+activate_locked_split:
+ /*
+ * Tail pages that failed to be added to the swap cache
+ * reach here. Fix up nr_scanned and nr_pages.
+ */
+ if (nr_pages > 1) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
@@ -1469,8 +1508,7 @@ activate_locked:
if (!PageMlocked(page)) {
int type = page_is_file_cache(page);
SetPageActive(page);
- pgactivate++;
- stat->nr_activate[type] += hpage_nr_pages(page);
+ stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1480,6 +1518,8 @@ keep:
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_unref_page_list(&free_pages);
@@ -1505,7 +1545,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -1651,10 +1691,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
LIST_HEAD(pages_skipped);
isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+ total_scan = 0;
scan = 0;
- for (total_scan = 0;
- scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
- total_scan++) {
+ while (scan < nr_to_scan && !list_empty(src)) {
struct page *page;
page = lru_to_page(src);
@@ -1662,9 +1701,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
VM_BUG_ON_PAGE(!PageLRU(page), page);
+ nr_pages = 1 << compound_order(page);
+ total_scan += nr_pages;
+
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
- nr_skipped[page_zonenum(page)]++;
+ nr_skipped[page_zonenum(page)] += nr_pages;
continue;
}
@@ -1673,11 +1715,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
+ *
+ * Account all tail pages of THP. This would not cause
+ * premature OOM since __isolate_lru_page() returns -EBUSY
+ * only when the page is being freed somewhere else.
*/
- scan++;
+ scan += nr_pages;
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
@@ -1953,8 +1998,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (global_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
- reclaim_stat->recent_rotated[0] = stat.nr_activate[0];
- reclaim_stat->recent_rotated[1] = stat.nr_activate[1];
+ reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
move_pages_to_lru(lruvec, &page_list);
@@ -2125,7 +2170,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2196,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* rid of the stale workingset quickly.
*/
refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2206,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -3161,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
+ set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3188,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
};
unsigned long lru_pages;
+ set_task_reclaim_state(current, &sc.reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -3205,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
*nr_scanned = sc.nr_scanned;
+
return sc.nr_reclaimed;
}
@@ -3232,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_shrinkslab = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -3252,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3453,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_unmap = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
@@ -3634,6 +3687,8 @@ out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3644,19 +3699,18 @@ out:
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * pgdat->kswapd_classzone_idx is used by the waker to pass the highest zone
+ * index that kswapd should reclaim up to. If the value is MAX_NR_ZONES, which
+ * is not a valid index, then either kswapd is running for the first time or it
+ * could not sleep after the previous reclaim attempt (the node is still
+ * unbalanced). In that case, return the zone index of the previous kswapd
+ * reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
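A worked example of the behavioural change (hypothetical values, not from the commit): assume kswapd was last asked to balance up to ZONE_NORMAL, could not go back to sleep because the node is still unbalanced, and no new wakeup has been recorded, so kswapd_classzone_idx still holds MAX_NR_ZONES:

	/* Assumed state at the top of kswapd's retry loop. */
	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;	/* no wakeup pending */
	classzone_idx = ZONE_NORMAL;			/* previous reclaim target */

	/*
	 * The old code called kswapd_classzone_idx(pgdat, 0) and fell back to
	 * zone index 0; the new caller further down passes the previous
	 * classzone_idx, so the earlier reclaim target is preserved.
	 */
	classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);	/* ZONE_NORMAL */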
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3758,15 +3812,10 @@ static int kswapd(void *p)
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3797,7 +3846,7 @@ kswapd_try_sleep:
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
@@ -3828,7 +3877,6 @@ kswapd_try_sleep:
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
return 0;
}
@@ -3851,8 +3899,12 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
@@ -3889,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3901,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- struct task_struct *p = current;
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
+ set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
@@ -4077,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4102,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
@@ -4115,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
+ set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);