summaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c771
1 files changed, 462 insertions, 309 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..b8a6fdc21312 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,15 @@
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
+enum lumpy_mode {
+ LUMPY_MODE_NONE,
+ LUMPY_MODE_ASYNC,
+ LUMPY_MODE_SYNC,
+};
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -76,10 +85,10 @@ struct scan_control {
int order;
/*
- * Intend to reclaim enough contenious memory rather than to reclaim
- * enough amount memory. I.e, it's the mode for high order allocation.
+ * Intend to reclaim enough continuous memory rather than reclaim
+ * enough amount of memory. i.e, mode for high order allocation.
*/
- bool lumpy_reclaim_mode;
+ enum lumpy_mode lumpy_reclaim_mode;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -262,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
+static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+ bool sync)
+{
+ enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+
+ /*
+ * Some reclaim have alredy been failed. No worth to try synchronous
+ * lumpy reclaim.
+ */
+ if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+ return;
+
+ /*
+ * If we need a large contiguous chunk of memory, or have
+ * trouble getting a small set of contiguous pages, we
+ * will reclaim both active and inactive pages.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ sc->lumpy_reclaim_mode = mode;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ sc->lumpy_reclaim_mode = mode;
+ else
+ sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+}
+
+static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+{
+ sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -272,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi)
+static int may_write_to_queue(struct backing_dev_info *bdi,
+ struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -280,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
return 1;
if (bdi == current->backing_dev_info)
return 1;
+
+ /* lumpy reclaim for hugepage often need a lot of write */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
return 0;
}
@@ -304,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
-/* Request for sync pageout. */
-enum pageout_io {
- PAGEOUT_IO_ASYNC,
- PAGEOUT_IO_SYNC,
-};
-
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -327,7 +365,7 @@ typedef enum {
* Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping,
- enum pageout_io sync_writeback)
+ struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -363,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info))
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -373,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1,
};
@@ -391,13 +428,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
- if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ if (PageWriteback(page) &&
+ sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page,
+ trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -575,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->lumpy_reclaim_mode)
+ if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
return PAGEREF_RECLAIM;
/*
@@ -611,27 +651,46 @@ static enum page_references page_check_references(struct page *page,
}
/* Reclaim if clean, defer dirty pages to writeback */
- if (referenced_page)
+ if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
}
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+ struct pagevec freed_pvec;
+ struct page *page, *tmp;
+
+ pagevec_init(&freed_pvec, 1);
+
+ list_for_each_entry_safe(page, tmp, free_pages, lru) {
+ list_del(&page->lru);
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ }
+
+ pagevec_free(&freed_pvec);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc,
- enum pageout_io sync_writeback)
+ struct zone *zone,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
enum page_references references;
struct address_space *mapping;
@@ -647,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -672,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+ may_enter_fs)
wait_on_page_writeback(page);
- else
- goto keep_locked;
+ else {
+ unlock_page(page);
+ goto keep_lumpy;
+ }
}
references = page_check_references(page, sc);
@@ -721,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
+ nr_dirty++;
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -729,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch (pageout(page, mapping, sync_writeback)) {
+ switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page))
+ goto keep_lumpy;
+ if (PageDirty(page))
goto keep;
+
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -806,10 +875,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ list_add(&page->lru, &free_pages);
continue;
cull_mlocked:
@@ -817,6 +888,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
+ disable_lumpy_reclaim_mode(sc);
continue;
activate_locked:
@@ -829,12 +901,24 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
+ disable_lumpy_reclaim_mode(sc);
+keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+
+ /*
+ * Tag a zone as congested if all the dirty pages encountered were
+ * backed by a congested BDI. In this case, reclaimers should just
+ * back off and wait for congestion to clear because further reclaim
+ * will encounter the same problem
+ */
+ if (nr_dirty == nr_congested)
+ zone_set_flag(zone, ZONE_CONGESTED);
+
+ free_page_list(&free_pages);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -916,6 +1000,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order, int mode, int file)
{
unsigned long nr_taken = 0;
+ unsigned long nr_lumpy_taken = 0;
+ unsigned long nr_lumpy_dirty = 0;
+ unsigned long nr_lumpy_failed = 0;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -978,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
- continue;
+ break;
/*
* If we don't have enough swap space, reclaiming of
@@ -986,19 +1073,37 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* pointless.
*/
if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
- !PageSwapCache(cursor_page))
- continue;
+ !PageSwapCache(cursor_page))
+ break;
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
nr_taken++;
+ nr_lumpy_taken++;
+ if (PageDirty(cursor_page))
+ nr_lumpy_dirty++;
scan++;
+ } else {
+ /* the page is freed already. */
+ if (!page_count(cursor_page))
+ continue;
+ break;
}
}
+
+ /* If we break out of the loop above, lumpy reclaim failed */
+ if (pfn < end_pfn)
+ nr_lumpy_failed++;
}
*scanned = scan;
+
+ trace_mm_vmscan_lru_isolate(order,
+ nr_to_scan, scan,
+ nr_taken,
+ nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+ mode);
return nr_taken;
}
@@ -1035,7 +1140,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
ClearPageActive(page);
nr_active++;
}
- count[lru]++;
+ if (count)
+ count[lru]++;
}
return nr_active;
@@ -1112,174 +1218,209 @@ static int too_many_isolated(struct zone *zone, int file,
}
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+ unsigned long nr_anon, unsigned long nr_file,
+ struct list_head *page_list)
{
- LIST_HEAD(page_list);
+ struct page *page;
struct pagevec pvec;
- unsigned long nr_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ pagevec_init(&pvec, 1);
- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
+ /*
+ * Put back any unfreeable pages.
+ */
+ spin_lock(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ int lru;
+ page = lru_to_page(page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ reclaim_stat->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
}
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+ struct scan_control *sc,
+ unsigned long *nr_anon,
+ unsigned long *nr_file,
+ struct list_head *isolated_list)
+{
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- pagevec_init(&pvec, 1);
+ nr_active = clear_active_flags(isolated_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
- if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, 0, file);
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- } else {
- nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, sc->mem_cgroup,
- 0, file);
- /*
- * mem_cgroup_isolate_pages() keeps track of
- * scanned pages on its own.
- */
- }
+ *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
- if (nr_taken == 0)
- goto done;
+ reclaim_stat->recent_scanned[0] += *nr_anon;
+ reclaim_stat->recent_scanned[1] += *nr_file;
+}
- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when really
+ * need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+ unsigned long nr_freed,
+ int priority,
+ struct scan_control *sc)
+{
+ int lumpy_stall_priority;
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+ /* kswapd should not stall on sync IO */
+ if (current_is_kswapd())
+ return false;
- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ /* Only stall on lumpy reclaim */
+ if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+ return false;
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ /* If we have relaimed everything on the isolated list, no stall */
+ if (nr_freed == nr_taken)
+ return false;
- spin_unlock_irq(&zone->lru_lock);
+ /*
+ * For high-order allocations, there are two stall thresholds.
+ * High-cost allocations stall immediately where as lower
+ * order allocations such as stacks require the scanning
+ * priority to be much higher before stalling.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ lumpy_stall_priority = DEF_PRIORITY;
+ else
+ lumpy_stall_priority = DEF_PRIORITY / 3;
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ return priority <= lumpy_stall_priority;
+}
- /*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
- */
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- sc->lumpy_reclaim_mode) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_taken;
+ unsigned long nr_anon;
+ unsigned long nr_file;
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
- nr_reclaimed += nr_freed;
+ set_lumpy_reclaim_mode(priority, sc, false);
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
- local_irq_disable();
+ if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
+ ISOLATE_INACTIVE : ISOLATE_BOTH,
+ zone, 0, file);
+ zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
-
- spin_lock(&zone->lru_lock);
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scanned);
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
+ ISOLATE_INACTIVE : ISOLATE_BOTH,
+ zone, sc->mem_cgroup,
+ 0, file);
/*
- * Put back any unfreeable pages.
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
*/
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ }
- } while (nr_scanned < max_scan);
+ if (nr_taken == 0) {
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+ }
+
+ update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
-done:
spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone. This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
- if (priority < zone->prev_priority)
- zone->prev_priority = priority;
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+
+ /* Check if we should syncronously wait for writeback */
+ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+ set_lumpy_reclaim_mode(priority, sc, true);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+ }
+
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+
+ putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+
+ trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
+ zone_idx(zone),
+ nr_scanned, nr_reclaimed,
+ priority,
+ trace_shrink_flags(file, sc->lumpy_reclaim_mode));
+ return nr_reclaimed;
}
/*
@@ -1426,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
}
+#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
@@ -1451,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!total_swap_pages)
+ return 0;
+
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
+#else
+static inline int inactive_anon_is_low(struct zone *zone,
+ struct scan_control *sc)
+{
+ return 0;
+}
+#endif
static int inactive_file_is_low_global(struct zone *zone)
{
@@ -1583,6 +1739,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
}
/*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
+
+ /*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
@@ -1593,28 +1756,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+ spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
- spin_unlock_irq(&zone->lru_lock);
}
/*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = sc->swappiness;
- file_prio = 200 - sc->swappiness;
-
- /*
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
@@ -1624,6 +1777,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
+ spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
@@ -1643,21 +1797,6 @@ out:
}
}
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
-{
- /*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
- */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->lumpy_reclaim_mode = 1;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->lumpy_reclaim_mode = 1;
- else
- sc->lumpy_reclaim_mode = 0;
-}
-
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
@@ -1672,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
get_scan_count(zone, sc, nr, priority);
- set_lumpy_reclaim_mode(priority, sc);
-
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -1704,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
+ if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
@@ -1726,16 +1863,14 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- sc->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1745,22 +1880,43 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
if (scanning_global_lru(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- note_zone_scanning_priority(zone, priority);
-
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- } else {
- /*
- * Ignore cpuset limitation here. We just want to reduce
- * # of used pages by us regardless of memory shortage.
- */
- mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
- priority);
}
shrink_zone(priority, zone, sc);
- all_unreclaimable = false;
}
+}
+
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ bool all_unreclaimable = true;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone)) {
+ all_unreclaimable = false;
+ break;
+ }
+ }
+
return all_unreclaimable;
}
@@ -1784,13 +1940,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
get_mems_allowed();
@@ -1798,29 +1951,26 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
- /*
- * mem_cgroup will not do shrink_slab.
- */
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- }
- }
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- all_unreclaimable = shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
if (scanning_global_lru(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1846,32 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
-out:
- /*
- * Now that we've scanned all the zones at this priority level, note
- * that level within the zone so that the next thread which performs
- * scanning of this zone will immediately start out at this priority
- * level. This affects only the decision whether or not to bring
- * mapped pages onto the inactive list.
- */
- if (priority < 0)
- priority = 0;
-
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- zone->prev_priority = priority;
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ NULL, &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
- } else
- mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+ }
+out:
delayacct_freepages_end();
put_mems_allowed();
@@ -1879,7 +2013,7 @@ out:
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
- if (scanning_global_lru(sc) && !all_unreclaimable)
+ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@@ -1888,6 +2022,7 @@ out:
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
@@ -1900,7 +2035,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nodemask = nodemask,
};
- return do_try_to_free_pages(zonelist, &sc);
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1908,9 +2051,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone, int nid)
+ struct zone *zone)
{
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
@@ -1918,13 +2062,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.order = 0,
.mem_cgroup = mem,
};
- nodemask_t nm = nodemask_of_node(nid);
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- sc.nodemask = &nm;
- sc.nr_reclaimed = 0;
- sc.nr_scanned = 0;
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +2077,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed;
}
@@ -1942,6 +2089,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
unsigned int swappiness)
{
struct zonelist *zonelist;
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
@@ -1956,7 +2104,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
- return do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}
#endif
@@ -2028,22 +2185,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.order = order,
.mem_cgroup = NULL,
};
- /*
- * temp_priority is used to remember the scanning priority at which
- * this zone was successfully refilled to
- * free_pages == high_wmark_pages(zone).
- */
- int temp_priority[MAX_NR_ZONES];
-
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++)
- temp_priority[i] = DEF_PRIORITY;
-
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
@@ -2103,7 +2250,6 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- int nid, zid;
if (!populated_zone(zone))
continue;
@@ -2111,18 +2257,14 @@ loop_again:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- temp_priority[i] = priority;
sc.nr_scanned = 0;
- note_zone_scanning_priority(zone, priority);
- nid = pgdat->node_id;
- zid = zone_idx(zone);
/*
* Call soft limit reclaim before calling shrink_zone.
* For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
- nid, zid);
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
@@ -2137,8 +2279,7 @@ loop_again:
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 &&
- zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+ if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2160,6 +2301,15 @@ loop_again:
if (!zone_watermark_ok(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * spectulatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}
@@ -2186,16 +2336,6 @@ loop_again:
break;
}
out:
- /*
- * Note within each zone the priority level at which this zone was
- * brought into a happy state. So that the next thread which scans this
- * zone will start out at that priority level.
- */
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->prev_priority = temp_priority[i];
- }
if (!all_zones_ok) {
cond_resched();
@@ -2299,9 +2439,10 @@ static int kswapd(void *p)
* premature sleep. If not, then go fully
* to sleep until explicitly woken up
*/
- if (!sleeping_prematurely(pgdat, order, remaining))
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
schedule();
- else {
+ } else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
@@ -2321,8 +2462,10 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret)
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balance_pgdat(pgdat, order);
+ }
}
return 0;
}
@@ -2342,6 +2485,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2734,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.swappiness = vm_swappiness,
.order = order,
};
- unsigned long slab_reclaimable;
+ unsigned long nr_slab_pages0, nr_slab_pages1;
- disable_swap_token();
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2754,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- note_zone_scanning_priority(zone, priority);
shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
- slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (slab_reclaimable > zone->min_slab_pages) {
+ nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages0 > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
@@ -2629,17 +2771,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
- slab_reclaimable - nr_pages)
- ;
+ for (;;) {
+ unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+ /* No reclaimable slab or very low memory pressure */
+ if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+ break;
+
+ /* Freed enough memory */
+ nr_slab_pages1 = zone_page_state(zone,
+ NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+ break;
+ }
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- sc.nr_reclaimed += slab_reclaimable -
- zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 < nr_slab_pages0)
+ sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
}
p->reclaim_state = NULL;
@@ -2898,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
@@ -2944,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
{
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
-
+#endif