summaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c223
1 files changed, 156 insertions, 67 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b9f577b1a2a..c02cff1ed56e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -266,7 +266,20 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
+ * are not on separate NUMA nodes. Functionally this works but with
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
+ * quite small. By default, do not boost watermarks on discontigmem as in
+ * many cases very high-order allocations like THP are likely to be
+ * unsupported and the premature reclaim offsets the advantage of long-term
+ * fragmentation avoidance.
+ */
+int watermark_boost_factor __read_mostly;
+#else
int watermark_boost_factor __read_mostly = 15000;
+#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
@@ -289,8 +302,8 @@ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -789,6 +802,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions*/
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations polluate a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -822,6 +886,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -837,6 +902,11 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -1056,7 +1126,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1303,7 +1373,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1382,7 +1452,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1472,14 +1542,14 @@ static void __init deferred_free_range(unsigned long pfn,
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
+ __free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
+ __free_pages_core(page, 0);
}
}
@@ -1945,8 +2015,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -2962,7 +3032,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -3173,24 +3243,14 @@ static int __init fail_page_alloc_debugfs(void)
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
- return 0;
-fail:
- debugfs_remove_recursive(dir);
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
- return -ENOMEM;
+ return 0;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3372,8 +3432,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
alloc_flags |= ALLOC_KSWAPD;
#ifdef CONFIG_ZONE_DMA32
+ if (!zone)
+ return alloc_flags;
+
if (zone_idx(zone) != ZONE_NORMAL)
- goto out;
+ return alloc_flags;
/*
* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
@@ -3382,9 +3445,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
if (nr_online_nodes > 1 && !populated_zone(--zone))
- goto out;
+ return alloc_flags;
-out:
+ alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
return alloc_flags;
}
@@ -3710,7 +3773,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
@@ -3721,21 +3784,24 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
- return NULL;
-
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -4568,7 +4634,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4761,6 +4827,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
@@ -4781,6 +4849,8 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -4814,11 +4884,13 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4845,6 +4917,9 @@ static unsigned long nr_free_zone_pages(int offset)
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4857,6 +4932,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -5303,7 +5380,8 @@ static int node_load[MAX_NUMNODES];
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5609,7 +5687,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -6016,7 +6094,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -6214,7 +6292,7 @@ unsigned long __init __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -6376,10 +6454,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
- memblock_alloc_node_nopanic(usemapsize,
- pgdat->node_id);
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6608,7 +6690,11 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6764,14 +6850,14 @@ void __init setup_nr_node_ids(void)
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6819,7 +6905,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -7267,7 +7353,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
@@ -7496,7 +7581,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -7886,8 +7971,7 @@ void *__init alloc_large_system_hash(const char *tablename,
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
- table = memblock_alloc_nopanic(size,
- SMP_CACHE_BYTES);
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
else
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
@@ -7932,7 +8016,10 @@ void *__init alloc_large_system_hash(const char *tablename,
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
int migratetype, int flags)
{
- unsigned long pfn, iter, found;
+ unsigned long found;
+ unsigned long iter = 0;
+ unsigned long pfn = page_to_pfn(page);
+ const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -7942,17 +8029,20 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* can still lead to having bootmem allocations in zone_movable.
*/
- /*
- * CMA allocations (alloc_contig_range) really need to mark isolate
- * CMA pageblocks even when they are not movable in fact so consider
- * them movable here.
- */
- if (is_migrate_cma(migratetype) &&
- is_migrate_cma(get_pageblock_migratetype(page)))
- return false;
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return false;
+
+ reason = "CMA page";
+ goto unmovable;
+ }
- pfn = page_to_pfn(page);
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ for (found = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
if (!pfn_valid_within(check))
@@ -7973,7 +8063,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
/*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
@@ -8032,7 +8122,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
if (flags & REPORT_FAILURE)
- dump_page(pfn_to_page(pfn+iter), "unmovable page");
+ dump_page(pfn_to_page(pfn + iter), reason);
return true;
}
@@ -8112,7 +8202,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -8160,7 +8250,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret)
+ if (ret < 0)
return ret;
/*
@@ -8196,7 +8286,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;