summaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c838
1 files changed, 538 insertions, 300 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 46285d28e43b..dbd0d5cbbcbb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
@@ -43,12 +44,12 @@
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
-#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -72,6 +73,7 @@
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#include "shuffle.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -133,6 +135,55 @@ unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+EXPORT_SYMBOL(init_on_alloc);
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_free);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_free);
+#endif
+EXPORT_SYMBOL(init_on_free);
+
+static int __init early_init_on_alloc(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ if (!buf)
+ return -EINVAL;
+ ret = kstrtobool(buf, &bool_result);
+ if (bool_result && page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
+ if (bool_result)
+ static_branch_enable(&init_on_alloc);
+ else
+ static_branch_disable(&init_on_alloc);
+ return ret;
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+static int __init early_init_on_free(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ if (!buf)
+ return -EINVAL;
+ ret = kstrtobool(buf, &bool_result);
+ if (bool_result && page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
+ if (bool_result)
+ static_branch_enable(&init_on_free);
+ else
+ static_branch_disable(&init_on_free);
+ return ret;
+}
+early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -221,8 +272,6 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
[ZONE_MOVABLE] = 0,
};
-EXPORT_SYMBOL(totalram_pages);
-
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
@@ -266,7 +315,20 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
+ * are not on separate NUMA nodes. Functionally this works but with
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
+ * quite small. By default, do not boost watermarks on discontigmem as in
+ * many cases very high-order allocations like THP are likely to be
+ * unsupported and the premature reclaim offsets the advantage of long-term
+ * fragmentation avoidance.
+ */
+int watermark_boost_factor __read_mostly;
+#else
int watermark_boost_factor __read_mostly = 15000;
+#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
@@ -289,8 +351,8 @@ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -630,30 +692,29 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+#else
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);
-bool _debug_guardpage_enabled __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
- if (!buf)
- return -EINVAL;
- return kstrtobool(buf, &_debug_pagealloc_enabled);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
+ bool enable = false;
-static bool need_debug_guardpage(void)
-{
- /* If we don't use debug_pagealloc, we don't need guard page */
- if (!debug_pagealloc_enabled())
- return false;
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
- if (!debug_guardpage_minorder())
- return false;
+ if (enable)
+ static_branch_enable(&_debug_pagealloc_enabled);
- return true;
+ return 0;
}
+early_param("debug_pagealloc", early_debug_pagealloc);
static void init_debug_guardpage(void)
{
@@ -663,14 +724,9 @@ static void init_debug_guardpage(void)
if (!debug_guardpage_minorder())
return;
- _debug_guardpage_enabled = true;
+ static_branch_enable(&_debug_guardpage_enabled);
}
-struct page_ext_operations debug_guardpage_ops = {
- .need = need_debug_guardpage,
- .init = init_debug_guardpage,
-};
-
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
@@ -688,20 +744,13 @@ early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
- struct page_ext *page_ext;
-
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return false;
-
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
-
+ __SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
@@ -713,23 +762,16 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
- struct page_ext *page_ext;
-
if (!debug_guardpage_enabled())
return;
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+ __ClearPageGuard(page);
set_page_private(page, 0);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -742,12 +784,6 @@ static inline void set_page_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-static inline void rmv_page_order(struct page *page)
-{
- __ClearPageBuddy(page);
- set_page_private(page, 0);
-}
-
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
@@ -789,6 +825,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions*/
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations polluate a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -822,6 +909,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -837,6 +925,11 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -848,13 +941,10 @@ continue_merging:
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
- if (page_is_guard(buddy)) {
+ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
- } else {
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
- }
+ else
+ del_page_from_free_area(buddy, &zone->free_area[order]);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -896,7 +986,8 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+ && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
@@ -904,15 +995,18 @@ done_merging:
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
- list_add_tail(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- goto out;
+ add_to_free_area_tail(page, &zone->free_area[order],
+ migratetype);
+ return;
}
}
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
- zone->free_area[order].nr_free++;
+ if (is_shuffle_order(order))
+ add_to_free_area_random(page, &zone->free_area[order],
+ migratetype);
+ else
+ add_to_free_area(page, &zone->free_area[order], migratetype);
+
}
/*
@@ -1022,6 +1116,14 @@ out:
return ret;
}
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
+
+ for (i = 0; i < numpages; i++)
+ clear_highpage(page + i);
+}
+
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
@@ -1056,7 +1158,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1073,27 +1175,49 @@ static __always_inline bool free_pages_prepare(struct page *page,
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ if (want_init_on_free())
+ kernel_init_free_pages(page, 1 << order);
+
kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 0);
+
kasan_free_nondeferred_pages(page, order);
return true;
}
#ifdef CONFIG_DEBUG_VM
-static inline bool free_pcp_prepare(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
+ * moved from pcp lists to free lists.
+ */
+static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, true);
}
-static inline bool bulkfree_pcp_prepare(struct page *page)
+static bool bulkfree_pcp_prepare(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return free_pages_check(page);
+ else
+ return false;
}
#else
+/*
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
+ * moving from pcp lists to free list in order to reduce overhead. With
+ * debug_pagealloc enabled, they are checked also immediately when being freed
+ * to the pcp lists.
+ */
static bool free_pcp_prepare(struct page *page)
{
- return free_pages_prepare(page, 0, false);
+ if (debug_pagealloc_enabled())
+ return free_pages_prepare(page, 0, true);
+ else
+ return free_pages_prepare(page, 0, false);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1303,7 +1427,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1344,36 +1468,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1382,7 +1492,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1472,14 +1582,14 @@ static void __init deferred_free_range(unsigned long pfn,
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
+ __free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
+ __free_pages_core(page, 0);
}
}
@@ -1502,21 +1612,13 @@ static inline void __init pgdat_init_report_one_done(void)
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1524,15 +1626,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1552,17 +1653,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1577,18 +1679,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
return (nr_pages);
}
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1614,31 +1798,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1662,14 +1842,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1698,38 +1875,36 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ /* Retry only once. */
+ return first_deferred_pfn != ULONG_MAX;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
- pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
@@ -1752,9 +1927,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
void __init page_alloc_init_late(void)
{
struct zone *zone;
+ int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- int nid;
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1774,13 +1949,19 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
/* Discard memblock private memory */
memblock_discard();
-#endif
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ init_debug_guardpage();
+#endif
}
#ifdef CONFIG_CMA
@@ -1849,8 +2030,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
+ add_to_free_area(&page[size], area, migratetype);
set_page_order(&page[size], high);
}
}
@@ -1865,7 +2045,7 @@ static void check_new_page_bad(struct page *page)
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
bad_flags = __PG_HWPOISON;
@@ -1899,28 +2079,44 @@ static inline int check_new_page(struct page *page)
static inline bool free_pages_prezeroed(void)
{
- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled();
+ return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled()) || want_init_on_free();
}
#ifdef CONFIG_DEBUG_VM
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
+ * also checked when pcp lists are refilled from the free lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return check_new_page(page);
+ else
+ return false;
}
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
{
return check_new_page(page);
}
#else
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
+ * enabled, they are also checked when being allocated from the pcp lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
}
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return check_new_page(page);
+ else
+ return false;
}
#endif /* CONFIG_DEBUG_VM */
@@ -1944,22 +2140,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page);
arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
- int i;
-
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
+ if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
+ kernel_init_free_pages(page, 1 << order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -1991,13 +2185,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- page = list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ del_page_from_free_area(page, area);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -2083,8 +2274,7 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_move(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2170,6 +2360,18 @@ static inline void boost_watermark(struct zone *zone)
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
@@ -2260,7 +2462,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
single_page:
area = &zone->free_area[current_order];
- list_move(&page->lru, &area->free_list[start_type]);
+ move_to_free_area(page, area, start_type);
}
/*
@@ -2284,7 +2486,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
if (fallback_mt == MIGRATE_TYPES)
break;
- if (list_empty(&area->free_list[fallback_mt]))
+ if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
@@ -2371,9 +2573,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- page = list_first_entry_or_null(
- &area->free_list[MIGRATE_HIGHATOMIC],
- struct page, lru);
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
@@ -2496,8 +2696,7 @@ find_smallest:
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
@@ -2934,6 +3133,7 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
+ struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -2950,7 +3150,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -2958,9 +3158,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/* Remove page from free list */
- list_del(&page->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(page);
+
+ del_page_from_free_area(page, area);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -3035,9 +3234,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3049,7 +3247,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -3069,8 +3267,8 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
goto out;
}
@@ -3161,24 +3359,14 @@ static int __init fail_page_alloc_debugfs(void)
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
- return 0;
-fail:
- debugfs_remove_recursive(dir);
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
- return -ENOMEM;
+ return 0;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3268,13 +3456,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!list_empty(&area->free_list[mt]))
+ if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ !free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
@@ -3360,8 +3548,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
alloc_flags |= ALLOC_KSWAPD;
#ifdef CONFIG_ZONE_DMA32
+ if (!zone)
+ return alloc_flags;
+
if (zone_idx(zone) != ZONE_NORMAL)
- goto out;
+ return alloc_flags;
/*
* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
@@ -3370,9 +3561,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
if (nr_online_nodes > 1 && !populated_zone(--zone))
- goto out;
+ return alloc_flags;
-out:
+ alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
return alloc_flags;
}
@@ -3698,7 +3889,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
@@ -3709,21 +3900,24 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
- return NULL;
-
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -4556,7 +4750,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4675,11 +4869,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size);
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size + 1;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
@@ -4695,10 +4889,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size + 1);
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size + 1;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
@@ -4740,7 +4934,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
@@ -4749,12 +4943,17 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
@@ -4765,15 +4964,22 @@ EXPORT_SYMBOL(alloc_pages_exact);
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -4802,11 +5008,13 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4833,6 +5041,9 @@ static unsigned long nr_free_zone_pages(int offset)
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4845,6 +5056,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -5176,7 +5389,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!list_empty(&area->free_list[type]))
+ if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
@@ -5291,7 +5504,8 @@ static int node_load[MAX_NUMNODES];
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5597,7 +5811,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -6004,7 +6218,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -6154,13 +6368,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
@@ -6202,7 +6418,7 @@ unsigned long __init __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -6364,10 +6580,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
- memblock_alloc_node_nopanic(usemapsize,
- pgdat->node_id);
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6596,7 +6816,11 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6752,14 +6976,14 @@ void __init setup_nr_node_ids(void)
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6807,7 +7031,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -7255,7 +7479,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
@@ -7368,10 +7591,28 @@ static int page_alloc_cpu_dead(unsigned int cpu)
return 0;
}
+#ifdef CONFIG_NUMA
+int hashdist = HASHDIST_DEFAULT;
+
+static int __init set_hashdist(char *str)
+{
+ if (!str)
+ return 0;
+ hashdist = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
void __init page_alloc_init(void)
{
int ret;
+#ifdef CONFIG_NUMA
+ if (num_node_state(N_MEMORY) == 1)
+ hashdist = 0;
+#endif
+
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
"mm/page_alloc:dead", NULL,
page_alloc_cpu_dead);
@@ -7484,7 +7725,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -7756,19 +7997,6 @@ out:
return ret;
}
-#ifdef CONFIG_NUMA
-int hashdist = HASHDIST_DEFAULT;
-
-static int __init set_hashdist(char *str)
-{
- if (!str)
- return 0;
- hashdist = simple_strtoul(str, &str, 0);
- return 1;
-}
-__setup("hashdist=", set_hashdist);
-#endif
-
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
@@ -7815,6 +8043,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long log2qty, size;
void *table = NULL;
gfp_t gfp_flags;
+ bool virt;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7871,34 +8100,34 @@ void *__init alloc_large_system_hash(const char *tablename,
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
+ virt = false;
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
- table = memblock_alloc_nopanic(size,
- SMP_CACHE_BYTES);
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
else
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
- } else if (hashdist) {
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ virt = true;
} else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
*/
- if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, gfp_flags);
- kmemleak_alloc(table, size, 1, gfp_flags);
- }
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ virt ? "vmalloc" : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
@@ -7920,7 +8149,10 @@ void *__init alloc_large_system_hash(const char *tablename,
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
int migratetype, int flags)
{
- unsigned long pfn, iter, found;
+ unsigned long found;
+ unsigned long iter = 0;
+ unsigned long pfn = page_to_pfn(page);
+ const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -7930,17 +8162,20 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* can still lead to having bootmem allocations in zone_movable.
*/
- /*
- * CMA allocations (alloc_contig_range) really need to mark isolate
- * CMA pageblocks even when they are not movable in fact so consider
- * them movable here.
- */
- if (is_migrate_cma(migratetype) &&
- is_migrate_cma(get_pageblock_migratetype(page)))
- return false;
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return false;
+
+ reason = "CMA page";
+ goto unmovable;
+ }
- pfn = page_to_pfn(page);
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ for (found = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
if (!pfn_valid_within(check))
@@ -7961,7 +8196,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
/*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
@@ -8020,12 +8255,11 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
if (flags & REPORT_FAILURE)
- dump_page(pfn_to_page(pfn+iter), "unmovable page");
+ dump_page(pfn_to_page(pfn + iter), reason);
return true;
}
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -8100,7 +8334,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -8148,7 +8382,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret)
+ if (ret < 0)
return ret;
/*
@@ -8184,7 +8418,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;
@@ -8235,8 +8468,9 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -8248,7 +8482,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -8290,7 +8523,7 @@ void zone_pcp_reset(struct zone *zone)
* All pages in the range must be in a single zone and isolated
* before calling this.
*/
-void
+unsigned long
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
@@ -8298,12 +8531,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned int order, i;
unsigned long pfn;
unsigned long flags;
+ unsigned long offlined_pages = 0;
+
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
break;
if (pfn == end_pfn)
- return;
+ return offlined_pages;
+
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
@@ -8321,24 +8557,26 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
+ offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
- list_del(&page->lru);
- rmv_page_order(page);
- zone->free_area[order].nr_free--;
+ del_page_from_free_area(page, &zone->free_area[order]);
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return offlined_pages;
}
#endif