diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 18 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 25 | ||||
-rw-r--r-- | mm/cleancache.c | 12 | ||||
-rw-r--r-- | mm/cma.c | 83 | ||||
-rw-r--r-- | mm/cma_debug.c | 25 | ||||
-rw-r--r-- | mm/compaction.c | 6 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/failslab.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 17 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/gup.c | 81 | ||||
-rw-r--r-- | mm/gup_benchmark.c | 2 | ||||
-rw-r--r-- | mm/hmm.c | 13 | ||||
-rw-r--r-- | mm/huge_memory.c | 22 | ||||
-rw-r--r-- | mm/hugetlb.c | 51 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 6 | ||||
-rw-r--r-- | mm/init-mm.c | 1 | ||||
-rw-r--r-- | mm/internal.h | 6 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 66 | ||||
-rw-r--r-- | mm/ksm.c | 48 | ||||
-rw-r--r-- | mm/memblock.c | 36 | ||||
-rw-r--r-- | mm/memcontrol.c | 356 | ||||
-rw-r--r-- | mm/memfd.c | 345 | ||||
-rw-r--r-- | mm/memory.c | 36 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 25 | ||||
-rw-r--r-- | mm/mempool.c | 108 | ||||
-rw-r--r-- | mm/migrate.c | 20 | ||||
-rw-r--r-- | mm/mmap.c | 93 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 85 | ||||
-rw-r--r-- | mm/page-writeback.c | 18 | ||||
-rw-r--r-- | mm/page_alloc.c | 175 | ||||
-rw-r--r-- | mm/page_counter.c | 100 | ||||
-rw-r--r-- | mm/page_idle.c | 2 | ||||
-rw-r--r-- | mm/page_owner.c | 4 | ||||
-rw-r--r-- | mm/percpu-stats.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 39 | ||||
-rw-r--r-- | mm/rmap.c | 9 | ||||
-rw-r--r-- | mm/shmem.c | 391 | ||||
-rw-r--r-- | mm/slab.c | 7 | ||||
-rw-r--r-- | mm/slab_common.c | 37 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 131 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/swap.c | 3 | ||||
-rw-r--r-- | mm/swap_slots.c | 14 | ||||
-rw-r--r-- | mm/swap_state.c | 5 | ||||
-rw-r--r-- | mm/swapfile.c | 14 | ||||
-rw-r--r-- | mm/userfaultfd.c | 22 | ||||
-rw-r--r-- | mm/util.c | 8 | ||||
-rw-r--r-- | mm/vmalloc.c | 62 | ||||
-rw-r--r-- | mm/vmpressure.c | 35 | ||||
-rw-r--r-- | mm/vmscan.c | 61 | ||||
-rw-r--r-- | mm/vmstat.c | 62 | ||||
-rw-r--r-- | mm/z3fold.c | 42 | ||||
-rw-r--r-- | mm/zsmalloc.c | 5 | ||||
-rw-r--r-- | mm/zswap.c | 38 |
59 files changed, 1683 insertions, 1225 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index d5004d82a1d6..ce95491abd6a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -266,7 +266,7 @@ config ARCH_ENABLE_THP_MIGRATION bool config PHYS_ADDR_T_64BIT - def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + def_bool 64BIT config BOUNCE bool "Enable bounce buffers" @@ -305,7 +305,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information: KSM is inactive + See Documentation/vm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). @@ -530,7 +530,7 @@ config MEM_SOFT_DIRTY into a page just as regular dirty bit, but unlike the latter it can be cleared by hands. - See Documentation/vm/soft-dirty.txt for more details. + See Documentation/admin-guide/mm/soft-dirty.rst for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" @@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on NO_BOOTMEM depends on !FLATMEM + depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable @@ -656,7 +657,8 @@ config IDLE_PAGE_TRACKING be useful to tune memory cgroup limits and/or for job placement within a compute cluster. - See Documentation/vm/idle_page_tracking.txt for more details. + See Documentation/admin-guide/mm/idle_page_tracking.rst for + more details. 
# arch_add_memory() comprehends device memory config ARCH_HAS_ZONE_DEVICE @@ -692,6 +694,9 @@ config ARCH_HAS_HMM config MIGRATE_VMA_HELPER bool +config DEV_PAGEMAP_OPS + bool + config HMM bool select MIGRATE_VMA_HELPER @@ -712,6 +717,7 @@ config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM select HMM + select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device @@ -722,6 +728,7 @@ config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" depends on ARCH_HAS_HMM select HMM + select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent addressable device @@ -752,3 +759,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/Makefile b/mm/Makefile index b4e54a9ae9c5..8716bdabe1e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce..347cc834c04a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -115,6 +115,7 @@ static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) bdi, &bdi_debug_stats_fops); if (!bdi->debug_stats) { debugfs_remove(bdi->debug_dir); + bdi->debug_dir = NULL; return -ENOMEM; } @@ -383,7 +384,7 @@ static void wb_shutdown(struct bdi_writeback *wb) * the barrier provided by test_and_clear_bit() above. */ smp_wmb(); - clear_bit(WB_shutting_down, &wb->state); + clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -411,6 +412,7 @@ static void wb_exit(struct bdi_writeback *wb) * protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); +static struct workqueue_struct *cgwb_release_wq; /** * wb_congested_get_create - get or create a wb_congested @@ -521,7 +523,7 @@ static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); - schedule_work(&wb->release_work); + queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) @@ -555,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -734,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); @@ -783,6 +785,21 @@ static void cgwb_bdi_register(struct backing_dev_info *bdi) spin_unlock_irq(&cgwb_lock); } +static int __init cgwb_init(void) +{ + /* + * There can be many concurrent release work items overwhelming + * system_wq. Put them in a separate wq and limit concurrency. + * There's no point in executing many of these in parallel. 
+ */ + cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); + if (!cgwb_release_wq) + return -ENOMEM; + + return 0; +} +subsys_initcall(cgwb_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) diff --git a/mm/cleancache.c b/mm/cleancache.c index f7b9fdc79d97..2bf12da9baa0 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.txt for more information. + * Documentation/vm/cleancache.rst for more information. * * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. * Author: Dan Magenheimer @@ -307,12 +307,10 @@ static int __init init_cleancache(void) struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) return -ENXIO; - debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets); - debugfs_create_u64("failed_gets", S_IRUGO, - root, &cleancache_failed_gets); - debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts); - debugfs_create_u64("invalidates", S_IRUGO, - root, &cleancache_invalidates); + debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets); + debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets); + debugfs_create_u64("puts", 0444, root, &cleancache_puts); + debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates); #endif return 0; } @@ -39,7 +39,6 @@ #include <trace/events/cma.h> #include "cma.h" -#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -110,25 +109,23 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + do { unsigned j; base_pfn = pfn; - if (!pfn_valid(base_pfn)) - goto err; - - zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - if (!pfn_valid(pfn)) - goto err; 
- + WARN_ON_ONCE(!pfn_valid(pfn)); /* - * In init_cma_reserved_pageblock(), present_pages - * is adjusted with assumption that all pages in - * the pageblock come from a single zone. + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. */ if (page_zone(pfn_to_page(pfn)) != zone) - goto err; + goto not_in_zone; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -142,7 +139,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -err: +not_in_zone: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -152,41 +149,6 @@ err: static int __init cma_init_reserved_areas(void) { int i; - struct zone *zone; - pg_data_t *pgdat; - - if (!cma_area_count) - return 0; - - for_each_online_pgdat(pgdat) { - unsigned long start_pfn = UINT_MAX, end_pfn = 0; - - zone = &pgdat->node_zones[ZONE_MOVABLE]; - - /* - * In this case, we cannot adjust the zone range - * since it is now maximum node span and we don't - * know original zone range. - */ - if (populated_zone(zone)) - continue; - - for (i = 0; i < cma_area_count; i++) { - if (pfn_to_nid(cma_areas[i].base_pfn) != - pgdat->node_id) - continue; - - start_pfn = min(start_pfn, cma_areas[i].base_pfn); - end_pfn = max(end_pfn, cma_areas[i].base_pfn + - cma_areas[i].count); - } - - if (!end_pfn) - continue; - - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -195,32 +157,9 @@ static int __init cma_init_reserved_areas(void) return ret; } - /* - * Reserved pages for ZONE_MOVABLE are now activated and - * this would change ZONE_MOVABLE's managed page counter and - * the other zones' present counter. We need to re-calculate - * various zone information that depends on this initialization. 
- */ - build_all_zonelists(NULL); - for_each_populated_zone(zone) { - if (zone_idx(zone) == ZONE_MOVABLE) { - zone_pcp_reset(zone); - setup_zone_pageset(zone); - } else - zone_pcp_update(zone); - - set_zone_contiguous(zone); - } - - /* - * We need to re-init per zone wmark by calling - * init_per_zone_wmark_min() but doesn't call here because it is - * registered on core_initcall and it will be called later than us. - */ - return 0; } -pure_initcall(cma_init_reserved_areas); +core_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 275df8b5b22e..f23467291cfb 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -172,23 +172,18 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) tmp = debugfs_create_dir(name, cma_debugfs_root); - debugfs_create_file("alloc", S_IWUSR, tmp, cma, - &cma_alloc_fops); - - debugfs_create_file("free", S_IWUSR, tmp, cma, - &cma_free_fops); - - debugfs_create_file("base_pfn", S_IRUGO, tmp, - &cma->base_pfn, &cma_debugfs_fops); - debugfs_create_file("count", S_IRUGO, tmp, - &cma->count, &cma_debugfs_fops); - debugfs_create_file("order_per_bit", S_IRUGO, tmp, - &cma->order_per_bit, &cma_debugfs_fops); - debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops); - debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops); + debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); + debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); + debugfs_create_file("base_pfn", 0444, tmp, + &cma->base_pfn, &cma_debugfs_fops); + debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops); + debugfs_create_file("order_per_bit", 0444, tmp, + &cma->order_per_bit, &cma_debugfs_fops); + debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops); + debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops); u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); - 
debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); + debugfs_create_u32_array("bitmap", 0444, tmp, (u32 *)cma->bitmap, u32s); } static int __init cma_debugfs_init(void) diff --git a/mm/compaction.c b/mm/compaction.c index 028b7210a669..faca45ebe62d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1450,12 +1450,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. + * ALLOC_CMA is used, as pages in CMA pageblocks are considered + * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - 0, wmark_target)) + ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; @@ -1897,7 +1899,7 @@ static ssize_t sysfs_compact_node(struct device *dev, return count; } -static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); +static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node); int compaction_register_node(struct node *node) { diff --git a/mm/dmapool.c b/mm/dmapool.c index 4d90a64b2fdc..6d4b97e7e9e9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -105,7 +105,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) return PAGE_SIZE - size; } -static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL); +static DEVICE_ATTR(pools, 0444, show_pools, NULL); /** * dma_pool_create - Creates a pool of consistent memory blocks, for dma. 
diff --git a/mm/failslab.c b/mm/failslab.c index 1f2f248e3601..b135ebb88b6f 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -42,7 +42,7 @@ __setup("failslab=", setup_failslab); static int __init failslab_debugfs_init(void) { struct dentry *dir; - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | 0600; dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); if (IS_ERR(dir)) diff --git a/mm/filemap.c b/mm/filemap.c index 9276bdb2343c..52517f28e6f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -786,7 +786,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); @@ -842,7 +842,7 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); if (error) { if (!huge) mem_cgroup_cancel_charge(page, memcg, false); @@ -1585,8 +1585,7 @@ no_page: if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); - err = add_to_page_cache_lru(page, mapping, offset, - gfp_mask & GFP_RECLAIM_MASK); + err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (unlikely(err)) { put_page(page); page = NULL; @@ -2387,7 +2386,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2490,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR 
set. */ -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; @@ -2500,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; pgoff_t max_off; struct page *page; - int ret = 0; + vm_fault_t ret = 0; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2694,11 +2693,11 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - int ret = VM_FAULT_LOCKED; + vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); diff --git a/mm/frontswap.c b/mm/frontswap.c index fec8b5044040..157e5bf63504 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -3,7 +3,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.txt for more information. + * Documentation/vm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. 
* Author: Dan Magenheimer @@ -486,12 +486,11 @@ static int __init init_frontswap(void) struct dentry *root = debugfs_create_dir("frontswap", NULL); if (root == NULL) return -ENXIO; - debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); - debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); - debugfs_create_u64("failed_stores", S_IRUGO, root, - &frontswap_failed_stores); - debugfs_create_u64("invalidates", S_IRUGO, - root, &frontswap_invalidates); + debugfs_create_u64("loads", 0444, root, &frontswap_loads); + debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", 0444, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates); #endif return 0; } @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. 
+ */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) @@ -544,6 +560,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; + if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) 
+ return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -1351,7 +1370,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1427,7 +1446,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, @@ -1456,32 +1475,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, return 1; } -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { unsigned long fault_pfn; + int nr_start = *nr; - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); + fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + return 0; + + if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + return 1; } -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, +static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { unsigned long fault_pfn; + int nr_start = *nr; - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); + fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) + return 0; + + if (unlikely(pud_val(orig) != 
pud_val(*pudp))) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + return 1; } #else -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, +static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { BUILD_BUG(); return 0; } -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, +static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, struct page **pages, int *nr) { BUILD_BUG(); @@ -1499,7 +1534,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; if (pmd_devmap(orig)) - return __gup_device_huge_pmd(orig, addr, end, pages, nr); + return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1537,7 +1572,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; if (pud_devmap(orig)) - return __gup_device_huge_pud(orig, addr, end, pages, nr); + return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 0f44759486e2..6a473709e9b6 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -23,7 +23,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, struct page **pages; nr_pages = gup->size / PAGE_SIZE; - pages = kvzalloc(sizeof(void *) * nr_pages, GFP_KERNEL); + pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -35,15 +35,6 @@ #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) -#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) -/* - * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h - */ -DEFINE_STATIC_KEY_FALSE(device_private_key); -EXPORT_SYMBOL(device_private_key); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - - #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct 
mmu_notifier_ops hmm_mmu_notifier_ops; @@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, resource_size_t addr; int ret; - static_branch_enable(&device_private_key); + dev_pagemap_get_ops(); devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), GFP_KERNEL, dev_to_node(device)); @@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) return ERR_PTR(-EINVAL); - static_branch_enable(&device_private_key); + dev_pagemap_get_ops(); devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), GFP_KERNEL, dev_to_node(device)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14ed6ee5e02f..1cd7c1a57a14 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * ->lru in the tail pages is occupied by compound_head. - * Let's use ->mapping + ->index in the second tail page as list_head. - */ - return (struct list_head *)&page[2].mapping; + /* ->lru in the tail pages is occupied by compound_head. */ + return &page[2].deferred_list; } void prep_transhuge_page(struct page *page) @@ -1134,8 +1131,8 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ - pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, - GFP_KERNEL); + pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), + GFP_KERNEL); if (unlikely(!pages)) { ret |= VM_FAULT_OOM; goto out; @@ -1185,7 +1182,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, * mmu_notifier_invalidate_range_end() happens which can lead to a * device seeing memory write in different order than CPU. 
* - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); @@ -2037,7 +2034,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); @@ -2431,7 +2428,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ if (head[i].index >= end) { - __ClearPageDirty(head + i); + ClearPageDirty(head + i); __delete_from_page_cache(head + i, NULL); if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); @@ -2925,7 +2922,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = maybe_pmd_mkwrite(pmde, vma); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); - page_add_anon_rmap(new, vma, mmun_start, true); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, mmun_start, true); + else + page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); if (vma->vm_flags & VM_LOCKED) mlock_vma_page(new); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 218679138255..3612fbb32e9d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2798,7 +2798,8 @@ static int __init hugetlb_init(void) num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = - kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + kmalloc_array(num_fault_mutexes, sizeof(struct mutex), + GFP_KERNEL); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) @@ -3159,7 +3160,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; @@ -3291,7 +3292,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); } @@ -3686,6 +3687,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); /* * Currently, we are forced to kill the process in the event the @@ -3716,7 +3718,7 @@ retry: u32 hash; struct vm_fault vmf = { .vma = vma, - .address = address, + .address = haddr, .flags = flags, /* * Hard to debug if it ends up being @@ -3733,14 +3735,14 @@ retry: * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } - page = alloc_huge_page(vma, address, 0); + page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); if (ret == -ENOMEM) @@ -3789,12 +3791,12 @@ retry: * the spinlock. 
*/ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3808,17 +3810,17 @@ retry: if (anon_rmap) { ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, vma, address); + hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); } spin_unlock(ptl); @@ -3830,7 +3832,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, address, page); + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -3883,10 +3885,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); - address &= huge_page_mask(h); - - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -3896,20 +3897,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } else { - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + ptep = 
huge_pte_alloc(mm, haddr, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3939,16 +3940,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, - vma, address); + vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3973,16 +3974,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, + ret = hugetlb_cow(mm, vma, haddr, ptep, pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, ptep); + update_mmu_cache(vma, haddr, ptep); out_put_page: if (page != pagecache_page) unlock_page(page); @@ -4357,7 +4358,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not 
changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..68c2f2f3c05b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc0..f0179c9c04c2 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) diff --git a/mm/internal.h b/mm/internal.h index 62d8c34e63d5..9e3654d70289 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, 
unsigned long end, struct zap_details *details); -extern int __do_page_cache_readahead(struct address_space *mapping, +extern unsigned int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); @@ -168,9 +168,6 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -498,6 +495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index bc0e68f7dc75..f185455b3406 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -792,6 +792,40 @@ DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); #ifdef CONFIG_MEMORY_HOTPLUG +static bool shadow_mapped(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + return false; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + /* + * We can't use pud_large() or pud_huge(), the first one is + * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse + * pud_bad(), if pud is bad then it's bad because it's huge. 
+ */ + if (pud_bad(*pud)) + return true; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_bad(*pmd)) + return true; + pte = pte_offset_kernel(pmd, addr); + return !pte_none(*pte); +} + static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -813,6 +847,14 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, case MEM_GOING_ONLINE: { void *ret; + /* + * If shadow is mapped already then it must have been mapped + * during the boot. This could happen if we are onlining previously + * offlined memory. + */ + if (shadow_mapped(shadow_start)) + return NOTIFY_OK; + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, shadow_end, GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD, @@ -824,8 +866,26 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, kmemleak_ignore(ret); return NOTIFY_OK; } - case MEM_OFFLINE: - vfree((void *)shadow_start); + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: { + struct vm_struct *vm; + + /* + * shadow_start was either mapped during boot by kasan_init() + * or during memory online by __vmalloc_node_range(). + * In the latter case we can use vfree() to free shadow. + * Non-NULL result of the find_vm_area() will tell us if + * that was the second case. + * + * Currently it's not possible to free shadow mapped + * during boot by kasan_init(). It's because the code + * to do that hasn't been written yet. So we'll just + * leak the memory. 
+ */ + vm = find_vm_area((void *)shadow_start); + if (vm) + vfree((void *)shadow_start); + } } return NOTIFY_OK; @@ -838,5 +898,5 @@ static int __init kasan_memhotplug_init(void) return 0; } -module_init(kasan_memhotplug_init); +core_initcall(kasan_memhotplug_init); #endif @@ -51,7 +51,9 @@ #define DO_NUMA(x) do { } while (0) #endif -/* +/** + * DOC: Overview + * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * @@ -67,6 +69,21 @@ * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * + * The stable tree node includes information required for reverse + * mapping from a KSM page to virtual addresses that map this page. + * + * In order to avoid large latencies of the rmap walks on KSM pages, + * KSM maintains two types of nodes in the stable tree: + * + * * the regular nodes that keep the reverse mapping structures in a + * linked list + * * the "chains" that link nodes ("dups") that represent the same + * write protected memory content, but each "dup" corresponds to a + * different KSM page copy of that content + * + * Internally, the regular nodes, "dups" and "chains" are represented + * using the same :c:type:`struct stable_node` structure. + * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". 
The unstable tree sorts these pages @@ -199,6 +216,8 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) + /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -823,6 +842,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *page_stable_node(struct page *page) +{ + return PageKsm(page) ? page_rmapping(page) : NULL; +} + +static inline void set_page_stable_node(struct page *page, + struct stable_node *stable_node) +{ + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: @@ -1049,7 +1079,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1145,7 +1175,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. 
* - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); @@ -2570,10 +2600,15 @@ again: anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + unsigned long addr; + cond_resched(); vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) + + /* Ignore the stable/unstable/sqnr flags */ + addr = rmap_item->address & ~KSM_FLAG_MASK; + + if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this @@ -2587,8 +2622,7 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg)) { + if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } diff --git a/mm/memblock.c b/mm/memblock.c index 5108356ad8aa..03d48d8835ba 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + &base, &end, (void *)_RET_IP_); + return memblock_remove_range(&memblock.memory, base, size); } @@ -925,7 +930,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? 
- r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1046,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1521,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1542,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1550,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1585,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the 
reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1598,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); @@ -1803,10 +1808,13 @@ static int __init memblock_init_debugfs(void) struct dentry *root = debugfs_create_dir("memblock", NULL); if (!root) return -ENXIO; - debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); - debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); + debugfs_create_file("memory", 0444, root, + &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", 0444, root, + &memblock.reserved, &memblock_debug_fops); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); + debugfs_create_file("physmem", 0444, root, + &memblock.physmem, &memblock_debug_fops); #endif return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e074f7c637aa..e6f0d5ef320a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 
struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. */ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2192,7 +2192,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, { struct memcg_kmem_cache_create_work *cw; - cw = kmalloc(sizeof(*cw), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); if (!cw) return; @@ -2444,12 
+2444,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; + bool drained = false; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; @@ -2460,26 +2461,32 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw)) { ret = -EBUSY; @@ -2603,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); + + drain_all_stock(memcg); + /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { int progress; @@ -2757,7 +2767,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * 
PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +2881,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +2923,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +2951,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3083,7 
+3093,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ /* Universal VM events cgroup1 shows, original sort order */ -unsigned int memcg1_events[] = { +static const unsigned int memcg1_events[] = { PGPGIN, PGPGOUT, PGFAULT, @@ -3126,8 +3136,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3540,7 +3550,8 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); - seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); + seq_printf(sf, "oom_kill %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); return 0; } @@ -3562,11 +3573,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -3626,7 +3632,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -3849,7 +3855,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, if (ret) goto out_put_css; - 
efile.file->f_op->poll(efile.file, &event->pt); + vfs_poll(efile.file, &event->pt); spin_lock(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); @@ -4270,7 +4276,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4326,13 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); - memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5061,10 +5069,40 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned 
long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5124,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5131,7 +5169,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5193,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -5202,7 +5240,8 @@ static int memory_events_show(struct seq_file *m, void *v) atomic_long_read(&memcg->memory_events[MEMCG_MAX])); seq_printf(m, "oom %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_OOM])); - seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); + seq_printf(m, "oom_kill %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); return 0; } @@ -5296,6 +5335,12 @@ static struct cftype memory_files[] = { .read_u64 = memory_current_read, }, { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_low_show, @@ -5344,54 +5389,144 @@ struct cgroup_subsys memory_cgrp_subsys = { }; 
/** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + * + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long as there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected + * + * @root is exclusive; it is never protected when looked at directly * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min value is calculated in the same way. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * Effective memory.low is always equal to or less than the original memory.low. + * If there is no memory.low overcommitment (which is always true for + * top-level memory cgroups), these two values are equal. + * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of siblings' + * memory.low usages, where memory.low usage is the size of actually + * protected memory. 
* - * For example, given cgroup A with children B and C: + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * A - * / \ - * B C + * | memory.current, if memory.current < memory.low + * low_usage = | + * | 0, otherwise. * - * and * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * Such a definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: + * + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 + * + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): + * + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. 
This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { + struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; + if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; + + usage = page_counter_read(&memcg->memory); + if (!usage) + return MEMCG_PROT_NONE; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; + emin = memcg->memory.min; + elow = memcg->memory.low; + + parent = parent_mem_cgroup(memcg); + /* No parent means a non-hierarchical mode on v1 memcg */ + if (!parent) + return MEMCG_PROT_NONE; + + if (parent == root) + goto exit; + + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); } - return true; + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } + +exit: + memcg->memory.emin = emin; + memcg->memory.elow = elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + 
return MEMCG_PROT_NONE; } /** @@ -6012,10 +6147,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + return 0; + } + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6067,7 +6209,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6088,7 +6230,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6122,7 +6264,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6144,15 +6286,23 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 
+ seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6165,6 +6315,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 000000000000..27069518e3c5 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/khugepaged.h> +#include <linux/syscalls.h> +#include <linux/hugetlb.h> +#include <linux/shmem_fs.h> +#include <linux/memfd.h> +#include <uapi/linux/memfd.h> + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. 
+ */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. 
+ */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. 
+ */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. 
Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. + */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? 
*seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/memory.c b/mm/memory.c index 5d8c2afb0730..7206a634270b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. 
* */ -#ifdef __HAVE_ARCH_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1932,7 +1928,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. 
*/ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* @@ -1954,12 +1951,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_mixed); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ + +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, true); + int err; + + err = __vm_insert_mixed(vma, addr, pfn, true); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed_mkwrite); +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f74826cdceea..7deb49f69e27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1158,7 +1158,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) * nodes have to go through register_node. * TODO clean up this mess. 
*/ - ret = link_mem_sections(nid, start_pfn, nr_pages); + ret = link_mem_sections(nid, start_pfn, nr_pages, false); register_fail: /* * If sysfs file of new node can't create, cpu on the node @@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page) return page + pageblock_nr_pages; } +static bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +} + /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { diff --git a/mm/mempool.c b/mm/mempool.c index 5c9dce34719b..b54f2c20e5e0 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -138,6 +138,28 @@ static void *remove_element(mempool_t *pool, gfp_t flags) } /** + * mempool_exit - exit a mempool initialized with mempool_init() + * @pool: pointer to the memory pool which was initialized with + * mempool_init(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + * + * May be called on a zeroed but uninitialized mempool (i.e. allocated with + * kzalloc()). 
+ */ +void mempool_exit(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool, GFP_KERNEL); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + pool->elements = NULL; +} +EXPORT_SYMBOL(mempool_exit); + +/** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). @@ -150,15 +172,65 @@ void mempool_destroy(mempool_t *pool) if (unlikely(!pool)) return; - while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); - pool->free(element, pool->pool_data); - } - kfree(pool->elements); + mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); +int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) +{ + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + pool->alloc = alloc_fn; + pool->free = free_fn; + init_waitqueue_head(&pool->wait); + + pool->elements = kmalloc_array_node(min_nr, sizeof(void *), + gfp_mask, node_id); + if (!pool->elements) + return -ENOMEM; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(gfp_mask, pool->pool_data); + if (unlikely(!element)) { + mempool_exit(pool); + return -ENOMEM; + } + add_element(pool, element); + } + + return 0; +} +EXPORT_SYMBOL(mempool_init_node); + +/** + * mempool_init - initialize a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * Like mempool_create(), but initializes the pool in (i.e. embedded in another + * structure). 
+ */ +int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_init_node(pool, min_nr, alloc_fn, free_fn, + pool_data, GFP_KERNEL, NUMA_NO_NODE); + +} +EXPORT_SYMBOL(mempool_init); + /** * mempool_create - create a memory pool * @min_nr: the minimum number of elements guaranteed to be @@ -186,35 +258,17 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, gfp_t gfp_mask, int node_id) { mempool_t *pool; + pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), - gfp_mask, node_id); - if (!pool->elements) { + + if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, + gfp_mask, node_id)) { kfree(pool); return NULL; } - spin_lock_init(&pool->lock); - pool->min_nr = min_nr; - pool->pool_data = pool_data; - init_waitqueue_head(&pool->wait); - pool->alloc = alloc_fn; - pool->free = free_fn; - /* - * First pre-allocate the guaranteed number of buffers. 
- */ - while (pool->curr_nr < pool->min_nr) { - void *element; - - element = pool->alloc(gfp_mask, pool->pool_data); - if (unlikely(!element)) { - mempool_destroy(pool); - return NULL; - } - add_element(pool, element); - } return pool; } EXPORT_SYMBOL(mempool_create_node); diff --git a/mm/migrate.c b/mm/migrate.c index f65dd69e1fd1..8c0af0f7cab1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -472,7 +472,7 @@ int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); - expected_count += 1 + page_has_private(page); + expected_count += hpage_nr_pages(page) + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { @@ -505,7 +505,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - get_page(newpage); /* add cache reference */ + page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -524,13 +524,24 @@ int migrate_page_move_mapping(struct address_space *mapping, } radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + if (PageTransHuge(page)) { + int i; + int index = page_index(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + pslot = radix_tree_lookup_slot(&mapping->i_pages, + index + i); + radix_tree_replace_slot(&mapping->i_pages, pslot, + newpage + i); + } + } /* * Drop cache reference from old page by unfreezing * to one less reference. * We know this isn't the last reference. 
*/ - page_ref_unfreeze(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ @@ -1622,6 +1633,9 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = NUMA_NO_NODE; } out_flush: + if (list_empty(&pagelist)) + return err; + /* Make sure we do not overwrite the existing error */ err1 = do_move_pages_to_node(mm, &pagelist, current_node); if (!err1) diff --git a/mm/mmap.c b/mm/mmap.c index 188f195883b9..d1eb87ef4b1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -100,11 +100,20 @@ pgprot_t protection_map[16] __ro_after_init = { __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; +#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT +static inline pgprot_t arch_filter_pgprot(pgprot_t prot) +{ + return prot; +} +#endif + pgprot_t vm_get_page_prot(unsigned long vm_flags) { - return __pgprot(pgprot_val(protection_map[vm_flags & + pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | pgprot_val(arch_vm_get_page_prot(vm_flags))); + + return arch_filter_pgprot(ret); } EXPORT_SYMBOL(vm_get_page_prot); @@ -1315,6 +1324,35 @@ static inline int mlock_future_check(struct mm_struct *mm, return 0; } +static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return MAX_LFS_FILESIZE; + + if (S_ISBLK(inode->i_mode)) + return MAX_LFS_FILESIZE; + + /* Special "we do even unsigned file positions" case */ + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + + /* Yes, random drivers might want more. 
But I'm tired of buggy drivers */ + return ULONG_MAX; +} + +static inline bool file_mmap_ok(struct file *file, struct inode *inode, + unsigned long pgoff, unsigned long len) +{ + u64 maxsize = file_mmap_size_max(file, inode); + + if (maxsize && len > maxsize) + return false; + maxsize -= len; + if (pgoff > maxsize >> PAGE_SHIFT) + return false; + return true; +} + /* * The caller must hold down_write(&current->mm->mmap_sem). */ @@ -1400,6 +1438,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, struct inode *inode = file_inode(file); unsigned long flags_mask; + if (!file_mmap_ok(file, inode, pgoff, len)) + return -EOVERFLOW; + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { @@ -2787,7 +2828,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) @@ -3015,6 +3056,32 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); + if (unlikely(mm_is_oom_victim(mm))) { + /* + * Manually reap the mm to free as much memory as possible. + * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard + * this mm from further consideration. Taking mm->mmap_sem for + * write after setting MMF_OOM_SKIP will guarantee that the oom + * reaper will not run on this mm again after mmap_sem is + * dropped. + * + * Nothing can be holding mm->mmap_sem here and the above call + * to mmu_notifier_release(mm) ensures mmu notifier callbacks in + * __oom_reap_task_mm() will not block.
+ * + * This needs to be done before calling munlock_vma_pages_all(), + * which clears VM_LOCKED, otherwise the oom reaper cannot + * reliably test it. + */ + mutex_lock(&oom_lock); + __oom_reap_task_mm(mm); + mutex_unlock(&oom_lock); + + set_bit(MMF_OOM_SKIP, &mm->flags); + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -3036,24 +3103,6 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Wait for oom_reap_task() to stop working on this - * mm. Because MMF_OOM_SKIP is already set before - * calling down_read(), oom_reap_task() will not run - * on this "mm" post up_write(). - * - * mm_is_oom_victim() cannot be set from under us - * either because victim->mm is already set to NULL - * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if victim->mm - * is found not NULL while holding the task_lock. - */ - set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); @@ -3228,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. 
@@ -3267,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; diff --git a/mm/mremap.c b/mm/mremap.c index 049470aa1e3e..5c2e18505f75 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,8 +191,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } -#define LATENCY_LIMIT (64 * PAGE_SIZE) - unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -247,8 +245,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma, next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) extent = next - new_addr; - if (extent > LATENCY_LIMIT) - extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, new_pmd, new_addr, need_rmap_locks, &need_flush); } diff --git a/mm/nommu.c b/mm/nommu.c index 13723736d38f..4452d8bd9ae4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ff992fa8760a..84081e77bc51 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } @@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } - #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ 
-480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +void __oom_reap_task_mm(struct mm_struct *mm) { - struct mmu_gather tlb; struct vm_area_struct *vma; + + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (!can_madv_dontneed_vma(vma)) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. 
+ */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + } + } +} + +static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +{ bool ret = true; /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task_mm exit_mm + * oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - /* - * Tell all users of get_user/copy_from_user etc... that the content - * is no longer stable. No barriers really needed because unmapping - * should imply barriers already and the reader would hit a page fault - * if it stumbled over a reaped memory. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - - for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) - continue; + __oom_reap_task_mm(mm); - /* - * Only anonymous pages have a good chance to be dropped - * without additional steps which we cannot afford as we - * are OOM already. - * - * We do not even care about fs backed pages because all - * which are reclaimable have already been reclaimed and - * we do not want to block exit_mmap by keeping mm ref - * count elevated without a good reason. 
- */ - if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - const unsigned long start = vma->vm_start; - const unsigned long end = vma->vm_end; - - tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); - unmap_page_range(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); - } - } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), @@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk) struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); debug_show_all_locks(); @@ -908,7 +913,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Raise event before sending signal: task reaper must see this */ count_vm_event(OOM_KILL); - count_memcg_event_mm(mm, OOM_KILL); + memcg_memory_event_mm(mm, MEMCG_OOM_KILL); /* * We should send SIGKILL before granting access to memory reserves diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); 
dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 905db9d7962f..1521100f1e63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) 
&& * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -946,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; @@ -955,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 2: /* * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. + * deferred_list.next -- ignore value. 
*/ break; default: @@ -1743,38 +1740,16 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA -static void __init adjust_present_page_count(struct page *page, long count) -{ - struct zone *zone = page_zone(page); - - /* We don't need to hold a lock since it is boot-up process */ - zone->present_pages += count; -} - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; - unsigned long pfn = page_to_pfn(page); struct page *p = page; - int nid = page_to_nid(page); - - /* - * ZONE_MOVABLE will steal present pages from other zones by - * changing page links so page_zone() is changed. Before that, - * we need to adjust previous zone's page count first. - */ - adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - - /* Steal pages from other zones */ - set_page_links(p, ZONE_MOVABLE, nid, pfn); - } while (++p, ++pfn, --i); - - adjust_present_page_count(page, pageblock_nr_pages); + } while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2889,7 +2864,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. 
*/ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3086,7 +3061,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) static int __init fail_page_alloc_debugfs(void) { - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, @@ -3165,6 +3140,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead @@ -3191,8 +3172,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if (!list_empty(&area->free_list[MIGRATE_CMA])) + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { return true; + } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3212,6 +3195,13 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3220,7 +3210,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. 
That corner case is then slower but it is harmless. */ - if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3708,7 +3698,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP -struct lockdep_map __fs_reclaim_map = +static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); static bool __need_fs_reclaim(gfp_t gfp_mask) @@ -3733,17 +3723,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return true; } +void __fs_reclaim_acquire(void) +{ + lock_map_acquire(&__fs_reclaim_map); +} + +void __fs_reclaim_release(void) +{ + lock_map_release(&__fs_reclaim_map); +} + void fs_reclaim_acquire(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_acquire(&__fs_reclaim_map); + __fs_reclaim_acquire(); } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_release(&__fs_reclaim_map); + __fs_reclaim_release(); } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -3761,8 +3761,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3770,8 +3770,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); cond_resched(); @@ -3856,6 +3856,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; +#ifdef 
CONFIG_CMA + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } @@ -4165,7 +4169,6 @@ retry: * orientated. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } @@ -4322,12 +4325,14 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; + return true; } /* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, - unsigned int order, struct alloc_context *ac) +static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) { /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4358,7 +4363,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, order, &ac); + finalise_ac(gfp_mask, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -6204,7 +6209,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6230,22 +6234,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long movable_size = 0; size = 
zone->spanned_pages; - realsize = freesize = zone->present_pages; - if (zone_end_pfn(zone) > node_end_pfn) - node_end_pfn = zone_end_pfn(zone); - + freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6277,7 +6277,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif @@ -6287,30 +6287,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - /* - * The size of the CMA area is unknown now so we need to - * prepare the memory for the usemap at maximum. 
- */ - if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && - pgdat->node_spanned_pages) { - movable_size = node_end_pfn - pgdat->node_start_pfn; - } - - if (!size && !movable_size) + if (!size) continue; set_pageblock_order(); - if (movable_size) { - zone->zone_start_pfn = pgdat->node_start_pfn; - zone->spanned_pages = movable_size; - setup_usemap(pgdat, zone, - pgdat->node_start_pfn, movable_size); - init_currently_empty_zone(zone, - pgdat->node_start_pfn, movable_size); - } else { - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); - } + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } @@ -7621,11 +7603,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, unsigned long pfn, iter, found; /* - * For avoiding noise data, lru_add_drain_all() should be called - * If ZONE_MOVABLE, the zone never contains unmovable pages + * TODO we could make this much more efficient by not checking every + * page in the range if we know all of them are in MOVABLE_ZONE and + * that the movable zone guarantees that pages are migratable but + * the later is not the case right now unfortunatelly. E.g. movablecore + * can still lead to having bootmem allocations in zone_movable. */ - if (zone_idx(zone) == ZONE_MOVABLE) - return false; /* * CMA allocations (alloc_contig_range) really need to mark isolate @@ -7646,7 +7629,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, page = pfn_to_page(check); if (PageReserved(page)) - return true; + goto unmovable; /* * Hugepages are not in LRU lists, but they're movable. @@ -7696,32 +7679,12 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * page at boot. 
*/ if (found > count) - return true; + goto unmovable; } return false; -} - -bool is_pageblock_removable_nolock(struct page *page) -{ - struct zone *zone; - unsigned long pfn; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +unmovable: + WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); + return true; } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) @@ -7951,7 +7914,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA +#ifdef CONFIG_MEMORY_HOTPLUG /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. 
diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,40 @@ #include <linux/bug.h> #include <asm/page.h> +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) +{ + unsigned long protected, old_protected; + long delta; + + if (!c->parent) + return; + + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } + + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -22,7 +56,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +76,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +118,10 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. 
*/ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -123,20 +161,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,22 +187,56 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. 
*/ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } /** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. 
+ */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + +/** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value diff --git a/mm/page_idle.c b/mm/page_idle.c index e412a63b2b74..6302bc62c27d 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -201,7 +201,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, } static struct bin_attribute page_idle_bitmap_attr = - __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR, + __BIN_ATTR(bitmap, 0600, page_idle_bitmap_read, page_idle_bitmap_write, 0); static struct bin_attribute *page_idle_bin_attrs[] = { diff --git a/mm/page_owner.c b/mm/page_owner.c index 75d21a2259b3..d80adfe702d3 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -631,8 +631,8 @@ static int __init pageowner_init(void) return 0; } - dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, - NULL, &proc_page_owner_operations); + dentry = debugfs_create_file("page_owner", 0400, NULL, + NULL, &proc_page_owner_operations); return PTR_ERR_OR_ZERO(dentry); } diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 063ff60ecd90..b5fdd43b60c9 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -144,7 +144,7 @@ alloc_buffer: spin_unlock_irq(&pcpu_lock); /* there can be at most this many free and allocated fragments */ - buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int)); + buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1))); if (!buffer) return -ENOMEM; diff --git a/mm/readahead.c b/mm/readahead.c index 539bbb6c1fad..e273f0de3376 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -140,23 +140,23 @@ out: } /* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all - * the pages first, then submits them all for I/O. 
This avoids the very bad + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read, - unsigned long lookahead_size) +unsigned int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; struct page *page; unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); int page_idx; - int ret = 0; + unsigned int nr_pages = 0; loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); @@ -177,8 +177,18 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) + if (page && !radix_tree_exceptional_entry(page)) { + /* + * Page already present? Kick off the current batch of + * contiguous pages before continuing with the next + * batch. 
+ */ + if (nr_pages) + read_pages(mapping, filp, &page_pool, nr_pages, + gfp_mask); + nr_pages = 0; continue; + } page = __page_cache_alloc(gfp_mask); if (!page) @@ -187,7 +197,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, list_add(&page->lru, &page_pool); if (page_idx == nr_to_read - lookahead_size) SetPageReadahead(page); - ret++; + nr_pages++; } /* @@ -195,11 +205,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (ret) - read_pages(mapping, filp, &page_pool, ret, gfp_mask); + if (nr_pages) + read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); BUG_ON(!list_empty(&page_pool)); out: - return ret; + return nr_pages; } /* @@ -223,16 +233,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { - int err; - unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; - err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk, 0); - if (err < 0) - return err; + __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); offset += this_chunk; nr_to_read -= this_chunk; diff --git a/mm/rmap.c b/mm/rmap.c index f0dd4e4565bc..6db729dc4c50 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -942,7 +942,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * downgrading page table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ if (ret) (*cleaned)++; @@ -1374,9 +1374,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!pvmw.pte && (flags & TTU_MIGRATION)) { VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); - if (!PageAnon(page)) - continue; - set_pmd_migration_entry(&pvmw, page); continue; } @@ -1602,7 +1599,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * point at new page while a device still is using this * page. * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); } @@ -1612,7 +1609,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.txt + * See Documentation/vm/mmu_notifier.rst */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); diff --git a/mm/shmem.c b/mm/shmem.c index 9d6c7e595415..2cab84403055 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + (shmem_huge == 
SHMEM_HUGE_FORCE || sbinfo->huge) && + shmem_huge != SHMEM_HUGE_DENY) + return true; + return false; +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -682,7 +691,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); + + if (is_huge_enabled(sb_info)) + stat->blksize = HPAGE_PMD_SIZE; + return 0; } @@ -1098,13 +1112,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -1322,9 +1342,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -1353,7 +1370,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); @@ -1404,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - vma->vm_start = 0; + memset(vma, 0, sizeof(*vma)); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_ops = NULL; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); } @@ -1931,14 +1946,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in return ret; } -static int shmem_fault(struct vm_fault *vmf) +static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; - int error; - int ret = VM_FAULT_LOCKED; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can @@ -2006,10 +2021,10 @@ static int shmem_fault(struct vm_fault *vmf) else if (vma->vm_flags & VM_HUGEPAGE) sgp = SGP_HUGE; - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (err) + return vmf_error(err); return ret; } @@ -2616,241 +2631,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. 
- */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void shmem_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. 
- */ -static int shmem_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void **slot; - pgoff_t start; - struct page *page; - int error, scan; - - shmem_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. 
- */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (file->f_op == &shmem_file_operations) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. - * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. 
Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. - */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = shmem_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? 
*seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3233,7 +3013,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, + VM_NORESERVE); if (!inode) return -ENOSPC; @@ -3428,6 +3209,15 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3444,7 +3234,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } @@ -3656,7 +3446,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) sbinfo->max_blocks << (PAGE_SHIFT - 10)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); - if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) + if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", @@ 
-3673,93 +3463,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) @@ -3784,7 +3487,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) if (!sbinfo) return -ENOMEM; - sbinfo->mode = S_IRWXUGO | S_ISVTX; + sbinfo->mode = 0777 | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); sb->s_fs_info = sbinfo; @@ -4227,7 +3930,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l d_set_d_op(path.dentry, &anon_ops); res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); + inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags); if (!inode) goto put_memory; diff --git a/mm/slab.c b/mm/slab.c index 2f308253c3d7..aa76a70e087e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) @@ -2665,6 +2663,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & 
(GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3070,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -4338,7 +4338,8 @@ static int leaks_show(struct seq_file *m, void *p) if (x[0] == x[1]) { /* Increase the buffer size */ mutex_unlock(&slab_mutex); - m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + m->private = kcalloc(x[0] * 4, sizeof(unsigned long), + GFP_KERNEL); if (!m->private) { /* Too bad, we are really out */ m->private = x; diff --git a/mm/slab_common.c b/mm/slab_common.c index 98dcdc352062..890b1f04a03a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -136,6 +136,7 @@ void slab_init_memcg_params(struct kmem_cache *s) s->memcg_params.root_cache = NULL; RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); INIT_LIST_HEAD(&s->memcg_params.children); + s->memcg_params.dying = false; } static int init_memcg_params(struct kmem_cache *s, @@ -608,7 +609,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. 
*/ - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying) goto out_unlock; idx = memcg_cache_id(memcg); @@ -712,6 +713,9 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, WARN_ON_ONCE(s->memcg_params.deact_fn)) return; + if (s->memcg_params.root_cache->memcg_params.dying) + return; + /* pin memcg so that @s doesn't get destroyed in the middle */ css_get(&s->memcg_params.memcg->css); @@ -823,11 +827,36 @@ static int shutdown_memcg_caches(struct kmem_cache *s) return -EBUSY; return 0; } + +static void flush_memcg_workqueue(struct kmem_cache *s) +{ + mutex_lock(&slab_mutex); + s->memcg_params.dying = true; + mutex_unlock(&slab_mutex); + + /* + * SLUB deactivates the kmem_caches through call_rcu_sched. Make + * sure all registered rcu callbacks have been invoked. + */ + if (IS_ENABLED(CONFIG_SLUB)) + rcu_barrier_sched(); + + /* + * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB + * deactivates the memcg kmem_caches through workqueue. Make sure all + * previous workitems on workqueue are processed. 
+ */ + flush_workqueue(memcg_kmem_cache_wq); +} #else static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } + +static inline void flush_memcg_workqueue(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) @@ -845,6 +874,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; + flush_memcg_workqueue(s); + get_online_cpus(); get_online_mems(); @@ -1212,9 +1243,9 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) #ifdef CONFIG_SLAB -#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) +#define SLABINFO_RIGHTS (0600) #else -#define SLABINFO_RIGHTS S_IRUSR +#define SLABINFO_RIGHTS (0400) #endif static void print_slabinfo_header(struct seq_file *m) diff --git a/mm/slob.c b/mm/slob.c index 623e8a5c46ce..307c2c9feb44 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; diff --git a/mm/slub.c b/mm/slub.c index 44aa7847324a..a3b8467c14af 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -52,11 +52,11 @@ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. 
It is not * on any list. The processor that froze the slab is the one who can @@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return (((unsigned int)PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } static inline struct kmem_cache_order_objects oo_make(unsigned int order, - unsigned int size, unsigned int reserved) + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; @@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. 
- */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -711,7 +696,7 @@ void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) 
{ va_list args; @@ -847,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + length = PAGE_SIZE << compound_order(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -936,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = order_objects(compound_order(page), s->size, s->reserved); + maxobj = order_objects(compound_order(page), s->size); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", page->objects, maxobj); @@ -986,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + max_objects = order_objects(compound_order(page), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1694,24 +1679,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; memcg_uncharge_slab(page, order, s); __free_pages(page, order); } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) - static void rcu_free_slab(struct rcu_head *h) { - struct page *page; - - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); + struct page *page = container_of(h, struct page, rcu_head); __free_slab(page->slab_cache, page); } @@ -1719,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - struct rcu_head *head; - - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << 
order) - s->reserved; - - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; - } else { - head = &page->rcu_head; - } - - call_rcu(head, rcu_free_slab); + call_rcu(&page->rcu_head, rcu_free_slab); } else __free_slab(s, page); } @@ -2444,6 +2409,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) @@ -3226,21 +3193,21 @@ static unsigned int slub_min_objects; */ static inline unsigned int slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, - unsigned int fract_leftover, unsigned int reserved) + unsigned int fract_leftover) { unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; @@ -3249,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size, return order; } -static inline int calculate_order(unsigned int size, unsigned int reserved) +static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; @@ -3266,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, 
max_objects); while (min_objects > 1) { @@ -3275,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); + slub_max_order, fraction); if (order <= slub_max_order) return order; fraction /= 2; @@ -3287,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); + order = slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); + order = slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3562,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size, s->reserved); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -3580,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -3591,14 +3558,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s, -1)) 
goto error; if (disable_higher_order_debug) { @@ -3660,8 +3623,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * - sizeof(long), GFP_ATOMIC); + unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), + sizeof(long), + GFP_ATOMIC); if (!map) return; slab_err(s, page, text, s->name); @@ -4239,12 +4203,6 @@ void __init kmem_cache_init(void) SLAB_HWCACHE_ALIGN, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. - */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ @@ -4455,8 +4413,9 @@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * - sizeof(unsigned long), GFP_KERNEL); + unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), + sizeof(unsigned long), + GFP_KERNEL); struct kmem_cache_node *n; if (!map) @@ -4616,8 +4575,9 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * - sizeof(unsigned long), GFP_KERNEL); + unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), + sizeof(unsigned long), + GFP_KERNEL); struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), @@ -4793,7 +4753,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int x; unsigned long *nodes; - nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!nodes) return -ENOMEM; @@ -5117,12 +5077,6 @@ static ssize_t 
destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t reserved_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%u\n", s->reserved); -} -SLAB_ATTR_RO(reserved); - #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -5342,7 +5296,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) unsigned long sum = 0; int cpu; int len; - int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); if (!data) return -ENOMEM; @@ -5435,7 +5389,6 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, diff --git a/mm/sparse.c b/mm/sparse.c index 62eef264a7bd..f13f2723950a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) @@ -524,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } @@ -629,7 +627,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long 
section_nr = pfn_to_section_nr(start_pfn); + unsigned long section_nr = pfn_to_section_nr(pfn); struct mem_section *ms; /* diff --git a/mm/swap.c b/mm/swap.c index 3dd518832096..26fc9b5f1b6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -29,6 +29,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> +#include <linux/memremap.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> @@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr) flags); locked_pgdat = NULL; } - put_zone_device_private_or_public_page(page); + put_devmap_managed_page(page); continue; } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index f2641894f440..a791411fed71 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -122,12 +122,12 @@ static int alloc_swap_slot_cache(unsigned int cpu) * as kvzalloc could trigger reclaim and get_swap_page, * which can lock swap_slots_cache_mutex. */ - slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), GFP_KERNEL); if (!slots) return -ENOMEM; - slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), GFP_KERNEL); if (!slots_ret) { kvfree(slots); @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ repeat: } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 0; + } return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2340c3..ecee9c6c4cc1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -216,9 +216,6 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) - 
goto fail; - /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC @@ -623,7 +620,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL); + spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { diff --git a/mm/swapfile.c b/mm/swapfile.c index cc2cf04d9018..2cc2972eedaf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -100,7 +100,7 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0); static inline unsigned char swap_count(unsigned char ent) { - return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ + return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* returns 1 if swap entry is freed */ @@ -3112,6 +3112,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; + bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; @@ -3195,7 +3196,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->cluster_next = 1 + (prandom_u32() % p->highest_bit); nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info), + cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; @@ -3215,8 +3216,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } else + } else { atomic_inc(&nr_rotate_swap); + inced_nr_rotate_swap = true; + } error = swap_cgroup_swapon(p->type, maxpages); if (error) @@ -3230,7 +3233,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 
} /* frontswap enabled? set up bit-per-page map for frontswap */ if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long), + frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), + sizeof(long), GFP_KERNEL); if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { @@ -3307,6 +3311,8 @@ bad_swap: vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); + if (inced_nr_rotate_swap) + atomic_dec(&nr_rotate_swap); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { inode_unlock(inode); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b81ede7..5029f241908f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + bool zeropage, + bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; @@ -431,6 +432,15 @@ retry: down_read(&dst_mm->mmap_sem); /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. 
*/ @@ -563,13 +573,15 @@ out: } ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len) + unsigned long src_start, unsigned long len, + bool *mmap_changing) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, + mmap_changing); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len) + unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } diff --git a/mm/util.c b/mm/util.c index 45fc3169e7b0..3351659200e6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap); * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not + * fall back to vmalloc. */ void *kvmalloc_node(size_t size, gfp_t flags, int node) { @@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + if ((flags & GFP_KERNEL) != GFP_KERNEL) + return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because @@ -621,7 +623,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 
* Additional code 2002 Jul 20 by Robert Love. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ebff729cc956..cfea25be7754 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. 
@@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1141,16 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1499,7 +1486,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); @@ -1519,16 +1505,17 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = remove_vm_area(addr); + area = find_vmap_area((unsigned long)addr)->vm; if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + remove_vm_area(addr); if (deallocate_pages) { int i; @@ -2751,25 +2738,14 @@ static const struct seq_operations vmalloc_op = { .show = s_show, }; -static 
int vmalloc_open(struct inode *inode, struct file *file) +static int __init proc_vmalloc_init(void) { if (IS_ENABLED(CONFIG_NUMA)) - return seq_open_private(file, &vmalloc_op, - nr_node_ids * sizeof(unsigned int)); + proc_create_seq_private("vmallocinfo", 0400, NULL, + &vmalloc_op, + nr_node_ids * sizeof(unsigned int), NULL); else - return seq_open(file, &vmalloc_op); -} - -static const struct file_operations proc_vmalloc_operations = { - .open = vmalloc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -static int __init proc_vmalloc_init(void) -{ - proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); + proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); return 0; } module_init(proc_vmalloc_init); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce2d25d..4854584ec436 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } -static enum vmpressure_levels str_to_level(const char *arg) -{ - enum vmpressure_levels level; - - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) - if (!strcmp(vmpressure_str_levels[level], arg)) - return level; - return -1; -} - -static enum vmpressure_modes str_to_mode(const char *arg) -{ - enum vmpressure_modes mode; - - for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) - if (!strcmp(vmpressure_str_modes[mode], arg)) - return mode; - return -1; -} - #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** @@ -390,27 +370,26 @@ int vmpressure_register_event(struct mem_cgroup *memcg, char *token; int ret = 0; - spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } - strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); /* Find required level */ token = strsep(&spec, ","); - level = 
str_to_level(token); - if (level == -1) { - ret = -EINVAL; + level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (level < 0) { + ret = level; goto out; } /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = str_to_mode(token); - if (mode == -1) { - ret = -EINVAL; + mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (mode < 0) { + ret = mode; goto out; } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b920ce3ae02..03822f86f288 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -303,7 +303,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone /* * Add a shrinker callback to be called from the vm. */ -int register_shrinker(struct shrinker *shrinker) +int prealloc_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); @@ -313,10 +313,29 @@ int register_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + return 0; +} + +void free_prealloced_shrinker(struct shrinker *shrinker) +{ + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} +void register_shrinker_prepared(struct shrinker *shrinker) +{ down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); +} + +int register_shrinker(struct shrinker *shrinker) +{ + int err = prealloc_shrinker(shrinker); + + if (err) + return err; + register_shrinker_prepared(shrinker); return 0; } EXPORT_SYMBOL(register_shrinker); @@ -1399,7 +1418,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) return ret; mapping = page_mapping(page); - migrate_dirty = mapping && mapping->a_ops->migratepage; + migrate_dirty = !mapping || mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) return ret; @@ -2525,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if 
(mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; @@ -3299,11 +3334,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + + __fs_reclaim_acquire(); + count_vm_event(PAGEOUTRUN); do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool ret; sc.reclaim_idx = classzone_idx; @@ -3376,7 +3415,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + __fs_reclaim_release(); + ret = try_to_freeze(); + __fs_reclaim_acquire(); + if (ret || kthread_should_stop()) break; /* @@ -3393,6 +3435,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. 
If another caller @@ -3581,9 +3624,7 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); - fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); - fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3665,16 +3706,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned long nr_reclaimed; unsigned int noreclaim_flag; - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } @@ -3851,6 +3892,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in }; cond_resched(); + fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE @@ -3858,7 +3900,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3873,9 +3914,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 536332e988b8..75eda9c2b260 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,7 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", - 
"nr_indirectly_reclaimable", + "", /* nr_indirectly_reclaimable */ /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1516,18 +1516,6 @@ static const struct seq_operations fragmentation_op = { .show = frag_show, }; -static int fragmentation_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &fragmentation_op); -} - -static const struct file_operations buddyinfo_file_operations = { - .open = fragmentation_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static const struct seq_operations pagetypeinfo_op = { .start = frag_start, .next = frag_next, @@ -1535,18 +1523,6 @@ static const struct seq_operations pagetypeinfo_op = { .show = pagetypeinfo_show, }; -static int pagetypeinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &pagetypeinfo_op); -} - -static const struct file_operations pagetypeinfo_file_operations = { - .open = pagetypeinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) { int zid; @@ -1663,18 +1639,6 @@ static const struct seq_operations zoneinfo_op = { .show = zoneinfo_show, }; -static int zoneinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &zoneinfo_op); -} - -static const struct file_operations zoneinfo_file_operations = { - .open = zoneinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - enum writeback_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, @@ -1740,6 +1704,10 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; + /* Skip hidden vmstat items. 
*/ + if (*vmstat_text[off] == '\0') + return 0; + seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); @@ -1758,18 +1726,6 @@ static const struct seq_operations vmstat_op = { .stop = vmstat_stop, .show = vmstat_show, }; - -static int vmstat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &vmstat_op); -} - -static const struct file_operations vmstat_file_operations = { - .open = vmstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP @@ -2016,10 +1972,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); - proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); - proc_create("vmstat", 0444, NULL, &vmstat_file_operations); - proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); + proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); + proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op); + proc_create_seq("vmstat", 0444, NULL, &vmstat_op); + proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); #endif } diff --git a/mm/z3fold.c b/mm/z3fold.c index c0bca6153b95..4b366d181f35 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -144,7 +144,8 @@ enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, - PAGE_STALE + PAGE_STALE, + UNDER_RECLAIM }; /***************** @@ -173,6 +174,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); + clear_bit(UNDER_RECLAIM, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -756,6 +758,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } + if (test_bit(UNDER_RECLAIM, &page->private)) { + 
z3fold_page_unlock(zhdr); + return; + } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); return; @@ -840,6 +846,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; + set_bit(UNDER_RECLAIM, &page->private); + break; } list_del_init(&page->lru); @@ -887,25 +895,35 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: - spin_lock(&pool->lock); if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { - spin_unlock(&pool->lock); free_z3fold_page(page); return 0; } - } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { - atomic64_dec(&pool->pages_nr); + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + } else { + z3fold_page_lock(zhdr); + clear_bit(UNDER_RECLAIM, &page->private); + if (kref_put(&zhdr->refcount, + release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return 0; + } + /* + * if we are here, the page is still not completely + * free. Take the global pool lock then to be able + * to add it back to the lru list + */ + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); - return 0; + z3fold_page_unlock(zhdr); } - /* - * Add to the beginning of LRU. 
- * Pool lock has to be kept here to ensure the page has - * not already been released - */ - list_add(&page->lru, &pool->lru); + /* We started off locked to we need to lock the pool back */ + spin_lock(&pool->lock); } spin_unlock(&pool->lock); return -EAGAIN; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 61cb05dc950c..8d87e973a4f5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -661,8 +661,9 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) } pool->stat_dentry = entry; - entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stats_size_fops); + entry = debugfs_create_file("classes", S_IFREG | 0444, + pool->stat_dentry, pool, + &zs_stats_size_fops); if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); diff --git a/mm/zswap.c b/mm/zswap.c index 61a5c41972db..7d34e69507e3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1256,26 +1256,26 @@ static int __init zswap_debugfs_init(void) if (!zswap_debugfs_root) return -ENOMEM; - debugfs_create_u64("pool_limit_hit", S_IRUGO, - zswap_debugfs_root, &zswap_pool_limit_hit); - debugfs_create_u64("reject_reclaim_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_reclaim_fail); - debugfs_create_u64("reject_alloc_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_alloc_fail); - debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_kmemcache_fail); - debugfs_create_u64("reject_compress_poor", S_IRUGO, - zswap_debugfs_root, &zswap_reject_compress_poor); - debugfs_create_u64("written_back_pages", S_IRUGO, - zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", S_IRUGO, - zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_total_size", S_IRUGO, - zswap_debugfs_root, &zswap_pool_total_size); - debugfs_create_atomic_t("stored_pages", S_IRUGO, - zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_u64("pool_limit_hit", 0444, + zswap_debugfs_root, 
&zswap_pool_limit_hit); + debugfs_create_u64("reject_reclaim_fail", 0444, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", 0444, + zswap_debugfs_root, &zswap_reject_alloc_fail); + debugfs_create_u64("reject_kmemcache_fail", 0444, + zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_poor", 0444, + zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("written_back_pages", 0444, + zswap_debugfs_root, &zswap_written_back_pages); + debugfs_create_u64("duplicate_entry", 0444, + zswap_debugfs_root, &zswap_duplicate_entry); + debugfs_create_u64("pool_total_size", 0444, + zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_atomic_t("stored_pages", 0444, + zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); + zswap_debugfs_root, &zswap_same_filled_pages); return 0; } |