From 58d002097b98664e2a39cc708f30d11549d870b2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Nov 2012 13:54:20 -0800 Subject: mm: compaction: fix return value of capture_free_page() Commit ef6c5be658f6 ("fix incorrect NR_FREE_PAGES accounting (appears like memory leak)") fixes a NR_FREE_PAGE accounting leak but missed the return value which was also missed by this reviewer until today. That return value is used by compaction when adding pages to a list of isolated free pages and without this follow-up fix, there is a risk of free list corruption. Signed-off-by: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 92871579cbee..7e208f0ad68c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1422,7 +1422,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << order; + return 1UL << alloc_order; } /* -- cgit v1.2.3-55-g7522 From ae64ffcac35de0db628ba9631edf8ff34c5cd7ac Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Thu, 29 Nov 2012 13:54:21 -0800 Subject: mm/vmemmap: fix wrong use of virt_to_page I enable CONFIG_DEBUG_VIRTUAL and CONFIG_SPARSEMEM_VMEMMAP, when doing memory hotremove, there is a kernel BUG at arch/x86/mm/physaddr.c:20. It is caused by free_section_usemap()->virt_to_page(), virt_to_page() is only used for kernel direct mapping address, but sparse-vmemmap uses vmemmap address, so it is going wrong here. ------------[ cut here ]------------ kernel BUG at arch/x86/mm/physaddr.c:20! invalid opcode: 0000 [#1] SMP Modules linked in: acpihp_drv acpihp_slot edd cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf fuse vfat fat loop dm_mod coretemp kvm crc32c_intel ipv6 ixgbe igb iTCO_wdt i7core_edac edac_core pcspkr iTCO_vendor_support ioatdma microcode joydev sr_mod i2c_i801 dca lpc_ich mfd_core mdio tpm_tis i2c_core hid_generic tpm cdrom sg tpm_bios rtc_cmos button ext3 jbd mbcache usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif processor thermal_sys hwmon scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh_emc scsi_dh ata_generic ata_piix libata megaraid_sas scsi_mod CPU 39 Pid: 6454, comm: sh Not tainted 3.7.0-rc1-acpihp-final+ #45 QCI QSSC-S4R/QSSC-S4R RIP: 0010:[] [] __phys_addr+0x88/0x90 RSP: 0018:ffff8804440d7c08 EFLAGS: 00010006 RAX: 0000000000000006 RBX: ffffea0012000000 RCX: 000000000000002c ... Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reviewd-by: Wen Congyang Acked-by: Johannes Weiner Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index fac95f2888f2..a83de2f72b30 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { return; /* XXX: Not implemented yet */ } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { } #else @@ -658,10 +658,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) get_order(sizeof(struct page) * nr_pages)); } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; unsigned long magic; + struct page *page = virt_to_page(memmap); for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -710,13 +711,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) */ if (memmap) { - struct page *memmap_page; - memmap_page = virt_to_page(memmap); - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT; - free_map_bootmem(memmap_page, nr_pages); + free_map_bootmem(memmap, nr_pages); } } -- cgit v1.2.3-55-g7522 From 60cefed485a02bd99b6299dad70666fe49245da7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 29 Nov 2012 13:54:23 -0800 Subject: mm: vmscan: fix endless loop in kswapd balancing Kswapd does not in all places have the same criteria for a balanced zone. Zones are only being reclaimed when their high watermark is breached, but compaction checks loop over the zonelist again when the zone does not meet the low watermark plus two times the size of the allocation. This gets kswapd stuck in an endless loop over a small zone, like the DMA zone, where the high watermark is smaller than the compaction requirement. Add a function, zone_balanced(), that checks the watermark, and, for higher order allocations, if compaction has enough free memory. Then use it uniformly to check for balanced zones. This makes sure that when the compaction watermark is not met, at least reclaim happens and progress is made - or the zone is declared unreclaimable at some point and skipped entirely. Signed-off-by: Johannes Weiner Reported-by: George Spelvin Reported-by: Johannes Hirte Reported-by: Tomas Racek Tested-by: Johannes Hirte Reviewed-by: Rik van Riel Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cbf84e152f04..83f4d0e85601 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2414,6 +2414,19 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } +static bool zone_balanced(struct zone *zone, int order, + unsigned long balance_gap, int classzone_idx) +{ + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + + balance_gap, classzone_idx, 0)) + return false; + + if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) + return false; + + return true; +} + /* * pgdat_balanced is used when checking if a node is balanced for high-order * allocations. Only zones that meet watermarks and are in a zone allowed @@ -2492,8 +2505,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, continue; } - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - i, 0)) + if (!zone_balanced(zone, order, 0, i)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2602,8 +2614,7 @@ loop_again: break; } - if (!zone_watermark_ok_safe(zone, order, - high_wmark_pages(zone), 0, 0)) { + if (!zone_balanced(zone, order, 0, 0)) { end_zone = i; break; } else { @@ -2679,9 +2690,8 @@ loop_again: testorder = 0; if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_watermark_ok_safe(zone, testorder, - high_wmark_pages(zone) + balance_gap, - end_zone, 0)) { + !zone_balanced(zone, testorder, + balance_gap, end_zone)) { shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2708,8 +2718,7 @@ loop_again: continue; } - if (!zone_watermark_ok_safe(zone, testorder, - high_wmark_pages(zone), end_zone, 0)) { + if (!zone_balanced(zone, testorder, 0, end_zone)) { all_zones_ok = 0; /* * We are still under min water mark. This -- cgit v1.2.3-55-g7522 From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 Nov 2012 13:54:27 -0800 Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD"" It apepars that this patch was innocent, and we hope that "mm: avoid waking kswapd for THP allocations when compaction is deferred or contended" will fix the final kswapd-spinning cause. Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/mtdcore.c | 6 ++---- include/linux/gfp.h | 13 +++++-------- include/trace/events/gfpflags.h | 1 - mm/page_alloc.c | 7 +++---- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index ec794a72975d..374c46dff7dd 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1077,8 +1077,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); * until the request succeeds or until the allocation size falls below * the system page size. This attempts to make sure it does not adversely * impact system performance, so when allocating more than one page, we - * ask the memory allocator to avoid re-trying, swapping, writing back - * or performing I/O. + * ask the memory allocator to avoid re-trying. * * Note, this function also makes sure that the allocated buffer is aligned to * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. @@ -1092,8 +1091,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d0a79678f169..76e1aa206f57 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,10 +30,9 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_NOTRACK 0x200000u -#define ___GFP_NO_KSWAPD 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u +#define ___GFP_NOTRACK 0x100000u +#define ___GFP_OTHER_NODE 0x200000u +#define ___GFP_WRITE 0x400000u /* * GFP bitmasks.. @@ -86,7 +85,6 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ @@ -96,7 +94,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -116,8 +114,7 @@ struct vm_area_struct; __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..9391706e9254 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,7 +36,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..8193809f3de0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2495,7 +2494,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.2.3-55-g7522 From 782fd30406ecb9d9b082816abe0c6008fc72a7b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Nov 2012 13:54:30 -0800 Subject: mm: avoid waking kswapd for THP allocations when compaction is deferred or contended With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there are a storm of THP requests that are simply rejected, it will still be the the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loopp, shrinking a small number of pages and calling shrink_slab() on each iteration. This patch defers when kswapd gets woken up for THP allocations. For !THP allocations, kswapd is always woken up. For THP allocations, kswapd is woken up iff the process is willing to enter into direct reclaim/compaction. [akpm@linux-foundation.org: fix typo in comment] Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Valdis Kletnieks Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8193809f3de0..a8f2c87792c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2378,6 +2378,15 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +/* Returns true if the allocation is likely for THP */ +static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order) +{ + if (order == pageblock_order && + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + return true; + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2416,7 +2425,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + /* The decision whether to wake kswapd for THP is made later */ + if (!is_thp_alloc(gfp_mask, order)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* @@ -2487,15 +2498,21 @@ rebalance: goto got_pg; sync_migration = true; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. - */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) - goto nopage; + if (is_thp_alloc(gfp_mask, order)) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a movable allocation that does not + * heavily disrupt the system then fail the allocation instead + * of entering direct reclaim. + */ + if (deferred_compaction || contended_compaction) + goto nopage; + + /* If process is willing to reclaim/compact then wake kswapd */ + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); + } /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, -- cgit v1.2.3-55-g7522 From 783657a7dc20e5c0efbc9a09a9dd38e238a723da Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 29 Nov 2012 13:54:34 -0800 Subject: mm: soft offline: split thp at the beginning of soft_offline_page() When we try to soft-offline a thp tail page, put_page() is called on the tail page unthinkingly and VM_BUG_ON is triggered in put_compound_page(). This patch splits thp before going into the main body of soft-offlining. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Cc: Andi Kleen Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6c5899b9034a..8b20278be6a6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); if (PageHuge(page)) return soft_offline_huge_page(page, flags); + if (PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } ret = get_any_page(page, pfn, flags); if (ret < 0) -- cgit v1.2.3-55-g7522 From 1430e17844e2cd15bc15f3c21b5349e762f6ba41 Mon Sep 17 00:00:00 2001 From: Kim, Milo Date: Thu, 29 Nov 2012 13:54:36 -0800 Subject: drivers/rtc/rtc-tps65910.c: fix invalid pointer access on _remove() The tps65910_rtc data is registered as the platform driver data in _probe(= ). Therefore the tps65910_rtc should be used on unregistering the rtc device. And device pointer should be retrieved from the platform_device structure. This patch fixes the below oops: Unable to handle kernel NULL pointer dereference at virtual address 00000008 Modules linked in: rtc_tps65910(-) CPU: 0 Not tainted (3.7.0-rc7-next-20121128-g6b1f974-dirty #7) PC is at tps65910_rtc_alarm_irq_enable+0x20/0x2c [rtc_tps65910] (tps65910_rtc_alarm_irq_enable+0x20/0x2c [rtc_tps65910]) (tps65910_rtc_remove+0x18/0x28 [rtc_tps65910]) (platform_drv_remove+0x18/0x1c) (__device_release_driver+0x70/0xcc) (driver_detach+0xb4/0xb8) (bus_remove_driver+0x7c/0xc0) (sys_delete_module+0x148/0x21c) Signed-off-by: Milo(Woogyom) Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-tps65910.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index 7a82337e4dee..073108dcf9e7 100644 --- a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -288,11 +288,11 @@ static int __devinit tps65910_rtc_probe(struct platform_device *pdev) static int __devexit tps65910_rtc_remove(struct platform_device *pdev) { /* leave rtc running, but disable irqs */ - struct rtc_device *rtc = platform_get_drvdata(pdev); + struct tps65910_rtc *tps_rtc = platform_get_drvdata(pdev); - tps65910_rtc_alarm_irq_enable(&rtc->dev, 0); + tps65910_rtc_alarm_irq_enable(&pdev->dev, 0); - rtc_device_unregister(rtc); + rtc_device_unregister(tps_rtc->rtc); return 0; } -- cgit v1.2.3-55-g7522