summaryrefslogtreecommitdiffstats
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c327
1 files changed, 248 insertions, 79 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..124324134ff6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -45,6 +49,8 @@
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -76,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -181,7 +187,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
* signal.
*/
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+ unsigned long pfn, struct page *page)
{
struct siginfo si;
int ret;
@@ -196,7 +202,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
+ si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -233,7 +239,7 @@ void shake_page(struct page *p, int access)
int nr;
do {
nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
@@ -266,7 +272,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -307,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -325,7 +331,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+ int fail, struct page *page, unsigned long pfn)
{
struct to_kill *tk, *next;
@@ -350,7 +356,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* process anyways.
*/
else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ pfn, page) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -575,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -689,17 +695,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
/*
* Huge pages. Needs work.
* Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- return FAILED;
+ int res = 0;
+ struct page *hpage = compound_head(p);
+ /*
+ * We can safely recover from error on free or reserved (i.e.
+ * not in-use) hugepage by dequeuing it from freelist.
+ * To check whether a hugepage is in-use or not, we can't use
+ * page->lru because it can be used in other hugepage operations,
+ * such as __unmap_hugepage_range() and gather_surplus_pages().
+ * So instead we use page_mapping() and PageAnon().
+ * We assume that this function is called with page lock held,
+ * so there is no race between isolation and mapping/unmapping.
+ */
+ if (!(page_mapping(hpage) || PageAnon(hpage))) {
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
+ }
+ return DELAYED;
}
/*
@@ -822,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -835,8 +851,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
+ struct page *hpage = compound_head(p);
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -845,10 +861,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(p))
+ if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageCompound(p) || PageKsm(p))
+ if (PageKsm(p))
return SWAP_FAIL;
if (PageSwapCache(p)) {
@@ -863,10 +879,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(p);
- if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
- if (page_mkclean(p)) {
- SetPageDirty(p);
+ mapping = page_mapping(hpage);
+ if (!PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
@@ -885,22 +902,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(p, &tokill);
-
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(p, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
+ collect_procs(hpage, &tokill);
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -911,17 +918,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
- ret != SWAP_SUCCESS, pfn);
+ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+ ret != SWAP_SUCCESS, p, pfn);
return ret;
}
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ ClearPageHWPoison(hpage + i);
+}
+
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
+ struct page *hpage;
int res;
+ unsigned int nr_pages;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -934,18 +959,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
- atomic_long_add(1, &mce_bad_pages);
+ nr_pages = 1 << compound_order(hpage);
+ atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -953,10 +983,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(compound_head(p))) {
+ !get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -971,9 +1019,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
- if (!PageLRU(p)) {
+ if (!PageLRU(p) && !PageHuge(p)) {
/*
* shake_page could have turned it free.
*/
@@ -991,7 +1039,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(p);
+ lock_page_nosync(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1003,12 +1051,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- unlock_page(p);
- put_page(p);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ unlock_page(hpage);
+ put_page(hpage);
return 0;
}
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
+
wait_on_page_writeback(p);
/*
@@ -1038,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
}
out:
- unlock_page(p);
+ unlock_page(hpage);
return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1082,6 +1150,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned int nr_pages;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1090,14 +1159,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
+ nr_pages = 1 << compound_order(page);
+
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1108,10 +1189,12 @@ int unpoison_memory(unsigned long pfn)
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
- if (TestClearPageHWPoison(p)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_dec(&mce_bad_pages);
+ if (TestClearPageHWPoison(page)) {
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
@@ -1126,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1154,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+ if (PageHuge(p)) {
+ pr_info("get_any_page: %#lx free huge page\n", pfn);
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
+ pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+ pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags);
ret = -EIO;
}
@@ -1174,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return ret;
}
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
+ if (PageHWPoison(hpage)) {
+ put_page(hpage);
+ pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+
+ list_add(&hpage->lru, &pagelist);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ if (ret) {
+ putback_lru_pages(&pagelist);
+ pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0)
+ ret = -EIO;
+ return ret;
+ }
+done:
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ /* keep elevated page count for bad page */
+ return ret;
+}
+
/**
* soft_offline_page - Soft offline a page.
* @page: page to offline
@@ -1201,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+
ret = get_any_page(page, pfn, flags);
if (ret < 0)
return ret;
@@ -1227,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
goto done;
}
if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
@@ -1241,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1262,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
put_page(page);
if (ret == 1) {
ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
}
@@ -1278,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
if (ret)
@@ -1296,3 +1433,35 @@ done:
/* keep elevated page count for bad page */
return ret;
}
+
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+ pgd_t *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t pte, *ptep;
+ swp_entry_t entry;
+
+ pgdp = pgd_offset(current->mm, addr);
+ if (!pgd_present(*pgdp))
+ return 0;
+ pudp = pud_offset(pgdp, addr);
+ pud = *pudp;
+ if (!pud_present(pud) || pud_large(pud))
+ return 0;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = *pmdp;
+ if (!pmd_present(pmd) || pmd_large(pmd))
+ return 0;
+ ptep = pte_offset_map(pmdp, addr);
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (!is_swap_pte(pte))
+ return 0;
+ entry = pte_to_swp_entry(pte);
+ return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);