From 42139eb356e3384759ca143ae04d82376346eb4c Mon Sep 17 00:00:00 2001 From: Chen, Gong Date: Fri, 6 Dec 2013 01:17:10 -0500 Subject: ACPI, eMCA: Combine eMCA/EDAC event reporting priority eMCA has higher H/W event reporting priority. Once it is loaded, EDAC event reporting should be disabled, unless EDAC overrides eMCA priority via command line parameter "edac_report=force". Signed-off-by: Chen, Gong Acked-by: Tony Luck Link: http://lkml.kernel.org/r/1386310630-12529-4-git-send-email-gong.chen@linux.intel.com [ Boris: massage printk message. ] Signed-off-by: Borislav Petkov --- drivers/acpi/acpi_extlog.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index a6869e110ce5..5d33c5415405 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,8 @@ struct extlog_l1_head { u8 rev1[12]; }; +static int old_edac_report_status; + static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295"; /* L1 table related physical address */ @@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); - return NOTIFY_DONE; + return NOTIFY_STOP; } static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret) @@ -231,8 +234,12 @@ static int __init extlog_init(void) u64 cap; int rc; - rc = -ENODEV; + if (get_edac_report_status() == EDAC_REPORTING_FORCE) { + pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); + return -EPERM; + } + rc = -ENODEV; rdmsrl(MSR_IA32_MCG_CAP, cap); if (!(cap & MCG_ELOG_P)) return rc; @@ -287,6 +294,12 @@ static int __init extlog_init(void) if (elog_buf == NULL) goto err_release_elog; + /* + * eMCA event report method has higher priority than EDAC method, + * unless EDAC event report method is mandatory. + */ + old_edac_report_status = get_edac_report_status(); + set_edac_report_status(EDAC_REPORTING_DISABLED); mce_register_decode_chain(&extlog_mce_dec); /* enable OS to be involved to take over management from BIOS */ ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN; @@ -308,6 +321,7 @@ err: static void __exit extlog_exit(void) { + set_edac_report_status(old_edac_report_status); mce_unregister_decode_chain(&extlog_mce_dec); ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; if (extlog_l1_addr) -- cgit v1.2.3-55-g7522 From 3482fb5e0c1c20ce0dbcfc5ca3b6558a8c455b10 Mon Sep 17 00:00:00 2001 From: Luck, Tony Date: Wed, 6 Nov 2013 13:30:36 -0800 Subject: ACPI, APEI, EINJ: Changes to the ACPI/APEI/EINJ debugfs interface When I added support for ACPI5 I made the assumption that injected processor errors would just need to know the APICID, memory errors just the address and mask, and PCIe errors just the segment/bus/device/function. So I had the code check the type of injection and multiplex the "param1" value appropriately. This was not a good assumption :-( There are injection scenarios where we need to specify more than one of these items. E.g. injecting a cache error we need to specify an APICID of the cpu that owns the cache, and also an address (so that we can trip the error by accessing the address). Add a "flags" file to give the user direct access to specify which items are valid in the ACPI SET_ERROR_TYPE_WITH_ADDRESS structure. Also add new files param3 and param4 to hold all these values. For backwards compatability with old injection scripts we maintain the old behaviour if flags remains set at zero (or is reset to 0). Acked-by: Naveen N. Rao Signed-off-by: Tony Luck --- Documentation/acpi/apei/einj.txt | 19 ++++++++++++++++++- drivers/acpi/apei/einj.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 6 deletions(-) (limited to 'drivers/acpi') diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt index a58b63da1a36..f51861bcb07b 100644 --- a/Documentation/acpi/apei/einj.txt +++ b/Documentation/acpi/apei/einj.txt @@ -45,11 +45,22 @@ directory apei/einj. The following files are provided. injection. Before this, please specify all necessary error parameters. +- flags + Present for kernel version 3.13 and above. Used to specify which + of param{1..4} are valid and should be used by BIOS during injection. + Value is a bitmask as specified in ACPI5.0 spec for the + SET_ERROR_TYPE_WITH_ADDRESS data structure: + Bit 0 - Processor APIC field valid (see param3 below) + Bit 1 - Memory address and mask valid (param1 and param2) + Bit 2 - PCIe (seg,bus,dev,fn) valid (param4 below) + If set to zero, legacy behaviour is used where the type of injection + specifies just one bit set, and param1 is multiplexed. + - param1 This file is used to set the first error parameter value. Effect of parameter depends on error_type specified. For example, if error type is memory related type, the param1 should be a valid physical - memory address. + memory address. [Unless "flag" is set - see above] - param2 This file is used to set the second error parameter value. Effect of @@ -58,6 +69,12 @@ directory apei/einj. The following files are provided. address mask. Linux requires page or narrower granularity, say, 0xfffffffffffff000. +- param3 + Used when the 0x1 bit is set in "flag" to specify the APIC id + +- param4 + Used when the 0x4 bit is set in "flag" to specify target PCIe device + - notrigger The EINJ mechanism is a two step process. First inject the error, then perform some actions to trigger it. Setting "notrigger" to 1 skips the diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index fb57d03e698b..c76674e2a01f 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -416,7 +416,8 @@ out: return rc; } -static int __einj_error_inject(u32 type, u64 param1, u64 param2) +static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, + u64 param3, u64 param4) { struct apei_exec_context ctx; u64 val, trigger_paddr, timeout = FIRMWARE_TIMEOUT; @@ -446,6 +447,12 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) break; } v5param->flags = vendor_flags; + } else if (flags) { + v5param->flags = flags; + v5param->memory_address = param1; + v5param->memory_address_range = param2; + v5param->apicid = param3; + v5param->pcie_sbdf = param4; } else { switch (type) { case ACPI_EINJ_PROCESSOR_CORRECTABLE: @@ -514,11 +521,17 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2) } /* Inject the specified hardware error */ -static int einj_error_inject(u32 type, u64 param1, u64 param2) +static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, + u64 param3, u64 param4) { int rc; unsigned long pfn; + /* If user manually set "flags", make sure it is legal */ + if (flags && (flags & + ~(SETWA_FLAGS_APICID|SETWA_FLAGS_MEM|SETWA_FLAGS_PCIE_SBDF))) + return -EINVAL; + /* * We need extra sanity checks for memory errors. * Other types leap directly to injection. @@ -532,7 +545,7 @@ static int einj_error_inject(u32 type, u64 param1, u64 param2) if (type & ACPI5_VENDOR_BIT) { if (vendor_flags != SETWA_FLAGS_MEM) goto inject; - } else if (!(type & MEM_ERROR_MASK)) + } else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM)) goto inject; /* @@ -546,15 +559,18 @@ static int einj_error_inject(u32 type, u64 param1, u64 param2) inject: mutex_lock(&einj_mutex); - rc = __einj_error_inject(type, param1, param2); + rc = __einj_error_inject(type, flags, param1, param2, param3, param4); mutex_unlock(&einj_mutex); return rc; } static u32 error_type; +static u32 error_flags; static u64 error_param1; static u64 error_param2; +static u64 error_param3; +static u64 error_param4; static struct dentry *einj_debug_dir; static int available_error_type_show(struct seq_file *m, void *v) @@ -648,7 +664,8 @@ static int error_inject_set(void *data, u64 val) if (!error_type) return -EINVAL; - return einj_error_inject(error_type, error_param1, error_param2); + return einj_error_inject(error_type, error_flags, error_param1, error_param2, + error_param3, error_param4); } DEFINE_SIMPLE_ATTRIBUTE(error_inject_fops, NULL, @@ -729,6 +746,10 @@ static int __init einj_init(void) rc = -ENOMEM; einj_param = einj_get_parameter_address(); if ((param_extension || acpi5) && einj_param) { + fentry = debugfs_create_x32("flags", S_IRUSR | S_IWUSR, + einj_debug_dir, &error_flags); + if (!fentry) + goto err_unmap; fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, einj_debug_dir, &error_param1); if (!fentry) @@ -737,6 +758,14 @@ static int __init einj_init(void) einj_debug_dir, &error_param2); if (!fentry) goto err_unmap; + fentry = debugfs_create_x64("param3", S_IRUSR | S_IWUSR, + einj_debug_dir, &error_param3); + if (!fentry) + goto err_unmap; + fentry = debugfs_create_x64("param4", S_IRUSR | S_IWUSR, + einj_debug_dir, &error_param4); + if (!fentry) + goto err_unmap; fentry = debugfs_create_x32("notrigger", S_IRUSR | S_IWUSR, einj_debug_dir, ¬rigger); -- cgit v1.2.3-55-g7522 From addccbb264e5e0e5762f4893f6df24afad327c8c Mon Sep 17 00:00:00 2001 From: Chen, Gong Date: Mon, 25 Nov 2013 02:15:00 -0500 Subject: ACPI, APEI, GHES: Do not report only correctable errors with SCI Currently SCI is employed to handle corrected errors - memory corrected errors, more specifically but in fact SCI still can be used to handle any errors, e.g. uncorrected or even fatal ones if enabled by the BIOS. Enable logging for those kinds of errors too. Signed-off-by: Chen, Gong Acked-by: Naveen N. Rao Cc: Tony Luck Link: http://lkml.kernel.org/r/1385363701-12387-1-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message, rename function arg. ] Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 14 ++++++++++---- drivers/acpi/apei/ghes.c | 3 +-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/acpi') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index de8b60a53f69..a1aef9533154 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -33,22 +33,28 @@ #include #include #include +#include #include #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; - /* Only corrected MC is reported */ - if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); m.bank = 1; - /* Fake a memory read corrected error with unknown channel */ + /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + m.addr = mem_err->physical_addr; mce_log(&m); mce_notify_irq(); diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a30bc313787b..ce3683d93a13 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -453,8 +453,7 @@ static void ghes_do_proc(struct ghes *ghes, ghes_edac_report_mem_error(ghes, sev, mem_err); #ifdef CONFIG_X86_MCE - apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, - mem_err); + apei_mce_report_mem_error(sev, mem_err); #endif ghes_handle_memory_failure(gdata, sev); } -- cgit v1.2.3-55-g7522 From d3ab3edc029bf79b09f91d6a22881c24ecaeb000 Mon Sep 17 00:00:00 2001 From: Chen, Gong Date: Wed, 18 Dec 2013 01:30:49 -0500 Subject: ACPI, APEI: Cleanup alignment-aware accesses We do use memcpy to avoid access alignment issues between firmware and OS. Now we can use a better and standard way to avoid this issue. While at it, simplify some variable names to avoid the 80 cols limit and use structure assignment instead of unnecessary memcpy. No functional changes. Because ERST record id cache is implemented in memory to increase the access speed via caching ERST content we can refrain from using memcpy there too and use regular assignment instead. Signed-off-by: Chen, Gong Cc: Cc: Tony Luck Link: http://lkml.kernel.org/r/1387348249-20014-1-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message a bit. ] Signed-off-by: Borislav Petkov --- drivers/acpi/apei/apei-base.c | 4 ++-- drivers/acpi/apei/einj.c | 19 +++++++++---------- drivers/acpi/apei/erst.c | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 6d2c49b86b7f..e55584a072c6 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "apei-internal.h" @@ -567,8 +568,7 @@ static int apei_check_gar(struct acpi_generic_address *reg, u64 *paddr, bit_offset = reg->bit_offset; access_size_code = reg->access_width; space_id = reg->space_id; - /* Handle possible alignment issues */ - memcpy(paddr, ®->address, sizeof(*paddr)); + *paddr = get_unaligned(®->address); if (!*paddr) { pr_warning(FW_BUG APEI_PFX "Invalid physical address in GAR [0x%llx/%u/%u/%u/%u]\n", diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index fb57d03e698b..361177a9df3a 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "apei-internal.h" @@ -216,7 +217,7 @@ static void check_vendor_extension(u64 paddr, static void *einj_get_parameter_address(void) { int i; - u64 paddrv4 = 0, paddrv5 = 0; + u64 pa_v4 = 0, pa_v5 = 0; struct acpi_whea_header *entry; entry = EINJ_TAB_ENTRY(einj_tab); @@ -225,30 +226,28 @@ static void *einj_get_parameter_address(void) entry->instruction == ACPI_EINJ_WRITE_REGISTER && entry->register_region.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) - memcpy(&paddrv4, &entry->register_region.address, - sizeof(paddrv4)); + pa_v4 = get_unaligned(&entry->register_region.address); if (entry->action == ACPI_EINJ_SET_ERROR_TYPE_WITH_ADDRESS && entry->instruction == ACPI_EINJ_WRITE_REGISTER && entry->register_region.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) - memcpy(&paddrv5, &entry->register_region.address, - sizeof(paddrv5)); + pa_v5 = get_unaligned(&entry->register_region.address); entry++; } - if (paddrv5) { + if (pa_v5) { struct set_error_type_with_address *v5param; - v5param = acpi_os_map_memory(paddrv5, sizeof(*v5param)); + v5param = acpi_os_map_memory(pa_v5, sizeof(*v5param)); if (v5param) { acpi5 = 1; - check_vendor_extension(paddrv5, v5param); + check_vendor_extension(pa_v5, v5param); return v5param; } } - if (param_extension && paddrv4) { + if (param_extension && pa_v4) { struct einj_parameter *v4param; - v4param = acpi_os_map_memory(paddrv4, sizeof(*v4param)); + v4param = acpi_os_map_memory(pa_v4, sizeof(*v4param)); if (!v4param) return NULL; if (v4param->reserved1 || v4param->reserved2) { diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 26311f23c824..bf30a12f1988 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -611,7 +611,7 @@ static void __erst_record_id_cache_compact(void) if (entries[i] == APEI_ERST_INVALID_RECORD_ID) continue; if (wpos != i) - memcpy(&entries[wpos], &entries[i], sizeof(entries[i])); + entries[wpos] = entries[i]; wpos++; } erst_record_id_cache.len = wpos; -- cgit v1.2.3-55-g7522 From ca104edc17841da87850b20ab77e57fe0a99ead6 Mon Sep 17 00:00:00 2001 From: Chen, Gong Date: Mon, 25 Nov 2013 02:15:01 -0500 Subject: ACPI, APEI, GHES: Cleanup ghes memory error handling Cleanup the logic in ghes_handle_memory_failure(). While at it, add proper PFN validity check for UC error and cleanup the code logic to make it simpler and cleaner. Signed-off-by: Chen, Gong Acked-by: Naveen N. Rao Link: http://lkml.kernel.org/r/1385363701-12387-2-git-send-email-gong.chen@linux.intel.com [ Boris: massage commit message. ] Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index ce3683d93a13..46766ef7ef5d 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -413,27 +413,31 @@ static void ghes_handle_memory_failure(struct acpi_generic_data *gdata, int sev) { #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE unsigned long pfn; + int flags = -1; int sec_sev = ghes_severity(gdata->error_severity); struct cper_sec_mem_err *mem_err; mem_err = (struct cper_sec_mem_err *)(gdata + 1); - if (sec_sev == GHES_SEV_CORRECTED && - (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) && - (mem_err->validation_bits & CPER_MEM_VALID_PA)) { - pfn = mem_err->physical_addr >> PAGE_SHIFT; - if (pfn_valid(pfn)) - memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE); - else if (printk_ratelimit()) - pr_warn(FW_WARN GHES_PFX - "Invalid address in generic error data: %#llx\n", - mem_err->physical_addr); - } - if (sev == GHES_SEV_RECOVERABLE && - sec_sev == GHES_SEV_RECOVERABLE && - mem_err->validation_bits & CPER_MEM_VALID_PA) { - pfn = mem_err->physical_addr >> PAGE_SHIFT; - memory_failure_queue(pfn, 0, 0); + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + pfn = mem_err->physical_addr >> PAGE_SHIFT; + if (!pfn_valid(pfn)) { + pr_warn_ratelimited(FW_WARN GHES_PFX + "Invalid address in generic error data: %#llx\n", + mem_err->physical_addr); + return; } + + /* iff following two events can be handled properly by now */ + if (sec_sev == GHES_SEV_CORRECTED && + (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED)) + flags = MF_SOFT_OFFLINE; + if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) + flags = 0; + + if (flags != -1) + memory_failure_queue(pfn, 0, flags); #endif } -- cgit v1.2.3-55-g7522