From 993ff6d9df74305bc7b5bbc7a0810cf599b6394c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 7 Aug 2018 23:21:56 +1000
Subject: powerpc/64s: Fix PACA_IRQ_HARD_DIS accounting in idle_power4()

When idle_power4() hard disables interrupts then finds a soft pending
interrupt, it returns with interrupts hard disabled but without
PACA_IRQ_HARD_DIS set. Commit 9b81c0211c ("powerpc/64s: make
PACA_IRQ_HARD_DIS track MSR[EE] closely") added a warning for that
condition (since disabled).

Fix this by adding the PACA_IRQ_HARD_DIS for that case.

Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/idle_power4.S | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index dd7471fe20bd..a09b3c7ca176 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -32,6 +32,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	cmpwi	0,r4,0
 	beqlr
 
+	/* This sequence is similar to prep_irq_for_idle() */
+
 	/* Hard disable interrupts */
 	mfmsr	r7
 	rldicl	r0,r7,48,1
@@ -41,10 +43,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	/* Check if something happened while soft-disabled */
 	lbz	r0,PACAIRQHAPPENED(r13)
 	cmpwi	cr0,r0,0
-	bnelr
+	bne-	2f
 
-	/* Soft-enable interrupts */
+	/*
+	 * Soft-enable interrupts. This will make power4_fixup_nap return
+	 * to our caller with interrupts enabled (soft and hard). The caller
+	 * can cope with either interrupts disabled or enabled upon return.
+	 */
 #ifdef CONFIG_TRACE_IRQFLAGS
+	/* Tell the tracer interrupts are on, because idle responds to them. */
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-128(r1)
@@ -73,3 +80,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	isync
 	b	1b
 
+2:	/* Return if an interrupt had happened while soft disabled */
+	/* Set the HARD_DIS flag because interrupts are now hard disabled */
+	ori	r0,r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+	blr
--
cgit v1.2.3-55-g7522

From 997dd26cb3c8b7c9b8765751cc1491ad33b2024f Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Thu, 16 Aug 2018 15:27:47 +1000
Subject: powerpc/traps: Avoid rate limit messages from show unhandled signals

In the recent commit to add an explicit ratelimit state when showing
unhandled signals, commit 35a52a10c3ac ("powerpc/traps: Use an explicit
ratelimit state for show_signal_msg()"), I put the check of
show_unhandled_signals and the ratelimit state before the call to
unhandled_signal() so as to avoid unnecessarily calling the latter when
show_unhandled_signals is false.

However that causes us to check the ratelimit state on every call, so
if we take a lot of *handled* signals that has the effect of making the
ratelimit code print warnings that callbacks have been suppressed when
they haven't.

So rearrange the code so that we check show_unhandled_signals first,
then call unhandled_signal() and finally check the ratelimit state.

Signed-off-by: Michael Ellerman
Reviewed-by: Murilo Opsfelder Araujo
---
 arch/powerpc/kernel/traps.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 070e96f1773a..c85adb858271 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -315,22 +315,21 @@ void user_single_step_siginfo(struct task_struct *tsk,
 	info->si_addr = (void __user *)regs->nip;
 }
 
-static bool show_unhandled_signals_ratelimited(void)
+static void show_signal_msg(int signr, struct pt_regs *regs, int code,
+			    unsigned long addr)
 {
 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
-	return show_unhandled_signals && __ratelimit(&rs);
-}
 
-static void show_signal_msg(int signr, struct pt_regs *regs, int code,
-			    unsigned long addr)
-{
-	if (!show_unhandled_signals_ratelimited())
+	if (!show_unhandled_signals)
 		return;
 
 	if (!unhandled_signal(current, signr))
 		return;
 
+	if (!__ratelimit(&rs))
+		return;
+
 	pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
 		current->comm, current->pid, signame(signr), signr,
 		addr, regs->nip, regs->link, code);
--
cgit v1.2.3-55-g7522
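A note on why the ordering above matters: __ratelimit() is stateful, so
every call consumes part of the burst budget and records a "missed"
event once the budget is exhausted, which is what produces the bogus
"callbacks suppressed" warnings. The sketch below is illustrative only
(the function and parameter names are made up, it is not the kernel's
code); it just shows the shape the fix follows: cheap global gate first,
per-task check next, the stateful ratelimit check last.

#include <linux/ratelimit.h>

/* Illustrative sketch, not the traps.c implementation. */
static bool should_print_msg(bool global_enable, bool per_task_unhandled)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (!global_enable)		/* cheap global knob first */
		return false;
	if (!per_task_unhandled)	/* per-task state next */
		return false;
	return __ratelimit(&rs);	/* stateful, budget-consuming check last */
}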
From 95b861a76c1ded3e89d33a3d9f4552dce22e9875 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 1 May 2018 00:55:58 +1000
Subject: powerpc/powernv: provide a console flush operation for opal hvc driver

Provide the flush hv_op for the opal hvc driver. This will flush the
firmware console buffers without spinning with interrupts disabled.

Cc: Benjamin Herrenschmidt
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/opal.h       |  1 +
 arch/powerpc/platforms/powernv/opal.c | 83 ++++++++++++++++++++++-------------
 drivers/tty/hvc/hvc_opal.c            |  2 +
 3 files changed, 55 insertions(+), 31 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 834e7e29f1e4..ff3866473afe 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -308,6 +308,7 @@ extern void opal_configure_cores(void);
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 extern int opal_put_chars_atomic(uint32_t vtermno, const char *buf, int total_len);
+extern int opal_flush_chars(uint32_t vtermno, bool wait);
 extern int opal_flush_console(uint32_t vtermno);
 
 extern void hvc_opal_init_early(void);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 404c379db168..38fe4087484a 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -370,12 +370,8 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
 		olen = cpu_to_be64(total_len);
 		rc = opal_console_write(vtermno, &olen, data);
 		if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-			if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
+			if (rc == OPAL_BUSY_EVENT)
 				opal_poll_events(NULL);
-			} else if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
-			}
 			written = -EAGAIN;
 			goto out;
 		}
@@ -401,15 +397,6 @@ out:
 	if (atomic)
 		spin_unlock_irqrestore(&opal_write_lock, flags);
 
-	/* In the -EAGAIN case, callers loop, so we have to flush the console
-	 * here in case they have interrupts off (and we don't want to wait
-	 * for async flushing if we can make immediate progress here). If
-	 * necessary the API could be made entirely non-flushing if the
-	 * callers had a ->flush API to use.
-	 */
-	if (written == -EAGAIN)
-		opal_flush_console(vtermno);
-
 	return written;
 }
 
@@ -429,40 +416,74 @@ int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
 	return __opal_put_chars(vtermno, data, total_len, true);
 }
 
-int opal_flush_console(uint32_t vtermno)
+static s64 __opal_flush_console(uint32_t vtermno)
 {
 	s64 rc;
 
 	if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
 		__be64 evt;
 
-		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
 		/*
 		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
 		 * the console can still be flushed by calling the polling
		 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
 		 */
-		do {
-			opal_poll_events(&evt);
-		} while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT);
+		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
+
+		opal_poll_events(&evt);
+		if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
+			return OPAL_SUCCESS;
+		return OPAL_BUSY;
 
-		return OPAL_SUCCESS;
+	} else {
+		rc = opal_console_flush(vtermno);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			rc = OPAL_BUSY;
+		}
+		return rc;
 	}
 
-	do {
-		rc = OPAL_BUSY;
-		while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-			rc = opal_console_flush(vtermno);
-			if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
-				opal_poll_events(NULL);
-			} else if (rc == OPAL_BUSY) {
-				mdelay(OPAL_BUSY_DELAY_MS);
+}
+
+/*
+ * opal_flush_console spins until the console is flushed
+ */
+int opal_flush_console(uint32_t vtermno)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			mdelay(1);
+			continue;
+		}
+
+		return opal_error_code(rc);
+	}
+}
+
+/*
+ * opal_flush_chars is an hvc interface that sleeps until the console is
+ * flushed if wait, otherwise it will return -EBUSY if the console has data,
+ * -EAGAIN if it has data and some of it was flushed.
+ */
+int opal_flush_chars(uint32_t vtermno, bool wait)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			if (wait) {
+				msleep(OPAL_BUSY_DELAY_MS);
+				continue;
 			}
+			if (rc == OPAL_PARTIAL)
+				return -EAGAIN;
 		}
-	} while (rc == OPAL_PARTIAL); /* More to flush */
 
-	return opal_error_code(rc);
+		return opal_error_code(rc);
+	}
 }
 
 static int opal_recover_mce(struct pt_regs *regs,
diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c
index f631f8bee308..77baf895108f 100644
--- a/drivers/tty/hvc/hvc_opal.c
+++ b/drivers/tty/hvc/hvc_opal.c
@@ -52,6 +52,7 @@ static u32 hvc_opal_boot_termno;
 static const struct hv_ops hvc_opal_raw_ops = {
 	.get_chars = opal_get_chars,
 	.put_chars = opal_put_chars,
+	.flush = opal_flush_chars,
 	.notifier_add = notifier_add_irq,
 	.notifier_del = notifier_del_irq,
 	.notifier_hangup = notifier_hangup_irq,
@@ -141,6 +142,7 @@ static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set,
 static const struct hv_ops hvc_opal_hvsi_ops = {
 	.get_chars = hvc_opal_hvsi_get_chars,
 	.put_chars = hvc_opal_hvsi_put_chars,
+	.flush = opal_flush_chars,
 	.notifier_add = hvc_opal_hvsi_open,
 	.notifier_del = hvc_opal_hvsi_close,
 	.notifier_hangup = hvc_opal_hvsi_hangup,
--
cgit v1.2.3-55-g7522

From a58183138cb72059a0c278f8370a47f41dae9afa Mon Sep 17 00:00:00 2001
From: Hari Bathini
Date: Sat, 18 Aug 2018 02:10:56 +0530
Subject: powerpc/fadump: cleanup crash memory ranges support

Commit 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array
index overflow") changed crash memory ranges to a dynamic array that is
reallocated on-demand with krealloc(). The relevant header for that call
was not included. The kernel still compiles without it, but include the
header explicitly to be safe.

Also, the memory allocation logic in fadump_add_crash_memory() takes
care of memory allocation for crash memory ranges in all scenarios.
Drop the unnecessary memory allocation in
fadump_setup_crash_memory_ranges().

Fixes: 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array index overflow")
Cc: Mahesh Salgaonkar
Signed-off-by: Hari Bathini
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/fadump.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 986ec476fd5d..a711d22339ea 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <linux/slab.h>
 #include
 #include
 
@@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void)
 	pr_debug("Setup crash memory ranges.\n");
 	crash_mem_ranges = 0;
 
-	/* allocate memory for crash memory ranges for the first time */
-	if (!max_crash_mem_ranges) {
-		ret = allocate_crash_memory_ranges();
-		if (ret)
-			return ret;
-	}
-
 	/*
	 * add the first memory chunk (RMA_START through boot_memory_size) as
	 * a separate memory chunk. The reason is, at the time crash firmware
--
cgit v1.2.3-55-g7522
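For background, the grow-on-demand pattern the fadump code relies on
(and which makes <linux/slab.h>, the header declaring krealloc(), the
relevant include) looks roughly like the sketch below. This is a
hypothetical illustration rather than the actual fadump helper; the
structure, function and variable names are invented.

#include <linux/errno.h>
#include <linux/slab.h>		/* krealloc() */

/* Illustrative only; not allocate_crash_memory_ranges() itself. */
struct demo_range {
	unsigned long base;
	unsigned long size;
};

static struct demo_range *ranges;
static int max_ranges;		/* slots currently allocated */

static int demo_grow_ranges(int want)
{
	struct demo_range *new;

	if (want <= max_ranges)
		return 0;			/* already big enough */

	new = krealloc(ranges, want * sizeof(*ranges), GFP_KERNEL);
	if (!new)
		return -ENOMEM;			/* old buffer is still valid */

	ranges = new;
	max_ranges = want;
	return 0;
}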
From db2173198b9513f7add8009f225afa1f1c79bcc6 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt
Date: Fri, 17 Aug 2018 17:30:39 +1000
Subject: powerpc/powernv/pci: Work around races in PCI bridge enabling

The generic code is racy when multiple children of a PCI bridge try to
enable it simultaneously. This leads to drivers trying to access a
device through a not-yet-enabled bridge, and thus to EEH errors under
various circumstances when using parallel driver probing.

There is work going on to fix that properly in the PCI core but it will
take some time.

x86 gets away with it because (outside of hotplug), the BIOS enables
all the bridges at boot time.

This patch does the same thing on powernv by enabling all bridges that
have child devices at boot time, thus avoiding subsequent races. It's
suitable for backporting to stable and distros, while the proper PCI
fix will probably be significantly more invasive.

Signed-off-by: Benjamin Herrenschmidt
Cc: stable@vger.kernel.org
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cb5d8ee23acb..463308786b8a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3227,12 +3227,49 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
+static void pnv_pci_enable_bridge(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct pci_bus *child;
+
+	/* Empty bus ? bail */
+	if (list_empty(&bus->devices))
+		return;
+
+	/*
+	 * If there's a bridge associated with that bus enable it. This works
+	 * around races in the generic code if the enabling is done during
+	 * parallel probing. This can be removed once those races have been
+	 * fixed.
+	 */
+	if (dev) {
+		int rc = pci_enable_device(dev);
+		if (rc)
+			pci_err(dev, "Error enabling bridge (%d)\n", rc);
+		pci_set_master(dev);
+	}
+
+	/* Perform the same to child busses */
+	list_for_each_entry(child, &bus->children, node)
+		pnv_pci_enable_bridge(child);
+}
+
+static void pnv_pci_enable_bridges(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node)
+		pnv_pci_enable_bridge(hose->bus);
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
 	pnv_pci_ioda_setup_PEs();
 	pnv_pci_ioda_setup_iommu_api();
 	pnv_pci_ioda_create_dbgfs();
 
+	pnv_pci_enable_bridges();
+
 #ifdef CONFIG_EEH
 	pnv_eeh_post_init();
 #endif
--
cgit v1.2.3-55-g7522

From d6ee76d3d37d156c479348821574b6f99d6472a1 Mon Sep 17 00:00:00 2001
From: Luke Dashjr
Date: Thu, 16 Aug 2018 21:36:26 +0000
Subject: powerpc64/ftrace: Include ftrace.h needed for enable/disable calls

this_cpu_disable_ftrace and this_cpu_enable_ftrace are inlines in
ftrace.h. Without it included, the build fails.

Fixes: a4bc64d305af ("powerpc64/ftrace: Disable ftrace during kvm entry/exit")
Cc: stable@vger.kernel.org # v4.18+
Signed-off-by: Luke Dashjr
Acked-by: Naveen N. Rao
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kvm/book3s_hv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06cffc6446fe..db677a1132fa 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -46,6 +46,7 @@
 #include
 #include
 
+#include <asm/ftrace.h>
 #include
 #include
 #include
--
cgit v1.2.3-55-g7522
From 2ea62630681027c455117aa471ea3ab8bb099ead Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Fri, 17 Aug 2018 20:24:39 +0530
Subject: powerpc/topology: Get topology for shared processors at boot

On a shared LPAR, Phyp will not update the CPU associativity at boot
time. Just after boot the system recognizes itself as a shared LPAR and
triggers a request for the correct CPU associativity, but by then the
scheduler has already created/destroyed its sched domains.

This causes
  - Broken load balance across Nodes causing islands of cores.
  - Performance degradation, especially if the system is lightly loaded.
  - dmesg to wrongly report all CPUs to be in Node 0.
  - Messages in dmesg saying borken topology.
  - With commit 051f3ca02e46 ("sched/topology: Introduce NUMA identity
    node sched domain"), can cause rcu stalls at boot up.

The sched_domains_numa_masks table, which is used to generate cpumasks,
is only created at boot time just before the sched domains are created
and is never updated. Hence, it's better to get the topology correct
before the sched domains are created.

For example, on a 64 core POWER8 shared LPAR, dmesg reports:

  Brought up 512 CPUs
  Node 0 CPUs: 0-511
  Node 1 CPUs:
  Node 2 CPUs:
  Node 3 CPUs:
  Node 4 CPUs:
  Node 5 CPUs:
  Node 6 CPUs:
  Node 7 CPUs:
  Node 8 CPUs:
  Node 9 CPUs:
  Node 10 CPUs:
  Node 11 CPUs:
  ...
  BUG: arch topology borken
       the DIE domain not a subset of the NUMA domain
  BUG: arch topology borken
       the DIE domain not a subset of the NUMA domain

numactl/lscpu output will still be correct with cores spread across
all nodes:

  Socket(s):             64
  NUMA node(s):          12
  Model:                 2.0 (pvr 004d 0200)
  Model name:            POWER8 (architected), altivec supported
  Hypervisor vendor:     pHyp
  Virtualization type:   para
  L1d cache:             64K
  L1i cache:             32K
  NUMA node0 CPU(s):     0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471
  NUMA node1 CPU(s):     8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479
  NUMA node2 CPU(s):     16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487
  NUMA node3 CPU(s):     24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495
  NUMA node4 CPU(s):     208-215,304-311,400-407,496-503
  NUMA node5 CPU(s):     168-175,264-271,360-367,456-463
  NUMA node6 CPU(s):     128-135,224-231,320-327,416-423
  NUMA node7 CPU(s):     136-143,232-239,328-335,424-431
  NUMA node8 CPU(s):     216-223,312-319,408-415,504-511
  NUMA node9 CPU(s):     144-151,240-247,336-343,432-439
  NUMA node10 CPU(s):    152-159,248-255,344-351,440-447
  NUMA node11 CPU(s):    160-167,256-263,352-359,448-455

Currently on this LPAR, the scheduler detects 2 levels of NUMA and
creates NUMA sched domains for all CPUs, but it finds a single DIE
domain consisting of all CPUs. Hence it deletes all NUMA sched domains.

To address this, detect the shared-processor LPAR and update the
topology soon after CPUs are set up, so that the correct topology is in
place just before the scheduler creates the sched domains.

With the fix, dmesg reports:

  numa: Node 0 CPUs: 0-7 32-39 64-71 96-103 176-183 272-279 368-375 464-471
  numa: Node 1 CPUs: 8-15 40-47 72-79 104-111 184-191 280-287 376-383 472-479
  numa: Node 2 CPUs: 16-23 48-55 80-87 112-119 192-199 288-295 384-391 480-487
  numa: Node 3 CPUs: 24-31 56-63 88-95 120-127 200-207 296-303 392-399 488-495
  numa: Node 4 CPUs: 208-215 304-311 400-407 496-503
  numa: Node 5 CPUs: 168-175 264-271 360-367 456-463
  numa: Node 6 CPUs: 128-135 224-231 320-327 416-423
  numa: Node 7 CPUs: 136-143 232-239 328-335 424-431
  numa: Node 8 CPUs: 216-223 312-319 408-415 504-511
  numa: Node 9 CPUs: 144-151 240-247 336-343 432-439
  numa: Node 10 CPUs: 152-159 248-255 344-351 440-447
  numa: Node 11 CPUs: 160-167 256-263 352-359 448-455

and lscpu also reports:

  Socket(s):             64
  NUMA node(s):          12
  Model:                 2.0 (pvr 004d 0200)
  Model name:            POWER8 (architected), altivec supported
  Hypervisor vendor:     pHyp
  Virtualization type:   para
  L1d cache:             64K
  L1i cache:             32K
  NUMA node0 CPU(s):     0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471
  NUMA node1 CPU(s):     8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479
  NUMA node2 CPU(s):     16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487
  NUMA node3 CPU(s):     24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495
  NUMA node4 CPU(s):     208-215,304-311,400-407,496-503
  NUMA node5 CPU(s):     168-175,264-271,360-367,456-463
  NUMA node6 CPU(s):     128-135,224-231,320-327,416-423
  NUMA node7 CPU(s):     136-143,232-239,328-335,424-431
  NUMA node8 CPU(s):     216-223,312-319,408-415,504-511
  NUMA node9 CPU(s):     144-151,240-247,336-343,432-439
  NUMA node10 CPU(s):    152-159,248-255,344-351,440-447
  NUMA node11 CPU(s):    160-167,256-263,352-359,448-455

Reported-by: Manjunatha H R
Signed-off-by: Srikar Dronamraju
[mpe: Trim / format change log]
Tested-by: Michael Ellerman
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/topology.h |  5 +++++
 arch/powerpc/kernel/smp.c           |  5 +++++
 arch/powerpc/mm/numa.c              | 20 ++++++++++----------
 3 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 16b077801a5f..a4a718dbfec6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -92,6 +92,7 @@ extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
 extern int find_and_online_cpu_nid(int cpu);
 extern int timed_topology_update(int nsecs);
+extern void __init shared_proc_topology_init(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs)
 {
 	return 0;
 }
+
+#ifdef CONFIG_SMP
+static inline void shared_proc_topology_init(void) {}
+#endif
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b19d832ef386..61c1fadbc644 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	if (smp_ops && smp_ops->bringup_done)
 		smp_ops->bringup_done();
 
+	/*
+	 * On a shared LPAR, associativity needs to be requested.
+	 * Hence, get numa topology before dumping cpu topology
+	 */
+	shared_proc_topology_init();
 	dump_numa_cpu_topology();
 
 	/*
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0c7e05d89244..35ac5422903a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1078,7 +1078,6 @@ static int prrn_enabled;
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
-static int topology_update_needed;
 
 /*
  * Change polling interval for associativity changes.
@@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked)
 	struct device *dev;
 	int weight, new_nid, i = 0;
 
-	if (!prrn_enabled && !vphn_enabled) {
-		if (!topology_inited)
-			topology_update_needed = 1;
+	if (!prrn_enabled && !vphn_enabled && topology_inited)
 		return 0;
-	}
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 out:
 	kfree(updates);
-	topology_update_needed = 0;
 	return changed;
 }
 
@@ -1551,6 +1546,15 @@ int prrn_is_enabled(void)
 	return prrn_enabled;
 }
 
+void __init shared_proc_topology_init(void)
+{
+	if (lppaca_shared_proc(get_lppaca())) {
+		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+			    nr_cpumask_bits);
+		numa_update_cpu_topology(false);
+	}
+}
+
 static int topology_read(struct seq_file *file, void *v)
 {
 	if (vphn_enabled || prrn_enabled)
@@ -1608,10 +1612,6 @@ static int topology_update_init(void)
 		return -ENOMEM;
 
 	topology_inited = 1;
-	if (topology_update_needed)
-		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-			    nr_cpumask_bits);
-
 	return 0;
 }
 device_initcall(topology_update_init);
--
cgit v1.2.3-55-g7522
From 810e9f86f36f59f1d6f6710220c49afe0c705f38 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 21 Aug 2018 13:03:23 +0000
Subject: powerpc/nohash: fix pte_access_permitted()

Commit 5769beaf180a8 ("powerpc/mm: Add proper pte access check helper
for other platforms") replaced the generic pte_access_permitted() by an
arch specific one.

The generic one is defined as

  (pte_present(pte) && (!(write) || pte_write(pte)))

The arch specific one is open coded, checking that the _PAGE_USER and
_PAGE_WRITE (_PAGE_RW) flags are set, but failing to check that
_PAGE_RO and _PAGE_PRIVILEGED are unset. This leads to a useless test
on targets like the 8xx, which defines _PAGE_RW and _PAGE_USER as 0.

Commit 5fa5b16be5b31 ("powerpc/mm/hugetlb: Use pte_access_permitted for
hugetlb access check") replaced some tests performed with pte helpers
by a call to pte_access_permitted(), leading to the same issue.

This patch rewrites powerpc/nohash pte_access_permitted() using pte
helpers.

Fixes: 5769beaf180a8 ("powerpc/mm: Add proper pte access check helper for other platforms")
Fixes: 5fa5b16be5b31 ("powerpc/mm/hugetlb: Use pte_access_permitted for hugetlb access check")
Cc: stable@vger.kernel.org # v4.15+
Signed-off-by: Christophe Leroy
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/nohash/pgtable.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 2160be2e4339..b321c82b3624 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -51,17 +51,14 @@ static inline int pte_present(pte_t pte)
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-	unsigned long pteval = pte_val(pte);
 	/*
	 * A read-only access is controlled by _PAGE_USER bit.
	 * We have _PAGE_READ set for WRITE and EXECUTE
	 */
-	unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER;
-
-	if (write)
-		need_pte_bits |= _PAGE_WRITE;
+	if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+		return false;
 
-	if ((pteval & need_pte_bits) != need_pte_bits)
+	if (write && !pte_write(pte))
 		return false;
 
 	return true;
--
cgit v1.2.3-55-g7522
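For comparison, the generic fallback quoted in the change log above
boils down to the following. The logic is exactly as quoted; the
surrounding #ifndef guard is shown as it commonly appears in the
generic headers, so treat the exact location and guard as an
assumption rather than part of this patch.

/* Generic fallback, for comparison with the nohash version above. */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
	(pte_present(pte) && (!(write) || pte_write(pte)))
#endif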
From bd0dbb73e01306a1060e56f81e5fe287be936477 Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V
Date: Wed, 22 Aug 2018 22:46:04 +0530
Subject: powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.

When splitting a huge pmd pte, we need to mark the pmd entry invalid.
We can do that by clearing the _PAGE_PRESENT bit. But then that will be
taken as a swap pte. In order to differentiate between the two, use a
software pte bit when invalidating.

For regular pte, due to bd5050e38aec ("powerpc/mm/radix: Change pte
relax sequence to handle nest MMU hang") we need to mark the pte entry
invalid when relaxing access permission. Instead of marking pte_none,
which can result in different page table walk routines possibly
skipping this pte entry, invalidate it but still keep it marked
present.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 676118743a06..13a688fc8cd0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -44,6 +44,16 @@
 
 #define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
 #define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
+/*
+ * We need to mark a pmd pte invalid while splitting. We can do that by clearing
+ * the _PAGE_PRESENT bit. But then that will be taken as a swap pte. In order to
+ * differentiate between the two, use a SW field when invalidating.
+ *
+ * We do that temporary invalidate for regular pte entry in ptep_set_access_flags
+ *
+ * This is used only when _PAGE_PRESENT is cleared.
+ */
+#define _PAGE_INVALID		_RPAGE_SW0
 
 /*
  * Top and bottom bits of RPN which can be used by hash
@@ -568,7 +578,13 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
 
 static inline int pte_present(pte_t pte)
 {
-	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+	/*
+	 * A pte is considered present if _PAGE_PRESENT is set.
+	 * We also need to consider the pte present which is marked
+	 * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
+	 * if we find _PAGE_PRESENT cleared.
+	 */
+	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
 }
 
 #ifdef CONFIG_PPC_MEM_KEYS
--
cgit v1.2.3-55-g7522

From f08d08f3db55452d31ba4a37c702da6245876b96 Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V
Date: Wed, 22 Aug 2018 22:46:05 +0530
Subject: powerpc/mm/radix: Only need the Nest MMU workaround for R -> RW transition

The Nest MMU workaround is only needed for RW upgrades. Avoid doing
that for other PTE updates.

We also avoid clearing the PTE while marking it invalid. This is
because other page table walkers will find this PTE none and can
result in unexpected behaviour due to that. Instead we clear
_PAGE_PRESENT and set the software PTE bit _PAGE_INVALID.
pte_present() is already updated to check for both bits. This makes
sure page table walkers will find the PTE present and things like
pte_pfn(pte) returns the right value.

Based on an original patch from Benjamin Herrenschmidt

Signed-off-by: Aneesh Kumar K.V
Reviewed-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable-radix.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 7be99fd9af15..c879979faa73 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -1045,20 +1045,22 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
 					      _PAGE_RW | _PAGE_EXEC);
+
+	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
 	/*
	 * To avoid NMMU hang while relaxing access, we need mark
	 * the pte invalid in between.
	 */
-	if (atomic_read(&mm->context.copros) > 0) {
+	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
 		unsigned long old_pte, new_pte;
 
-		old_pte = __radix_pte_update(ptep, ~0, 0);
+		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
 		/*
		 * new value of pte
		 */
 		new_pte = old_pte | set;
 		radix__flush_tlb_page_psize(mm, address, psize);
-		__radix_pte_update(ptep, 0, new_pte);
+		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
 	} else {
 		__radix_pte_update(ptep, 0, set);
 		/*
--
cgit v1.2.3-55-g7522

From 8cfbdbdc24815417a3ab35101ccf706b9a23ff17 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Thu, 23 Aug 2018 10:08:58 +1000
Subject: KVM: PPC: Book3S: Fix guest DMA when guest partially backed by THP pages

Commit 76fa4975f3ed ("KVM: PPC: Check if IOMMU page is contained in
the pinned physical page", 2018-07-17) added some checks to ensure
that guest DMA mappings don't attempt to map more than the guest is
entitled to access. However, errors in the logic mean that legitimate
guest requests to map pages for DMA are being denied in some
situations.

Specifically, if the first page of the range passed to mm_iommu_get()
is mapped with a normal page, and subsequent pages are mapped with
transparent huge pages, we end up with mem->pageshift == 0. That means
that the page size checks in mm_iommu_ua_to_hpa() and
mm_iommu_ua_to_hpa_rm() will always fail for every page in that
region, and thus the guest can never map any memory in that region for
DMA, typically leading to a flood of error messages like this:

  qemu-system-ppc64: VFIO_MAP_DMA: -22
  qemu-system-ppc64: vfio_dma_map(0x10005f47780, 0x800000000000000, 0x10000, 0x7fff63ff0000) = -22 (Invalid argument)

The logic errors in mm_iommu_get() are:

  (a) use of 'ua' not 'ua + (i << PAGE_SHIFT)' in the find_linux_pte()
      call (meaning that find_linux_pte() returns the pte for the
      first address in the range, not the address we are currently up
      to);
  (b) use of 'pageshift' as the variable to receive the hugepage shift
      returned by find_linux_pte() - for a normal page this gets set
      to 0, leading to us setting mem->pageshift to 0 when we conclude
      that the pte returned by find_linux_pte() didn't match the page
      we were looking at;
  (c) comparing 'compshift', which is a page order, i.e. log base 2 of
      the number of pages, with 'pageshift', which is a log base 2 of
      the number of bytes.

To fix these problems, this patch introduces 'cur_ua' to hold the
current user address and uses that in the find_linux_pte() call;
introduces 'pteshift' to hold the hugepage shift found by
find_linux_pte(); and compares 'pteshift' with 'compshift +
PAGE_SHIFT' rather than 'compshift'.

The patch also moves the local_irq_restore to the point after the PTE
pointer returned by find_linux_pte() has been dereferenced because
otherwise the PTE could change underneath us, and adds a check to
avoid doing the find_linux_pte() call once mem->pageshift has been
reduced to PAGE_SHIFT, as an optimization.

Fixes: 76fa4975f3ed ("KVM: PPC: Check if IOMMU page is contained in the pinned physical page")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Paul Mackerras
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/mmu_context_iommu.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a4ca57612558..c9ee9e23845f 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -129,6 +129,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 	long i, j, ret = 0, locked_entries = 0;
 	unsigned int pageshift;
 	unsigned long flags;
+	unsigned long cur_ua;
 	struct page *page = NULL;
 
 	mutex_lock(&mem_list_mutex);
@@ -177,7 +178,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 	}
 
 	for (i = 0; i < entries; ++i) {
-		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+		cur_ua = ua + (i << PAGE_SHIFT);
+		if (1 != get_user_pages_fast(cur_ua,
 					1/* pages */, 1/* iswrite */, &page)) {
 			ret = -EFAULT;
 			for (j = 0; j < i; ++j)
@@ -196,7 +198,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		if (is_migrate_cma_page(page)) {
 			if (mm_iommu_move_page_from_cma(page))
 				goto populate;
-			if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+			if (1 != get_user_pages_fast(cur_ua,
 						1/* pages */, 1/* iswrite */,
 						&page)) {
 				ret = -EFAULT;
@@ -210,20 +212,21 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		}
 populate:
 		pageshift = PAGE_SHIFT;
-		if (PageCompound(page)) {
+		if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
 			pte_t *pte;
 			struct page *head = compound_head(page);
 			unsigned int compshift = compound_order(head);
+			unsigned int pteshift;
 
 			local_irq_save(flags); /* disables as well */
-			pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift);
-			local_irq_restore(flags);
+			pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
 
 			/* Double check it is still the same pinned page */
 			if (pte && pte_page(*pte) == head &&
-			    pageshift == compshift)
-				pageshift = max_t(unsigned int, pageshift,
+			    pteshift == compshift + PAGE_SHIFT)
+				pageshift = max_t(unsigned int, pteshift,
 						PAGE_SHIFT);
+			local_irq_restore(flags);
 		}
 		mem->pageshift = min(mem->pageshift, pageshift);
 		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
--
cgit v1.2.3-55-g7522
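To make point (c) above concrete, here is a worked example with
assumed numbers (a 64K base page and a 2MB transparent huge page; the
particular sizes are illustrative, other configurations scale the same
way):

/*
 * Worked example for point (c), assuming 64K base pages (PAGE_SHIFT = 16)
 * and a 2MB transparent huge page backing the mapping:
 *
 *   compshift = compound_order(head) = log2(2MB / 64K) = 5    (a page order)
 *   pteshift  = shift returned by find_linux_pte()     = 21   (log2 of bytes)
 *
 * So the old test, comparing the hugepage shift directly with compshift
 * (21 == 5), can never be true; the corrected test compares
 * pteshift == compshift + PAGE_SHIFT (21 == 5 + 16).
 */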
From 0f52b3a00c789569d7ed822b5a6b30f59a8d4393 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar
Date: Thu, 23 Aug 2018 10:26:08 +0530
Subject: powerpc/mce: Fix SLB rebolting during MCE recovery path.

The commit e7e81847478 ("powerpc/64s: move machine check SLB flushing
to mm/slb.c") introduced a bug in reloading bolted SLB entries. Unused
bolted entries are stored with .esid=0 in the slb_shadow area, and
that value is now used directly as the RB input to slbmte, which means
the RB[52:63] index field is set to 0, which causes SLB entry 0 to be
cleared.

Fix this by storing the index bits in the unused bolted entries, which
directs the slbmte to the right place.

The SLB shadow area is also used by the hypervisor, but PAPR is okay
with that, from LoPAPR v1.1, 14.11.1.3 SLB Shadow Buffer:

  Note: SLB is filled sequentially starting at index 0 from the shadow
  buffer ignoring the contents of RB field bits 52-63

Fixes: e7e81847478b ("powerpc/64s: move machine check SLB flushing to mm/slb.c")
Signed-off-by: Mahesh Salgaonkar
Signed-off-by: Nicholas Piggin
Reviewed-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/slb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 0b095fa54049..9f574e59d178 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -70,7 +70,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
 
 static inline void slb_shadow_clear(enum slb_index index)
 {
-	WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
+	WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
 }
 
 static inline void create_shadowed_slbe(unsigned long ea, int ssize,
--
cgit v1.2.3-55-g7522
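As a rough illustration of why storing only the index is enough: with
the ISA's big-endian bit numbering quoted above, RB[52:63] is the low
12 bits of the 64-bit doubleword, so an "empty" shadow entry of
cpu_to_be64(index) has the ESID and valid bit clear but still steers
slbmte at the intended slot instead of slot 0. The mask and helper
below are a sketch for illustration, not the kernel's actual
mk_esid_data()/slb.c code.

/* Sketch only - field position per the RB[52:63] index field cited above. */
#define SLB_INDEX_MASK	0xfffUL		/* RB bits 52:63 = low 12 bits */

static inline unsigned long slb_rb_for_empty_slot(unsigned long index)
{
	/* ESID and V are zero; only the SLB slot index is kept. */
	return index & SLB_INDEX_MASK;
}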