From 993ff6d9df74305bc7b5bbc7a0810cf599b6394c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 7 Aug 2018 23:21:56 +1000
Subject: powerpc/64s: Fix PACA_IRQ_HARD_DIS accounting in idle_power4()

When idle_power4() hard disables interrupts then finds a soft pending
interrupt, it returns with interrupts hard disabled but without
PACA_IRQ_HARD_DIS set. Commit 9b81c0211c ("powerpc/64s: make
PACA_IRQ_HARD_DIS track MSR[EE] closely") added a warning for that
condition (since disabled).

Fix this by adding the PACA_IRQ_HARD_DIS for that case.

Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/idle_power4.S | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index dd7471fe20bd..a09b3c7ca176 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -32,6 +32,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	cmpwi	0,r4,0
 	beqlr
 
+	/* This sequence is similar to prep_irq_for_idle() */
+
 	/* Hard disable interrupts */
 	mfmsr	r7
 	rldicl	r0,r7,48,1
@@ -41,10 +43,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	/* Check if something happened while soft-disabled */
 	lbz	r0,PACAIRQHAPPENED(r13)
 	cmpwi	cr0,r0,0
-	bnelr
+	bne-	2f
 
-	/* Soft-enable interrupts */
+	/*
+	 * Soft-enable interrupts. This will make power4_fixup_nap return
+	 * to our caller with interrupts enabled (soft and hard). The caller
+	 * can cope with either interrupts disabled or enabled upon return.
+	 */
 #ifdef CONFIG_TRACE_IRQFLAGS
+	/* Tell the tracer interrupts are on, because idle responds to them. */
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-128(r1)
@@ -73,3 +80,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	isync
 	b	1b
 
+2:	/* Return if an interrupt had happened while soft disabled */
+	/* Set the HARD_DIS flag because interrupts are now hard disabled */
+	ori	r0,r0,PACA_IRQ_HARD_DIS
+	stb	r0,PACAIRQHAPPENED(r13)
+	blr
--
cgit v1.2.3-55-g7522

From 997dd26cb3c8b7c9b8765751cc1491ad33b2024f Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Thu, 16 Aug 2018 15:27:47 +1000
Subject: powerpc/traps: Avoid rate limit messages from show unhandled signals

In the recent commit to add an explicit ratelimit state when showing
unhandled signals, commit 35a52a10c3ac ("powerpc/traps: Use an explicit
ratelimit state for show_signal_msg()"), I put the check of
show_unhandled_signals and the ratelimit state before the call to
unhandled_signal() so as to avoid unnecessarily calling the latter when
show_unhandled_signals is false.

However that causes us to check the ratelimit state on every call, so
if we take a lot of *handled* signals that has the effect of making the
ratelimit code print warnings that callbacks have been suppressed when
they haven't.

So rearrange the code so that we check show_unhandled_signals first,
then call unhandled_signal() and finally check the ratelimit state.

Signed-off-by: Michael Ellerman
Reviewed-by: Murilo Opsfelder Araujo
---
 arch/powerpc/kernel/traps.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 070e96f1773a..c85adb858271 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -315,22 +315,21 @@ void user_single_step_siginfo(struct task_struct *tsk,
 	info->si_addr = (void __user *)regs->nip;
 }
 
-static bool show_unhandled_signals_ratelimited(void)
+static void show_signal_msg(int signr, struct pt_regs *regs, int code,
+			    unsigned long addr)
 {
 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
-	return show_unhandled_signals && __ratelimit(&rs);
-}
 
-static void show_signal_msg(int signr, struct pt_regs *regs, int code,
-			    unsigned long addr)
-{
-	if (!show_unhandled_signals_ratelimited())
+	if (!show_unhandled_signals)
 		return;
 
 	if (!unhandled_signal(current, signr))
 		return;
 
+	if (!__ratelimit(&rs))
+		return;
+
 	pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
 		current->comm, current->pid, signame(signr), signr,
 		addr, regs->nip, regs->link, code);
--
cgit v1.2.3-55-g7522
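A note on why the ordering above matters: __ratelimit() is stateful, so
every call consumes part of the burst budget and records a "missed"
event once the budget is exhausted, which is what produces the bogus
"callbacks suppressed" warnings. The sketch below is illustrative only
(the function and parameter names are made up, it is not the kernel's
code); it just shows the shape the fix follows: cheap global gate first,
per-task check next, the stateful ratelimit check last.

#include <linux/ratelimit.h>

/* Illustrative sketch, not the traps.c implementation. */
static bool should_print_msg(bool global_enable, bool per_task_unhandled)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (!global_enable)		/* cheap global knob first */
		return false;
	if (!per_task_unhandled)	/* per-task state next */
		return false;
	return __ratelimit(&rs);	/* stateful, budget-consuming check last */
}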
From 95b861a76c1ded3e89d33a3d9f4552dce22e9875 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 1 May 2018 00:55:58 +1000
Subject: powerpc/powernv: provide a console flush operation for opal hvc driver

Provide the flush hv_op for the opal hvc driver. This will flush the
firmware console buffers without spinning with interrupts disabled.

Cc: Benjamin Herrenschmidt
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/opal.h       |  1 +
 arch/powerpc/platforms/powernv/opal.c | 83 ++++++++++++++++++++++-------------
 drivers/tty/hvc/hvc_opal.c            |  2 +
 3 files changed, 55 insertions(+), 31 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 834e7e29f1e4..ff3866473afe 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -308,6 +308,7 @@ extern void opal_configure_cores(void);
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 extern int opal_put_chars_atomic(uint32_t vtermno, const char *buf, int total_len);
+extern int opal_flush_chars(uint32_t vtermno, bool wait);
 extern int opal_flush_console(uint32_t vtermno);
 
 extern void hvc_opal_init_early(void);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 404c379db168..38fe4087484a 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -370,12 +370,8 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
 		olen = cpu_to_be64(total_len);
 		rc = opal_console_write(vtermno, &olen, data);
 		if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-			if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
+			if (rc == OPAL_BUSY_EVENT)
 				opal_poll_events(NULL);
-			} else if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
-			}
 			written = -EAGAIN;
 			goto out;
 		}
@@ -401,15 +397,6 @@ out:
 	if (atomic)
 		spin_unlock_irqrestore(&opal_write_lock, flags);
 
-	/* In the -EAGAIN case, callers loop, so we have to flush the console
-	 * here in case they have interrupts off (and we don't want to wait
-	 * for async flushing if we can make immediate progress here). If
-	 * necessary the API could be made entirely non-flushing if the
-	 * callers had a ->flush API to use.
-	 */
-	if (written == -EAGAIN)
-		opal_flush_console(vtermno);
-
 	return written;
 }
 
@@ -429,40 +416,74 @@ int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
 	return __opal_put_chars(vtermno, data, total_len, true);
 }
 
-int opal_flush_console(uint32_t vtermno)
+static s64 __opal_flush_console(uint32_t vtermno)
 {
 	s64 rc;
 
 	if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
 		__be64 evt;
 
-		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
 		/*
 		 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
 		 * the console can still be flushed by calling the polling
		 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
 		 */
-		do {
-			opal_poll_events(&evt);
-		} while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT);
+		WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
+
+		opal_poll_events(&evt);
+		if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
+			return OPAL_SUCCESS;
+		return OPAL_BUSY;
 
-		return OPAL_SUCCESS;
+	} else {
+		rc = opal_console_flush(vtermno);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			rc = OPAL_BUSY;
+		}
+		return rc;
 	}
 
-	do {
-		rc = OPAL_BUSY;
-		while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
-			rc = opal_console_flush(vtermno);
-			if (rc == OPAL_BUSY_EVENT) {
-				mdelay(OPAL_BUSY_DELAY_MS);
-				opal_poll_events(NULL);
-			} else if (rc == OPAL_BUSY) {
-				mdelay(OPAL_BUSY_DELAY_MS);
+}
+
+/*
+ * opal_flush_console spins until the console is flushed
+ */
+int opal_flush_console(uint32_t vtermno)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			mdelay(1);
+			continue;
+		}
+
+		return opal_error_code(rc);
+	}
+}
+
+/*
+ * opal_flush_chars is an hvc interface that sleeps until the console is
+ * flushed if wait, otherwise it will return -EBUSY if the console has data,
+ * -EAGAIN if it has data and some of it was flushed.
+ */
+int opal_flush_chars(uint32_t vtermno, bool wait)
+{
+	for (;;) {
+		s64 rc = __opal_flush_console(vtermno);
+
+		if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
+			if (wait) {
+				msleep(OPAL_BUSY_DELAY_MS);
+				continue;
 			}
+			if (rc == OPAL_PARTIAL)
+				return -EAGAIN;
 		}
-	} while (rc == OPAL_PARTIAL); /* More to flush */
 
-	return opal_error_code(rc);
+		return opal_error_code(rc);
+	}
 }
 
 static int opal_recover_mce(struct pt_regs *regs,
diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c
index f631f8bee308..77baf895108f 100644
--- a/drivers/tty/hvc/hvc_opal.c
+++ b/drivers/tty/hvc/hvc_opal.c
@@ -52,6 +52,7 @@ static u32 hvc_opal_boot_termno;
 static const struct hv_ops hvc_opal_raw_ops = {
 	.get_chars = opal_get_chars,
 	.put_chars = opal_put_chars,
+	.flush = opal_flush_chars,
 	.notifier_add = notifier_add_irq,
 	.notifier_del = notifier_del_irq,
 	.notifier_hangup = notifier_hangup_irq,
@@ -141,6 +142,7 @@ static int hvc_opal_hvsi_tiocmset(struct hvc_struct *hp, unsigned int set,
 static const struct hv_ops hvc_opal_hvsi_ops = {
 	.get_chars = hvc_opal_hvsi_get_chars,
 	.put_chars = hvc_opal_hvsi_put_chars,
+	.flush = opal_flush_chars,
 	.notifier_add = hvc_opal_hvsi_open,
 	.notifier_del = hvc_opal_hvsi_close,
 	.notifier_hangup = hvc_opal_hvsi_hangup,
--
cgit v1.2.3-55-g7522

From a58183138cb72059a0c278f8370a47f41dae9afa Mon Sep 17 00:00:00 2001
From: Hari Bathini
Date: Sat, 18 Aug 2018 02:10:56 +0530
Subject: powerpc/fadump: cleanup crash memory ranges support

Commit 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array
index overflow") changed crash memory ranges to a dynamic array that is
reallocated on-demand with krealloc(). The relevant header for that call
was not included. The kernel still compiles without it, but include the
header explicitly to be safe.

Also, the memory allocation logic in fadump_add_crash_memory() takes
care of memory allocation for crash memory ranges in all scenarios.
Drop the unnecessary memory allocation in
fadump_setup_crash_memory_ranges().

Fixes: 1bd6a1c4b80a ("powerpc/fadump: handle crash memory ranges array index overflow")
Cc: Mahesh Salgaonkar
Signed-off-by: Hari Bathini
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/fadump.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 986ec476fd5d..a711d22339ea 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <linux/slab.h>
 #include
 #include
 
@@ -1019,13 +1020,6 @@ static int fadump_setup_crash_memory_ranges(void)
 	pr_debug("Setup crash memory ranges.\n");
 	crash_mem_ranges = 0;
 
-	/* allocate memory for crash memory ranges for the first time */
-	if (!max_crash_mem_ranges) {
-		ret = allocate_crash_memory_ranges();
-		if (ret)
-			return ret;
-	}
-
 	/*
	 * add the first memory chunk (RMA_START through boot_memory_size) as
	 * a separate memory chunk. The reason is, at the time crash firmware
--
cgit v1.2.3-55-g7522
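For background, the grow-on-demand pattern the fadump code relies on
(and which makes <linux/slab.h>, the header declaring krealloc(), the
relevant include) looks roughly like the sketch below. This is a
hypothetical illustration rather than the actual fadump helper; the
structure, function and variable names are invented.

#include <linux/errno.h>
#include <linux/slab.h>		/* krealloc() */

/* Illustrative only; not allocate_crash_memory_ranges() itself. */
struct demo_range {
	unsigned long base;
	unsigned long size;
};

static struct demo_range *ranges;
static int max_ranges;		/* slots currently allocated */

static int demo_grow_ranges(int want)
{
	struct demo_range *new;

	if (want <= max_ranges)
		return 0;			/* already big enough */

	new = krealloc(ranges, want * sizeof(*ranges), GFP_KERNEL);
	if (!new)
		return -ENOMEM;			/* old buffer is still valid */

	ranges = new;
	max_ranges = want;
	return 0;
}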
From db2173198b9513f7add8009f225afa1f1c79bcc6 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt
Date: Fri, 17 Aug 2018 17:30:39 +1000
Subject: powerpc/powernv/pci: Work around races in PCI bridge enabling

The generic code is racy when multiple children of a PCI bridge try to
enable it simultaneously. This leads to drivers trying to access a
device through a not-yet-enabled bridge, and thus to EEH errors under
various circumstances when using parallel driver probing.

There is work going on to fix that properly in the PCI core but it will
take some time.

x86 gets away with it because (outside of hotplug), the BIOS enables
all the bridges at boot time.

This patch does the same thing on powernv by enabling all bridges that
have child devices at boot time, thus avoiding subsequent races. It's
suitable for backporting to stable and distros, while the proper PCI
fix will probably be significantly more invasive.

Signed-off-by: Benjamin Herrenschmidt
Cc: stable@vger.kernel.org
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cb5d8ee23acb..463308786b8a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3227,12 +3227,49 @@ static void pnv_pci_ioda_create_dbgfs(void)
 #endif /* CONFIG_DEBUG_FS */
 }
 
+static void pnv_pci_enable_bridge(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct pci_bus *child;
+
+	/* Empty bus ? bail */
+	if (list_empty(&bus->devices))
+		return;
+
+	/*
+	 * If there's a bridge associated with that bus enable it. This works
+	 * around races in the generic code if the enabling is done during
+	 * parallel probing. This can be removed once those races have been
+	 * fixed.
+	 */
+	if (dev) {
+		int rc = pci_enable_device(dev);
+		if (rc)
+			pci_err(dev, "Error enabling bridge (%d)\n", rc);
+		pci_set_master(dev);
+	}
+
+	/* Perform the same to child busses */
+	list_for_each_entry(child, &bus->children, node)
+		pnv_pci_enable_bridge(child);
+}
+
+static void pnv_pci_enable_bridges(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node)
+		pnv_pci_enable_bridge(hose->bus);
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
 	pnv_pci_ioda_setup_PEs();
 	pnv_pci_ioda_setup_iommu_api();
 	pnv_pci_ioda_create_dbgfs();
 
+	pnv_pci_enable_bridges();
+
 #ifdef CONFIG_EEH
 	pnv_eeh_post_init();
 #endif
--
cgit v1.2.3-55-g7522

From d6ee76d3d37d156c479348821574b6f99d6472a1 Mon Sep 17 00:00:00 2001
From: Luke Dashjr
Date: Thu, 16 Aug 2018 21:36:26 +0000
Subject: powerpc64/ftrace: Include ftrace.h needed for enable/disable calls

this_cpu_disable_ftrace and this_cpu_enable_ftrace are inlines in
ftrace.h. Without it included, the build fails.

Fixes: a4bc64d305af ("powerpc64/ftrace: Disable ftrace during kvm entry/exit")
Cc: stable@vger.kernel.org # v4.18+
Signed-off-by: Luke Dashjr
Acked-by: Naveen N. Rao
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kvm/book3s_hv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06cffc6446fe..db677a1132fa 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -46,6 +46,7 @@
 #include
 #include
 
+#include <asm/ftrace.h>
 #include
 #include
 #include
--
cgit v1.2.3-55-g7522
From 2ea62630681027c455117aa471ea3ab8bb099ead Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Fri, 17 Aug 2018 20:24:39 +0530
Subject: powerpc/topology: Get topology for shared processors at boot

On a shared LPAR, Phyp will not update the CPU associativity at boot
time. Just after boot the system recognizes itself as a shared LPAR and
triggers a request for the correct CPU associativity, but by then the
scheduler has already created/destroyed its sched domains.

This causes
  - Broken load balance across Nodes causing islands of cores.
  - Performance degradation, especially if the system is lightly loaded.
  - dmesg to wrongly report all CPUs to be in Node 0.
  - Messages in dmesg saying borken topology.
  - With commit 051f3ca02e46 ("sched/topology: Introduce NUMA identity
    node sched domain"), can cause rcu stalls at boot up.

The sched_domains_numa_masks table, which is used to generate cpumasks,
is only created at boot time just before the sched domains are created
and is never updated. Hence, it's better to get the topology correct
before the sched domains are created.

For example, on a 64 core POWER8 shared LPAR, dmesg reports:

  Brought up 512 CPUs
  Node 0 CPUs: 0-511
  Node 1 CPUs:
  Node 2 CPUs:
  Node 3 CPUs:
  Node 4 CPUs:
  Node 5 CPUs:
  Node 6 CPUs:
  Node 7 CPUs:
  Node 8 CPUs:
  Node 9 CPUs:
  Node 10 CPUs:
  Node 11 CPUs:
  ...
  BUG: arch topology borken
       the DIE domain not a subset of the NUMA domain
  BUG: arch topology borken
       the DIE domain not a subset of the NUMA domain

numactl/lscpu output will still be correct with cores spread across
all nodes:

  Socket(s):             64
  NUMA node(s):          12
  Model:                 2.0 (pvr 004d 0200)
  Model name:            POWER8 (architected), altivec supported
  Hypervisor vendor:     pHyp
  Virtualization type:   para
  L1d cache:             64K
  L1i cache:             32K
  NUMA node0 CPU(s):     0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471
  NUMA node1 CPU(s):     8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479
  NUMA node2 CPU(s):     16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487
  NUMA node3 CPU(s):     24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495
  NUMA node4 CPU(s):     208-215,304-311,400-407,496-503
  NUMA node5 CPU(s):     168-175,264-271,360-367,456-463
  NUMA node6 CPU(s):     128-135,224-231,320-327,416-423
  NUMA node7 CPU(s):     136-143,232-239,328-335,424-431
  NUMA node8 CPU(s):     216-223,312-319,408-415,504-511
  NUMA node9 CPU(s):     144-151,240-247,336-343,432-439
  NUMA node10 CPU(s):    152-159,248-255,344-351,440-447
  NUMA node11 CPU(s):    160-167,256-263,352-359,448-455

Currently on this LPAR, the scheduler detects 2 levels of NUMA and
creates NUMA sched domains for all CPUs, but it finds a single DIE
domain consisting of all CPUs. Hence it deletes all NUMA sched domains.

To address this, detect the shared-processor LPAR and update the
topology soon after CPUs are set up, so that the correct topology is in
place just before the scheduler creates the sched domains.

With the fix, dmesg reports:

  numa: Node 0 CPUs: 0-7 32-39 64-71 96-103 176-183 272-279 368-375 464-471
  numa: Node 1 CPUs: 8-15 40-47 72-79 104-111 184-191 280-287 376-383 472-479
  numa: Node 2 CPUs: 16-23 48-55 80-87 112-119 192-199 288-295 384-391 480-487
  numa: Node 3 CPUs: 24-31 56-63 88-95 120-127 200-207 296-303 392-399 488-495
  numa: Node 4 CPUs: 208-215 304-311 400-407 496-503
  numa: Node 5 CPUs: 168-175 264-271 360-367 456-463
  numa: Node 6 CPUs: 128-135 224-231 320-327 416-423
  numa: Node 7 CPUs: 136-143 232-239 328-335 424-431
  numa: Node 8 CPUs: 216-223 312-319 408-415 504-511
  numa: Node 9 CPUs: 144-151 240-247 336-343 432-439
  numa: Node 10 CPUs: 152-159 248-255 344-351 440-447
  numa: Node 11 CPUs: 160-167 256-263 352-359 448-455

and lscpu also reports:

  Socket(s):             64
  NUMA node(s):          12
  Model:                 2.0 (pvr 004d 0200)
  Model name:            POWER8 (architected), altivec supported
  Hypervisor vendor:     pHyp
  Virtualization type:   para
  L1d cache:             64K
  L1i cache:             32K
  NUMA node0 CPU(s):     0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471
  NUMA node1 CPU(s):     8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479
  NUMA node2 CPU(s):     16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487
  NUMA node3 CPU(s):     24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495
  NUMA node4 CPU(s):     208-215,304-311,400-407,496-503
  NUMA node5 CPU(s):     168-175,264-271,360-367,456-463
  NUMA node6 CPU(s):     128-135,224-231,320-327,416-423
  NUMA node7 CPU(s):     136-143,232-239,328-335,424-431
  NUMA node8 CPU(s):     216-223,312-319,408-415,504-511
  NUMA node9 CPU(s):     144-151,240-247,336-343,432-439
  NUMA node10 CPU(s):    152-159,248-255,344-351,440-447
  NUMA node11 CPU(s):    160-167,256-263,352-359,448-455

Reported-by: Manjunatha H R
Signed-off-by: Srikar Dronamraju
[mpe: Trim / format change log]
Tested-by: Michael Ellerman
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/topology.h |  5 +++++
 arch/powerpc/kernel/smp.c           |  5 +++++
 arch/powerpc/mm/numa.c              | 20 ++++++++++----------
 3 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 16b077801a5f..a4a718dbfec6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -92,6 +92,7 @@ extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
 extern int find_and_online_cpu_nid(int cpu);
 extern int timed_topology_update(int nsecs);
+extern void __init shared_proc_topology_init(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs)
 {
 	return 0;
 }
+
+#ifdef CONFIG_SMP
+static inline void shared_proc_topology_init(void) {}
+#endif
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b19d832ef386..61c1fadbc644 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	if (smp_ops && smp_ops->bringup_done)
 		smp_ops->bringup_done();
 
+	/*
+	 * On a shared LPAR, associativity needs to be requested.
+	 * Hence, get numa topology before dumping cpu topology
+	 */
+	shared_proc_topology_init();
 	dump_numa_cpu_topology();
 
 	/*
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0c7e05d89244..35ac5422903a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1078,7 +1078,6 @@ static int prrn_enabled;
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
-static int topology_update_needed;
 
 /*
  * Change polling interval for associativity changes.
@@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked)
 	struct device *dev;
 	int weight, new_nid, i = 0;
 
-	if (!prrn_enabled && !vphn_enabled) {
-		if (!topology_inited)
-			topology_update_needed = 1;
+	if (!prrn_enabled && !vphn_enabled && topology_inited)
 		return 0;
-	}
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 out:
 	kfree(updates);
-	topology_update_needed = 0;
 	return changed;
 }
 
@@ -1551,6 +1546,15 @@ int prrn_is_enabled(void)
 	return prrn_enabled;
 }
 
+void __init shared_proc_topology_init(void)
+{
+	if (lppaca_shared_proc(get_lppaca())) {
+		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+			    nr_cpumask_bits);
+		numa_update_cpu_topology(false);
+	}
+}
+
 static int topology_read(struct seq_file *file, void *v)
 {
 	if (vphn_enabled || prrn_enabled)
@@ -1608,10 +1612,6 @@ static int topology_update_init(void)
 		return -ENOMEM;
 
 	topology_inited = 1;
-	if (topology_update_needed)
-		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-			    nr_cpumask_bits);
-
 	return 0;
 }
 device_initcall(topology_update_init);
--
cgit v1.2.3-55-g7522
From 810e9f86f36f59f1d6f6710220c49afe0c705f38 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 21 Aug 2018 13:03:23 +0000
Subject: powerpc/nohash: fix pte_access_permitted()

Commit 5769beaf180a8 ("powerpc/mm: Add proper pte access check helper
for other platforms") replaced the generic pte_access_permitted() by an
arch specific one.

The generic one is defined as

  (pte_present(pte) && (!(write) || pte_write(pte)))

The arch specific one is open coded, checking that the _PAGE_USER and
_PAGE_WRITE (_PAGE_RW) flags are set, but failing to check that
_PAGE_RO and _PAGE_PRIVILEGED are unset. This leads to a useless test
on targets like the 8xx, which defines _PAGE_RW and _PAGE_USER as 0.

Commit 5fa5b16be5b31 ("powerpc/mm/hugetlb: Use pte_access_permitted for
hugetlb access check") replaced some tests performed with pte helpers
by a call to pte_access_permitted(), leading to the same issue.

This patch rewrites powerpc/nohash pte_access_permitted() using pte
helpers.

Fixes: 5769beaf180a8 ("powerpc/mm: Add proper pte access check helper for other platforms")
Fixes: 5fa5b16be5b31 ("powerpc/mm/hugetlb: Use pte_access_permitted for hugetlb access check")
Cc: stable@vger.kernel.org # v4.15+
Signed-off-by: Christophe Leroy
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/nohash/pgtable.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 2160be2e4339..b321c82b3624 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -51,17 +51,14 @@ static inline int pte_present(pte_t pte)
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-	unsigned long pteval = pte_val(pte);
 	/*
	 * A read-only access is controlled by _PAGE_USER bit.
	 * We have _PAGE_READ set for WRITE and EXECUTE
	 */
-	unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER;
-
-	if (write)
-		need_pte_bits |= _PAGE_WRITE;
+	if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+		return false;
 
-	if ((pteval & need_pte_bits) != need_pte_bits)
+	if (write && !pte_write(pte))
 		return false;
 
 	return true;
--
cgit v1.2.3-55-g7522
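For comparison, the generic fallback quoted in the change log above
boils down to the following. The logic is exactly as quoted; the
surrounding #ifndef guard is shown as it commonly appears in the
generic headers, so treat the exact location and guard as an
assumption rather than part of this patch.

/* Generic fallback, for comparison with the nohash version above. */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
	(pte_present(pte) && (!(write) || pte_write(pte)))
#endif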
From bd0dbb73e01306a1060e56f81e5fe287be936477 Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V
Date: Wed, 22 Aug 2018 22:46:04 +0530
Subject: powerpc/mm/books3s: Add new pte bit to mark pte temporarily invalid.

When splitting a huge pmd pte, we need to mark the pmd entry invalid.
We can do that by clearing the _PAGE_PRESENT bit. But then that will be
taken as a swap pte. In order to differentiate between the two, use a
software pte bit when invalidating.

For regular pte, due to bd5050e38aec ("powerpc/mm/radix: Change pte
relax sequence to handle nest MMU hang") we need to mark the pte entry
invalid when relaxing access permission. Instead of marking pte_none,
which can result in different page table walk routines possibly
skipping this pte entry, invalidate it but still keep it marked
present.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 676118743a06..13a688fc8cd0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -44,6 +44,16 @@
 
 #define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
 #define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
+/*
+ * We need to mark a pmd pte invalid while splitting. We can do that by clearing
+ * the _PAGE_PRESENT bit. But then that will be taken as a swap pte. In order to
+ * differentiate between the two, use a SW field when invalidating.
+ *
+ * We do that temporary invalidate for regular pte entry in ptep_set_access_flags
+ *
+ * This is used only when _PAGE_PRESENT is cleared.
+ */
+#define _PAGE_INVALID		_RPAGE_SW0
 
 /*
  * Top and bottom bits of RPN which can be used by hash
@@ -568,7 +578,13 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
 
 static inline int pte_present(pte_t pte)
 {
-	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+	/*
+	 * A pte is considered present if _PAGE_PRESENT is set.
+	 * We also need to consider the pte present which is marked
+	 * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
+	 * if we find _PAGE_PRESENT cleared.
+	 */
+	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
 }
 
 #ifdef CONFIG_PPC_MEM_KEYS
--
cgit v1.2.3-55-g7522

From f08d08f3db55452d31ba4a37c702da6245876b96 Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V
Date: Wed, 22 Aug 2018 22:46:05 +0530
Subject: powerpc/mm/radix: Only need the Nest MMU workaround for R -> RW transition

The Nest MMU workaround is only needed for RW upgrades. Avoid doing
that for other PTE updates.

We also avoid clearing the PTE while marking it invalid. This is
because other page table walkers will find this PTE none and can
result in unexpected behaviour due to that. Instead we clear
_PAGE_PRESENT and set the software PTE bit _PAGE_INVALID.
pte_present() is already updated to check for both bits. This makes
sure page table walkers will find the PTE present and things like
pte_pfn(pte) returns the right value.

Based on an original patch from Benjamin Herrenschmidt

Signed-off-by: Aneesh Kumar K.V
Reviewed-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable-radix.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 7be99fd9af15..c879979faa73 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -1045,20 +1045,22 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
 					      _PAGE_RW | _PAGE_EXEC);
+
+	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
 	/*
	 * To avoid NMMU hang while relaxing access, we need mark
	 * the pte invalid in between.
	 */
-	if (atomic_read(&mm->context.copros) > 0) {
+	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
 		unsigned long old_pte, new_pte;
 
-		old_pte = __radix_pte_update(ptep, ~0, 0);
+		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
 		/*
		 * new value of pte
		 */
 		new_pte = old_pte | set;
 		radix__flush_tlb_page_psize(mm, address, psize);
-		__radix_pte_update(ptep, 0, new_pte);
+		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
 	} else {
 		__radix_pte_update(ptep, 0, set);
 		/*
--
cgit v1.2.3-55-g7522

From 8cfbdbdc24815417a3ab35101ccf706b9a23ff17 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Thu, 23 Aug 2018 10:08:58 +1000
Subject: KVM: PPC: Book3S: Fix guest DMA when guest partially backed by THP pages

Commit 76fa4975f3ed ("KVM: PPC: Check if IOMMU page is contained in
the pinned physical page", 2018-07-17) added some checks to ensure
that guest DMA mappings don't attempt to map more than the guest is
entitled to access. However, errors in the logic mean that legitimate
guest requests to map pages for DMA are being denied in some
situations.

Specifically, if the first page of the range passed to mm_iommu_get()
is mapped with a normal page, and subsequent pages are mapped with
transparent huge pages, we end up with mem->pageshift == 0. That means
that the page size checks in mm_iommu_ua_to_hpa() and
mm_iommu_ua_to_hpa_rm() will always fail for every page in that
region, and thus the guest can never map any memory in that region for
DMA, typically leading to a flood of error messages like this:

  qemu-system-ppc64: VFIO_MAP_DMA: -22
  qemu-system-ppc64: vfio_dma_map(0x10005f47780, 0x800000000000000, 0x10000, 0x7fff63ff0000) = -22 (Invalid argument)

The logic errors in mm_iommu_get() are:

  (a) use of 'ua' not 'ua + (i << PAGE_SHIFT)' in the find_linux_pte()
      call (meaning that find_linux_pte() returns the pte for the
      first address in the range, not the address we are currently up
      to);
  (b) use of 'pageshift' as the variable to receive the hugepage shift
      returned by find_linux_pte() - for a normal page this gets set
      to 0, leading to us setting mem->pageshift to 0 when we conclude
      that the pte returned by find_linux_pte() didn't match the page
      we were looking at;
  (c) comparing 'compshift', which is a page order, i.e. log base 2 of
      the number of pages, with 'pageshift', which is a log base 2 of
      the number of bytes.

To fix these problems, this patch introduces 'cur_ua' to hold the
current user address and uses that in the find_linux_pte() call;
introduces 'pteshift' to hold the hugepage shift found by
find_linux_pte(); and compares 'pteshift' with 'compshift +
PAGE_SHIFT' rather than 'compshift'.

The patch also moves the local_irq_restore to the point after the PTE
pointer returned by find_linux_pte() has been dereferenced because
otherwise the PTE could change underneath us, and adds a check to
avoid doing the find_linux_pte() call once mem->pageshift has been
reduced to PAGE_SHIFT, as an optimization.

Fixes: 76fa4975f3ed ("KVM: PPC: Check if IOMMU page is contained in the pinned physical page")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Paul Mackerras
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/mmu_context_iommu.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a4ca57612558..c9ee9e23845f 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -129,6 +129,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 	long i, j, ret = 0, locked_entries = 0;
 	unsigned int pageshift;
 	unsigned long flags;
+	unsigned long cur_ua;
 	struct page *page = NULL;
 
 	mutex_lock(&mem_list_mutex);
@@ -177,7 +178,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 	}
 
 	for (i = 0; i < entries; ++i) {
-		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+		cur_ua = ua + (i << PAGE_SHIFT);
+		if (1 != get_user_pages_fast(cur_ua,
 					1/* pages */, 1/* iswrite */, &page)) {
 			ret = -EFAULT;
 			for (j = 0; j < i; ++j)
@@ -196,7 +198,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		if (is_migrate_cma_page(page)) {
 			if (mm_iommu_move_page_from_cma(page))
 				goto populate;
-			if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+			if (1 != get_user_pages_fast(cur_ua,
 						1/* pages */, 1/* iswrite */,
 						&page)) {
 				ret = -EFAULT;
@@ -210,20 +212,21 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		}
 populate:
 		pageshift = PAGE_SHIFT;
-		if (PageCompound(page)) {
+		if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
 			pte_t *pte;
 			struct page *head = compound_head(page);
 			unsigned int compshift = compound_order(head);
+			unsigned int pteshift;
 
 			local_irq_save(flags); /* disables as well */
-			pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift);
-			local_irq_restore(flags);
+			pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
 
 			/* Double check it is still the same pinned page */
 			if (pte && pte_page(*pte) == head &&
-			    pageshift == compshift)
-				pageshift = max_t(unsigned int, pageshift,
+			    pteshift == compshift + PAGE_SHIFT)
+				pageshift = max_t(unsigned int, pteshift,
 						PAGE_SHIFT);
+			local_irq_restore(flags);
 		}
 		mem->pageshift = min(mem->pageshift, pageshift);
 		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
--
cgit v1.2.3-55-g7522
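To make point (c) above concrete, here is a worked example with
assumed numbers (a 64K base page and a 2MB transparent huge page; the
particular sizes are illustrative, other configurations scale the same
way):

/*
 * Worked example for point (c), assuming 64K base pages (PAGE_SHIFT = 16)
 * and a 2MB transparent huge page backing the mapping:
 *
 *   compshift = compound_order(head) = log2(2MB / 64K) = 5    (a page order)
 *   pteshift  = shift returned by find_linux_pte()     = 21   (log2 of bytes)
 *
 * So the old test, comparing the hugepage shift directly with compshift
 * (21 == 5), can never be true; the corrected test compares
 * pteshift == compshift + PAGE_SHIFT (21 == 5 + 16).
 */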
From 0f52b3a00c789569d7ed822b5a6b30f59a8d4393 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar
Date: Thu, 23 Aug 2018 10:26:08 +0530
Subject: powerpc/mce: Fix SLB rebolting during MCE recovery path.

The commit e7e81847478 ("powerpc/64s: move machine check SLB flushing
to mm/slb.c") introduced a bug in reloading bolted SLB entries. Unused
bolted entries are stored with .esid=0 in the slb_shadow area, and
that value is now used directly as the RB input to slbmte, which means
the RB[52:63] index field is set to 0, which causes SLB entry 0 to be
cleared.

Fix this by storing the index bits in the unused bolted entries, which
directs the slbmte to the right place.

The SLB shadow area is also used by the hypervisor, but PAPR is okay
with that, from LoPAPR v1.1, 14.11.1.3 SLB Shadow Buffer:

  Note: SLB is filled sequentially starting at index 0 from the shadow
  buffer ignoring the contents of RB field bits 52-63

Fixes: e7e81847478b ("powerpc/64s: move machine check SLB flushing to mm/slb.c")
Signed-off-by: Mahesh Salgaonkar
Signed-off-by: Nicholas Piggin
Reviewed-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/slb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 0b095fa54049..9f574e59d178 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -70,7 +70,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
 
 static inline void slb_shadow_clear(enum slb_index index)
 {
-	WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
+	WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
 }
 
 static inline void create_shadowed_slbe(unsigned long ea, int ssize,
--
cgit v1.2.3-55-g7522
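As a rough illustration of why storing only the index is enough: with
the ISA's big-endian bit numbering quoted above, RB[52:63] is the low
12 bits of the 64-bit doubleword, so an "empty" shadow entry of
cpu_to_be64(index) has the ESID and valid bit clear but still steers
slbmte at the intended slot instead of slot 0. The mask and helper
below are a sketch for illustration, not the kernel's actual
mk_esid_data()/slb.c code.

/* Sketch only - field position per the RB[52:63] index field cited above. */
#define SLB_INDEX_MASK	0xfffUL		/* RB bits 52:63 = low 12 bits */

static inline unsigned long slb_rb_for_empty_slot(unsigned long index)
{
	/* ESID and V are zero; only the SLB slot index is kept. */
	return index & SLB_INDEX_MASK;
}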