11 files changed, 87 insertions, 764 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 9ade4489f415..620a986209f5 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * The file intends to implement the platform dependent EEH operations on
- * powernv platform. Actually, the powernv was created in order to fully
- * hypervisor support.
+ * PowerNV Platform dependent EEH operations
  *
  * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
  */
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2f4479b94ac3..09f49eed7fb8 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -716,7 +716,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 		 * to reload MMCR0 (see mmcr0 comment above).
 		 */
 		if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
-			asm volatile(PPC_INVALIDATE_ERAT);
+			asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT);
 			mtspr(SPRN_MMCR0, mmcr0);
 		}
 
@@ -758,7 +758,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 	mtspr(SPRN_PTCR,	sprs.ptcr);
 	mtspr(SPRN_RPR,		sprs.rpr);
 	mtspr(SPRN_TSCR,	sprs.tscr);
-	mtspr(SPRN_LDBAR,	sprs.ldbar);
 
 	if (pls >= pnv_first_tb_loss_level) {
 		/* TB loss */
@@ -790,6 +789,7 @@ core_woken:
 	mtspr(SPRN_MMCR0,	sprs.mmcr0);
 	mtspr(SPRN_MMCR1,	sprs.mmcr1);
 	mtspr(SPRN_MMCR2,	sprs.mmcr2);
+	mtspr(SPRN_LDBAR,	sprs.ldbar);
 
 	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
 
@@ -1155,10 +1155,10 @@ static void __init pnv_power9_idle_init(void)
 			pnv_deepest_stop_psscr_mask);
 	}
 
-	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%lld\n",
+	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
 		pnv_first_spr_loss_level);
 
-	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%lld\n",
+	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
 		pnv_first_tb_loss_level);
 }
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c321fdbc2200..c16249d251f1 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -19,18 +19,25 @@
 
 #include "pci.h"
 
-/*
- * spinlock to protect initialisation of an npu_context for a particular
- * mm_struct.
- */
-static DEFINE_SPINLOCK(npu_context_lock);
-
 static struct pci_dev *get_pci_dev(struct device_node *dn)
 {
 	struct pci_dn *pdn = PCI_DN(dn);
+	struct pci_dev *pdev;
 
-	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
 					   pdn->busno, pdn->devfn);
+
+	/*
+	 * pci_get_domain_bus_and_slot() increased the reference count of
+	 * the PCI device, but callers don't need that actually as the PE
+	 * already holds a reference to the device. Since callers aren't
+	 * aware of the reference count change, call pci_dev_put() now to
+	 * avoid leaks.
+	 */
+	if (pdev)
+		pci_dev_put(pdev);
+
+	return pdev;
 }
 
 /* Given a NPU device get the associated PCI device. */
@@ -359,15 +366,6 @@ struct npu_comp {
 /* An NPU descriptor, valid for POWER9 only */
 struct npu {
 	int index;
-	__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
-	unsigned int mmio_atsd_count;
-
-	/* Bitmask for MMIO register usage */
-	unsigned long mmio_atsd_usage;
-
-	/* Do we need to explicitly flush the nest mmu? */
-	bool nmmu_flush;
-
 	struct npu_comp npucomp;
 };
 
@@ -624,534 +622,8 @@ struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
 }
 #endif /* CONFIG_IOMMU_API */
 
-/* Maximum number of nvlinks per npu */
-#define NV_MAX_LINKS 6
-
-/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
-static int max_npu2_index;
-
-struct npu_context {
-	struct mm_struct *mm;
-	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
-	struct mmu_notifier mn;
-	struct kref kref;
-	bool nmmu_flush;
-
-	/* Callback to stop translation requests on a given GPU */
-	void (*release_cb)(struct npu_context *context, void *priv);
-
-	/*
-	 * Private pointer passed to the above callback for usage by
-	 * device drivers.
-	 */
-	void *priv;
-};
-
-struct mmio_atsd_reg {
-	struct npu *npu;
-	int reg;
-};
-
-/*
- * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
- * if none are available.
- */
-static int get_mmio_atsd_reg(struct npu *npu)
-{
-	int i;
-
-	for (i = 0; i < npu->mmio_atsd_count; i++) {
-		if (!test_bit(i, &npu->mmio_atsd_usage))
-			if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
-				return i;
-	}
-
-	return -ENOSPC;
-}
-
-static void put_mmio_atsd_reg(struct npu *npu, int reg)
-{
-	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
-}
-
-/* MMIO ATSD register offsets */
-#define XTS_ATSD_LAUNCH 0
-#define XTS_ATSD_AVA    1
-#define XTS_ATSD_STAT   2
-
-static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
-{
-	unsigned long launch = 0;
-
-	if (psize == MMU_PAGE_COUNT) {
-		/* IS set to invalidate entire matching PID */
-		launch |= PPC_BIT(12);
-	} else {
-		/* AP set to invalidate region of psize */
-		launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
-	}
-
-	/* PRS set to process-scoped */
-	launch |= PPC_BIT(13);
-
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
-
-	/* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
-
-	return launch;
-}
-
-static void mmio_atsd_regs_write(struct mmio_atsd_reg
-			mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
-			unsigned long val)
-{
-	struct npu *npu;
-	int i, reg;
-
-	for (i = 0; i <= max_npu2_index; i++) {
-		reg = mmio_atsd_reg[i].reg;
-		if (reg < 0)
-			continue;
-
-		npu = mmio_atsd_reg[i].npu;
-		__raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
-	}
-}
-
-static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
-				unsigned long pid)
-{
-	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
-
-	/* Invalidating the entire process doesn't use a va */
-	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-static void mmio_invalidate_range(struct mmio_atsd_reg
-			mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
-			unsigned long start, unsigned long psize)
-{
-	unsigned long launch = get_atsd_launch_val(pid, psize);
-
-	/* Write all VAs first */
-	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
-
-	/* Issue one barrier for all address writes */
-	eieio();
-
-	/* Launch */
-	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
-
-static void mmio_invalidate_wait(
-	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
-	struct npu *npu;
-	int i, reg;
-
-	/* Wait for all invalidations to complete */
-	for (i = 0; i <= max_npu2_index; i++) {
-		if (mmio_atsd_reg[i].reg < 0)
-			continue;
-
-		/* Wait for completion */
-		npu = mmio_atsd_reg[i].npu;
-		reg = mmio_atsd_reg[i].reg;
-		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
-			cpu_relax();
-	}
-}
-
-/*
- * Acquires all the address translation shootdown (ATSD) registers required to
- * launch an ATSD on all links this npu_context is active on.
- */
-static void acquire_atsd_reg(struct npu_context *npu_context,
-			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
-	int i, j;
-	struct npu *npu;
-	struct pci_dev *npdev;
-
-	for (i = 0; i <= max_npu2_index; i++) {
-		mmio_atsd_reg[i].reg = -1;
-		for (j = 0; j < NV_MAX_LINKS; j++) {
-			/*
-			 * There are no ordering requirements with respect to
-			 * the setup of struct npu_context, but to ensure
-			 * consistent behaviour we need to ensure npdev[][] is
-			 * only read once.
-			 */
-			npdev = READ_ONCE(npu_context->npdev[i][j]);
-			if (!npdev)
-				continue;
-
-			npu = pci_bus_to_host(npdev->bus)->npu;
-			if (!npu)
-				continue;
-
-			mmio_atsd_reg[i].npu = npu;
-			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
-			while (mmio_atsd_reg[i].reg < 0) {
-				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
-				cpu_relax();
-			}
-			break;
-		}
-	}
-}
-
-/*
- * Release previously acquired ATSD registers. To avoid deadlocks the registers
- * must be released in the same order they were acquired above in
- * acquire_atsd_reg.
- */
-static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
-	int i;
-
-	for (i = 0; i <= max_npu2_index; i++) {
-		/*
-		 * We can't rely on npu_context->npdev[][] being the same here
-		 * as when acquire_atsd_reg() was called, hence we use the
-		 * values stored in mmio_atsd_reg during the acquire phase
-		 * rather than re-reading npdev[][].
-		 */
-		if (mmio_atsd_reg[i].reg < 0)
-			continue;
-
-		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
-	}
-}
-
-/*
- * Invalidate a virtual address range
- */
-static void mmio_invalidate(struct npu_context *npu_context,
-			unsigned long start, unsigned long size)
-{
-	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
-	unsigned long pid = npu_context->mm->context.id;
-	unsigned long atsd_start = 0;
-	unsigned long end = start + size - 1;
-	int atsd_psize = MMU_PAGE_COUNT;
-
-	/*
-	 * Convert the input range into one of the supported sizes. If the range
-	 * doesn't fit, use the next larger supported size. Invalidation latency
-	 * is high, so over-invalidation is preferred to issuing multiple
-	 * invalidates.
-	 *
-	 * A 4K page size isn't supported by NPU/GPU ATS, so that case is
-	 * ignored.
-	 */
-	if (size == SZ_64K) {
-		atsd_start = start;
-		atsd_psize = MMU_PAGE_64K;
-	} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
-		atsd_start = ALIGN_DOWN(start, SZ_2M);
-		atsd_psize = MMU_PAGE_2M;
-	} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
-		atsd_start = ALIGN_DOWN(start, SZ_1G);
-		atsd_psize = MMU_PAGE_1G;
-	}
-
-	if (npu_context->nmmu_flush)
-		/*
-		 * Unfortunately the nest mmu does not support flushing specific
-		 * addresses so we have to flush the whole mm once before
-		 * shooting down the GPU translation.
-		 */
-		flush_all_mm(npu_context->mm);
-
-	/*
-	 * Loop over all the NPUs this process is active on and launch
-	 * an invalidate.
-	 */
-	acquire_atsd_reg(npu_context, mmio_atsd_reg);
-
-	if (atsd_psize == MMU_PAGE_COUNT)
-		mmio_invalidate_pid(mmio_atsd_reg, pid);
-	else
-		mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
-					atsd_psize);
-
-	mmio_invalidate_wait(mmio_atsd_reg);
-
-	/*
-	 * The GPU requires two flush ATSDs to ensure all entries have been
-	 * flushed. We use PID 0 as it will never be used for a process on the
-	 * GPU.
-	 */
-	mmio_invalidate_pid(mmio_atsd_reg, 0);
-	mmio_invalidate_wait(mmio_atsd_reg);
-	mmio_invalidate_pid(mmio_atsd_reg, 0);
-	mmio_invalidate_wait(mmio_atsd_reg);
-
-	release_atsd_reg(mmio_atsd_reg);
-}
-
-static void pnv_npu2_mn_release(struct mmu_notifier *mn,
-				struct mm_struct *mm)
-{
-	struct npu_context *npu_context = mn_to_npu_context(mn);
-
-	/* Call into device driver to stop requests to the NMMU */
-	if (npu_context->release_cb)
-		npu_context->release_cb(npu_context, npu_context->priv);
-
-	/*
-	 * There should be no more translation requests for this PID, but we
-	 * need to ensure any entries for it are removed from the TLB.
-	 */
-	mmio_invalidate(npu_context, 0, ~0UL);
-}
-
-static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long start, unsigned long end)
-{
-	struct npu_context *npu_context = mn_to_npu_context(mn);
-	mmio_invalidate(npu_context, start, end - start);
-}
-
-static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
-	.release = pnv_npu2_mn_release,
-	.invalidate_range = pnv_npu2_mn_invalidate_range,
-};
-
-/*
- * Call into OPAL to setup the nmmu context for the current task in
- * the NPU. This must be called to setup the context tables before the
- * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
- *
- * A release callback should be registered to allow a device driver to
- * be notified that it should not launch any new translation requests
- * as the final TLB invalidate is about to occur.
- *
- * Returns an error if there no contexts are currently available or a
- * npu_context which should be passed to pnv_npu2_handle_fault().
- *
- * mmap_sem must be held in write mode and must not be called from interrupt
- * context.
- */
-struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
-			unsigned long flags,
-			void (*cb)(struct npu_context *, void *),
-			void *priv)
-{
-	int rc;
-	u32 nvlink_index;
-	struct device_node *nvlink_dn;
-	struct mm_struct *mm = current->mm;
-	struct npu *npu;
-	struct npu_context *npu_context;
-	struct pci_controller *hose;
-
-	/*
-	 * At present we don't support GPUs connected to multiple NPUs and I'm
-	 * not sure the hardware does either.
-	 */
-	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-
-	if (!npdev)
-		/* No nvlink associated with this GPU device */
-		return ERR_PTR(-ENODEV);
-
-	/* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
-	if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
-		return ERR_PTR(-EINVAL);
-
-	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
-	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
-							&nvlink_index)))
-		return ERR_PTR(-ENODEV);
-
-	if (!mm || mm->context.id == 0) {
-		/*
-		 * Kernel thread contexts are not supported and context id 0 is
-		 * reserved on the GPU.
-		 */
-		return ERR_PTR(-EINVAL);
-	}
-
-	hose = pci_bus_to_host(npdev->bus);
-	npu = hose->npu;
-	if (!npu)
-		return ERR_PTR(-ENODEV);
-
-	/*
-	 * We store the npu pci device so we can more easily get at the
-	 * associated npus.
-	 */
-	spin_lock(&npu_context_lock);
-	npu_context = mm->context.npu_context;
-	if (npu_context) {
-		if (npu_context->release_cb != cb ||
-			npu_context->priv != priv) {
-			spin_unlock(&npu_context_lock);
-			return ERR_PTR(-EINVAL);
-		}
-
-		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
-	}
-	spin_unlock(&npu_context_lock);
-
-	if (!npu_context) {
-		/*
-		 * We can set up these fields without holding the
-		 * npu_context_lock as the npu_context hasn't been returned to
-		 * the caller meaning it can't be destroyed. Parallel allocation
-		 * is protected against by mmap_sem.
-		 */
-		rc = -ENOMEM;
-		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
-		if (npu_context) {
-			kref_init(&npu_context->kref);
-			npu_context->mm = mm;
-			npu_context->mn.ops = &nv_nmmu_notifier_ops;
-			rc = __mmu_notifier_register(&npu_context->mn, mm);
-		}
-
-		if (rc) {
-			kfree(npu_context);
-			return ERR_PTR(rc);
-		}
-
-		mm->context.npu_context = npu_context;
-	}
-
-	npu_context->release_cb = cb;
-	npu_context->priv = priv;
-
-	/*
-	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
-	 * npdev[][] to indicate to the mmu notifiers that an invalidation
-	 * should also be sent over this nvlink. The notifiers don't use any
-	 * other fields in npu_context, so we just need to ensure that when they
-	 * deference npu_context->npdev[][] it is either a valid pointer or
-	 * NULL.
-	 */
-	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
-
-	if (!npu->nmmu_flush) {
-		/*
-		 * If we're not explicitly flushing ourselves we need to mark
-		 * the thread for global flushes
-		 */
-		npu_context->nmmu_flush = false;
-		mm_context_add_copro(mm);
-	} else
-		npu_context->nmmu_flush = true;
-
-	return npu_context;
-}
-EXPORT_SYMBOL(pnv_npu2_init_context);
-
-static void pnv_npu2_release_context(struct kref *kref)
-{
-	struct npu_context *npu_context =
-		container_of(kref, struct npu_context, kref);
-
-	if (!npu_context->nmmu_flush)
-		mm_context_remove_copro(npu_context->mm);
-
-	npu_context->mm->context.npu_context = NULL;
-}
-
-/*
- * Destroy a context on the given GPU. May free the npu_context if it is no
- * longer active on any GPUs. Must not be called from interrupt context.
- */
-void pnv_npu2_destroy_context(struct npu_context *npu_context,
-			struct pci_dev *gpdev)
-{
-	int removed;
-	struct npu *npu;
-	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-	struct device_node *nvlink_dn;
-	u32 nvlink_index;
-	struct pci_controller *hose;
-
-	if (WARN_ON(!npdev))
-		return;
-
-	hose = pci_bus_to_host(npdev->bus);
-	npu = hose->npu;
-	if (!npu)
-		return;
-	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
-	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
-							&nvlink_index)))
-		return;
-	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
-	spin_lock(&npu_context_lock);
-	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
-	spin_unlock(&npu_context_lock);
-
-	/*
-	 * We need to do this outside of pnv_npu2_release_context so that it is
-	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
-	 */
-	if (removed) {
-		mmu_notifier_unregister(&npu_context->mn,
-					npu_context->mm);
-
-		kfree(npu_context);
-	}
-
-}
-EXPORT_SYMBOL(pnv_npu2_destroy_context);
-
-/*
- * Assumes mmap_sem is held for the contexts associated mm.
- */
-int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
-			unsigned long *flags, unsigned long *status, int count)
-{
-	u64 rc = 0, result = 0;
-	int i, is_write;
-	struct page *page[1];
-	const char __user *u;
-	char c;
-
-	/* mmap_sem should be held so the struct_mm must be present */
-	struct mm_struct *mm = context->mm;
-
-	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
-
-	for (i = 0; i < count; i++) {
-		is_write = flags[i] & NPU2_WRITE;
-		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
-					is_write ? FOLL_WRITE : 0,
-					page, NULL, NULL);
-
-		if (rc != 1) {
-			status[i] = rc;
-			result = -EFAULT;
-			continue;
-		}
-
-		/* Make sure partition scoped tree gets a pte */
-		u = page_address(page[0]);
-		if (__get_user(c, u))
-			result = -EFAULT;
-
-		status[i] = 0;
-		put_page(page[0]);
-	}
-
-	return result;
-}
-EXPORT_SYMBOL(pnv_npu2_handle_fault);
-
 int pnv_npu2_init(struct pci_controller *hose)
 {
-	unsigned int i;
-	u64 mmio_atsd;
 	static int npu_index;
 	struct npu *npu;
 	int ret;
@@ -1160,33 +632,18 @@ int pnv_npu2_init(struct pci_controller *hose)
 	if (!npu)
 		return -ENOMEM;
 
-	npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
-
-	for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
-			!of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
-				i, &mmio_atsd); i++)
-		npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
-
-	pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
-	npu->mmio_atsd_count = i;
-	npu->mmio_atsd_usage = 0;
 	npu_index++;
 	if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
 		ret = -ENOSPC;
 		goto fail_exit;
 	}
-	max_npu2_index = npu_index;
 	npu->index = npu_index;
 	hose->npu = npu;
 
 	return 0;
 
 fail_exit:
-	for (i = 0; i < npu->mmio_atsd_count; ++i)
-		iounmap(npu->mmio_atsd_regs[i]);
-
 	kfree(npu);
-
 	return ret;
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 36c8fa3647a2..29ca523c1c79 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -273,7 +273,6 @@ OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
 OPAL_CALL(opal_imc_counters_init,		OPAL_IMC_COUNTERS_INIT);
 OPAL_CALL(opal_imc_counters_start,		OPAL_IMC_COUNTERS_START);
 OPAL_CALL(opal_imc_counters_stop,		OPAL_IMC_COUNTERS_STOP);
-OPAL_CALL(opal_pci_set_p2p,			OPAL_PCI_SET_P2P);
 OPAL_CALL(opal_get_powercap,			OPAL_GET_POWERCAP);
 OPAL_CALL(opal_set_powercap,			OPAL_SET_POWERCAP);
 OPAL_CALL(opal_get_power_shift_ratio,		OPAL_GET_POWER_SHIFT_RATIO);
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index 5cae375525d0..3e1f064a18db 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -137,6 +137,43 @@ static void print_nx_checkstop_reason(const char *level,
 					xstop_reason[i].description);
 }
 
+static void print_npu_checkstop_reason(const char *level,
+					struct OpalHMIEvent *hmi_evt)
+{
+	uint8_t reason, reason_count, i;
+
+	/*
+	 * We may not have a checkstop reason on some combination of
+	 * hardware and/or skiboot version
+	 */
+	if (!hmi_evt->u.xstop_error.xstop_reason) {
+		printk("%s	NPU checkstop on chip %x\n", level,
+			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+		return;
+	}
+
+	/*
+	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
+	 *   2 bits for the FIR number
+	 *   6 bits for the bit number
+	 * It may be possible to find several reasons.
+	 *
+	 * We don't display a specific message per FIR bit as there
+	 * are too many and most are meaningless without the workbook
+	 * and/or hw team help anyway.
+	 */
+	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
+		sizeof(reason);
+	for (i = 0; i < reason_count; i++) {
+		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
+		if (reason)
+			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
+				level,
+				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
+				reason >> 6, reason & 0x3F);
+	}
+}
+
 static void print_checkstop_reason(const char *level,
 					struct OpalHMIEvent *hmi_evt)
 {
@@ -148,6 +185,9 @@ static void print_checkstop_reason(const char *level,
 	case CHECKSTOP_TYPE_NX:
 		print_nx_checkstop_reason(level, hmi_evt);
 		break;
+	case CHECKSTOP_TYPE_NPU:
+		print_npu_checkstop_reason(level, hmi_evt);
+		break;
 	default:
 		printk("%s	Unknown Malfunction Alert of type %d\n",
 		       level, type);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 98c5d94b17fb..aba443be7daa 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -202,16 +202,18 @@ static int __init opal_register_exception_handlers(void)
 	glue = 0x7000;
 
 	/*
-	 * Check if we are running on newer firmware that exports
-	 * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch
-	 * the HMI interrupt and we catch it directly in Linux.
+	 * Only ancient OPAL firmware requires this.
+	 * Specifically, firmware from FW810.00 (released June 2014)
+	 * through FW810.20 (Released October 2014).
 	 *
-	 * For older firmware (i.e currently released POWER8 System Firmware
-	 * as of today <= SV810_087), we fallback to old behavior and let OPAL
-	 * patch the HMI vector and handle it inside OPAL firmware.
+	 * Check if we are running on newer (post Oct 2014) firmware that
+	 * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
+	 * patch the HMI interrupt and we catch it directly in Linux.
 	 *
-	 * For newer firmware (in development/yet to be released) we will
-	 * start catching/handling HMI directly in Linux.
+	 * For older firmware (i.e < FW810.20), we fallback to old behavior and
+	 * let OPAL patch the HMI vector and handle it inside OPAL firmware.
+	 *
+	 * For newer firmware we catch/handle the HMI directly in Linux.
 	 */
 	if (!opal_check_token(OPAL_HANDLE_HMI)) {
 		pr_info("Old firmware detected, OPAL handles HMIs.\n");
@@ -221,6 +223,11 @@ static int __init opal_register_exception_handlers(void)
 		glue += 128;
 	}
 
+	/*
+	 * Only applicable to ancient firmware, all modern
+	 * (post March 2015/skiboot 5.0) firmware will just return
+	 * OPAL_UNSUPPORTED.
+	 */
 	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
 #endif
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 10cc42b9e541..d8080558d020 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -50,6 +50,8 @@
 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
 					      "NPU_OCAPI" };
 
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -2356,7 +2358,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	return 0;
 }
 
-void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
@@ -2456,6 +2458,14 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (!pnv_iommu_bypass_disabled)
 		pnv_pci_ioda2_set_bypass(pe, true);
 
+	/*
+	 * Set table base for the case of IOMMU DMA use. Usually this is done
+	 * from dma_dev_setup() which is not called when a device is returned
+	 * from VFIO so do it here.
+	 */
+	if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+
 	return 0;
 }
 
@@ -2543,6 +2553,8 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
+	else if (pe->pdev)
+		set_iommu_table_base(&pe->pdev->dev, NULL);
 	iommu_tce_table_put(tbl);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index ff1a33fee8e6..6104418c9ad5 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -34,7 +34,6 @@
 #include "powernv.h"
 #include "pci.h"
 
-static DEFINE_MUTEX(p2p_mutex);
 static DEFINE_MUTEX(tunnel_mutex);
 
 int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
@@ -857,79 +856,6 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
 	}
 }
 
-int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc)
-{
-	struct pci_controller *hose;
-	struct pnv_phb *phb_init, *phb_target;
-	struct pnv_ioda_pe *pe_init;
-	int rc;
-
-	if (!opal_check_token(OPAL_PCI_SET_P2P))
-		return -ENXIO;
-
-	hose = pci_bus_to_host(initiator->bus);
-	phb_init = hose->private_data;
-
-	hose = pci_bus_to_host(target->bus);
-	phb_target = hose->private_data;
-
-	pe_init = pnv_ioda_get_pe(initiator);
-	if (!pe_init)
-		return -ENODEV;
-
-	/*
-	 * Configuring the initiator's PHB requires to adjust its
-	 * TVE#1 setting. Since the same device can be an initiator
-	 * several times for different target devices, we need to keep
-	 * a reference count to know when we can restore the default
-	 * bypass setting on its TVE#1 when disabling. Opal is not
-	 * tracking PE states, so we add a reference count on the PE
-	 * in linux.
-	 *
-	 * For the target, the configuration is per PHB, so we keep a
-	 * target reference count on the PHB.
-	 */
-	mutex_lock(&p2p_mutex);
-
-	if (desc & OPAL_PCI_P2P_ENABLE) {
-		/* always go to opal to validate the configuration */
-		rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id,
-				      desc, pe_init->pe_number);
-
-		if (rc != OPAL_SUCCESS) {
-			rc = -EIO;
-			goto out;
-		}
-
-		pe_init->p2p_initiator_count++;
-		phb_target->p2p_target_count++;
-	} else {
-		if (!pe_init->p2p_initiator_count ||
-			!phb_target->p2p_target_count) {
-			rc = -EINVAL;
-			goto out;
-		}
-
-		if (--pe_init->p2p_initiator_count == 0)
-			pnv_pci_ioda2_set_bypass(pe_init, true);
-
-		if (--phb_target->p2p_target_count == 0) {
-			rc = opal_pci_set_p2p(phb_init->opal_id,
-					      phb_target->opal_id, desc,
-					      pe_init->pe_number);
-			if (rc != OPAL_SUCCESS) {
-				rc = -EIO;
-				goto out;
-			}
-		}
-	}
-	rc = 0;
-out:
-	mutex_unlock(&p2p_mutex);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_set_p2p);
-
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -938,54 +864,6 @@ struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
 }
 EXPORT_SYMBOL(pnv_pci_get_phb_node);
 
-int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind)
-{
-	struct device_node *np;
-	const __be32 *prop;
-	struct pnv_ioda_pe *pe;
-	uint16_t window_id;
-	int rc;
-
-	if (!radix_enabled())
-		return -ENXIO;
-
-	if (!(np = pnv_pci_get_phb_node(dev)))
-		return -ENXIO;
-
-	prop = of_get_property(np, "ibm,phb-indications", NULL);
-	of_node_put(np);
-
-	if (!prop || !prop[1])
-		return -ENXIO;
-
-	*asnind = (u64)be32_to_cpu(prop[1]);
-	pe = pnv_ioda_get_pe(dev);
-	if (!pe)
-		return -ENODEV;
-
-	/* Increase real window size to accept as_notify messages. */
-	window_id = (pe->pe_number << 1 ) + 1;
-	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number,
-					     window_id, pe->tce_bypass_base,
-					     (uint64_t)1 << 48);
-	return opal_error_code(rc);
-}
-EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel);
-
-int pnv_pci_disable_tunnel(struct pci_dev *dev)
-{
-	struct pnv_ioda_pe *pe;
-
-	pe = pnv_ioda_get_pe(dev);
-	if (!pe)
-		return -ENODEV;
-
-	/* Restore default real window size. */
-	pnv_pci_ioda2_set_bypass(pe, true);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel);
-
 int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
 {
 	__be64 val;
@@ -1040,29 +918,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
 
-#ifdef CONFIG_PPC64	/* for thread.tidr */
-int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
-			       u32 *tid)
-{
-	struct mm_struct *mm = NULL;
-
-	if (task == NULL)
-		return -EINVAL;
-
-	mm = get_task_mm(task);
-	if (mm == NULL)
-		return -EINVAL;
-
-	*pid = mm->context.id;
-	mmput(mm);
-
-	*tid = task->thread.tidr;
-	*lpid = mfspr(SPRN_LPID);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info);
-#endif
-
 void pnv_pci_shutdown(void)
 {
 	struct pci_controller *hose;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index be26ab3d99e0..469c24463247 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -79,9 +79,6 @@ struct pnv_ioda_pe {
 	struct pnv_ioda_pe	*master;
 	struct list_head	slaves;
 
-	/* PCI peer-to-peer*/
-	int			p2p_initiator_count;
-
 	/* Link in list of PE#s */
 	struct list_head	list;
 };
@@ -172,8 +169,6 @@ struct pnv_phb {
 	/* PHB and hub diagnostics */
 	unsigned int		diag_data_size;
 	u8			*diag_data;
-
-	int p2p_target_count;
 };
 
 extern struct pci_ops pnv_pci_ops;
@@ -200,7 +195,6 @@ extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
-extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
 extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index ea5ca0201da8..0c0d27d17976 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -40,16 +40,6 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
 	pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
 }
 
-u64 vas_win_paste_addr(struct vas_window *win)
-{
-	u64 addr;
-
-	compute_paste_address(win, &addr, NULL);
-
-	return addr;
-}
-EXPORT_SYMBOL(vas_win_paste_addr);
-
 static inline void get_hvwc_mmio_bar(struct vas_window *window,
 			u64 *start, int *len)
 {
@@ -1264,12 +1254,3 @@ int vas_win_close(struct vas_window *window)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vas_win_close);
-
-/*
- * Return a system-wide unique window id for the window @win.
- */
-u32 vas_win_id(struct vas_window *win)
-{
-	return encode_pswid(win->vinst->vas_id, win->winid);
-}
-EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 9cc5251816db..5574aec9ee88 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -444,26 +444,6 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
 	return in_be64(win->hvwc_map+reg);
 }
 
-/*
- * Encode/decode the Partition Send Window ID (PSWID) for a window in
- * a way that we can uniquely identify any window in the system. i.e.
- * we should be able to locate the 'struct vas_window' given the PSWID.
- *
- *	Bits	Usage
- *	0:7	VAS id (8 bits)
- *	8:15	Unused, 0 (3 bits)
- *	16:31	Window id (16 bits)
- */
-static inline u32 encode_pswid(int vasid, int winid)
-{
-	u32 pswid = 0;
-
-	pswid |= vasid << (31 - 7);
-	pswid |= winid;
-
-	return pswid;
-}
-
 static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
 {
 	if (vasid)