diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
29 files changed, 3291 insertions, 147 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 6a6f4ef46b9e..f8dc98d3dc01 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" @@ -30,3 +31,25 @@ config OPAL_PRD help This enables the opal-prd driver, a facility to run processor recovery diagnostics on OpenPower machines + +config PPC_MEMTRACE + bool "Enable removal of RAM from kernel mappings for tracing" + depends on PPC_POWERNV && MEMORY_HOTREMOVE + default n + help + Enabling this option allows for the removal of memory (RAM) + from the kernel mappings to be used for hardware tracing. + +config PPC_VAS + bool "IBM Virtual Accelerator Switchboard (VAS)" + depends on PPC_POWERNV && PPC_64K_PAGES + default y + help + This enables support for IBM Virtual Accelerator Switchboard (VAS). + + VAS allows accelerators in co-processors like NX-GZIP and NX-842 + to be accessible to kernel subsystems and user processes. + + VAS adapters are found in POWER9 based systems. + + If unsure, say N. diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b5d98cb3f482..7a31c26500e6 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o -obj-y += opal-kmsg.o +obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o @@ -12,3 +13,6 @@ obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o +obj-$(CONFIG_PERF_EVENTS) += opal-imc.o +obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h new file mode 100644 index 000000000000..c9a503623431 --- /dev/null +++ b/arch/powerpc/platforms/powernv/copy-paste.h @@ -0,0 +1,46 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc-opcode.h> + +#define CR0_SHIFT 28 +#define CR0_MASK 0xF +/* + * Copy/paste instructions: + * + * copy RA,RB + * Copy contents of address (RA) + effective_address(RB) + * to internal copy-buffer. + * + * paste RA,RB + * Paste contents of internal copy-buffer to the address + * (RA) + effective_address(RB) + */ +static inline int vas_copy(void *crb, int offset) +{ + asm volatile(PPC_COPY(%0, %1)";" + : + : "b" (offset), "b" (crb) + : "memory"); + + return 0; +} + +static inline int vas_paste(void *paste_address, int offset) +{ + u32 cr; + + cr = 0; + asm volatile(PPC_PASTE(%1, %2)";" + "mfocrf %0, 0x80;" + : "=r" (cr) + : "b" (offset), "b" (paste_address) + : "memory", "cr0"); + + return (cr >> CR0_SHIFT) & CR0_MASK; +} diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3f48f6df1cf3..8864065eba22 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -113,7 +113,6 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, size_t count, loff_t *ppos) { struct pci_controller *hose = filp->private_data; - struct eeh_dev *edev; struct eeh_pe *pe; int pe_no, type, func; unsigned long addr, mask; @@ -135,13 +134,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, return -EINVAL; /* Retrieve PE */ - edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) - return -ENOMEM; - edev->phb = hose; - edev->pe_config_addr = pe_no; - pe = eeh_pe_get(edev); - kfree(edev); + pe = eeh_pe_get(hose, pe_no, 0); if (!pe) return -ENODEV; @@ -359,6 +352,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) struct eeh_dev *edev = pdn_to_eeh_dev(pdn); uint32_t pcie_flags; int ret; + int config_addr = (pdn->busno << 8) | (pdn->devfn); /* * When probing the root bridge, which doesn't have any @@ -393,8 +387,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) } } - edev->config_addr = (pdn->busno << 8) | (pdn->devfn); - edev->pe_config_addr = phb->ioda.pe_rmap[edev->config_addr]; + edev->pe_config_addr = phb->ioda.pe_rmap[config_addr]; /* Create PE */ ret = eeh_add_to_parent_pe(edev); @@ -933,7 +926,6 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, int pos, u16 mask) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); int i, status = 0; /* Wait for Transaction Pending bit to be cleared */ @@ -947,7 +939,7 @@ static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n", __func__, type, - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); } @@ -1381,7 +1373,6 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, struct pnv_phb *phb = hose->private_data; struct pnv_ioda_pe *pnv_pe; struct eeh_pe *dev_pe; - struct eeh_dev edev; /* * If PHB supports compound PE, to fetch @@ -1397,10 +1388,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, } /* Find the PE according to PE# */ - memset(&edev, 0, sizeof(struct eeh_dev)); - edev.phb = hose; - edev.pe_config_addr = pe_no; - dev_pe = eeh_pe_get(&edev); + dev_pe = eeh_pe_get(hose, pe_no, 0); if (!dev_pe) return -EEXIST; @@ -1711,6 +1699,7 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret; + int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; @@ -1725,14 +1714,14 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) if (edev->physfn) { ret = pnv_eeh_restore_vf_config(pdn); } else { - phb = edev->phb->private_data; + phb = pdn->phb->private_data; ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, edev->config_addr); + OPAL_REINIT_PCI_DEV, config_addr); } if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, edev->config_addr, ret); + __func__, config_addr, ret); return -EIO; } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index a553aeea7af6..443d5ca71995 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -69,7 +69,7 @@ static int pnv_save_sprs_for_deep_states(void) * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. */ - uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + uint64_t lpcr_val = mfspr(SPRN_LPCR); uint64_t hid0_val = mfspr(SPRN_HID0); uint64_t hid1_val = mfspr(SPRN_HID1); uint64_t hid4_val = mfspr(SPRN_HID4); @@ -388,6 +388,20 @@ void power9_idle(void) } #ifdef CONFIG_HOTPLUG_CPU +static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) +{ + u64 pir = get_hard_smp_processor_id(cpu); + + mtspr(SPRN_LPCR, lpcr_val); + + /* + * Program the LPCR via stop-api only if the deepest stop state + * can lose hypervisor context. + */ + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) + opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); +} + /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. @@ -397,6 +411,20 @@ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); + u64 lpcr_val; + + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -428,6 +456,16 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c new file mode 100644 index 000000000000..de470caf0784 --- /dev/null +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -0,0 +1,282 @@ +/* + * Copyright (C) IBM Corporation, 2014, 2017 + * Anton Blanchard, Rashmica Gupta. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#define pr_fmt(fmt) "memtrace: " fmt + +#include <linux/bitops.h> +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/moduleparam.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <asm/machdep.h> +#include <asm/debugfs.h> + +/* This enables us to keep track of the memory removed from each node. */ +struct memtrace_entry { + void *mem; + u64 start; + u64 size; + u32 nid; + struct dentry *dir; + char name[16]; +}; + +static u64 memtrace_size; + +static struct memtrace_entry *memtrace_array; +static unsigned int memtrace_array_nr; + + +static ssize_t memtrace_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct memtrace_entry *ent = filp->private_data; + + return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size); +} + +static bool valid_memtrace_range(struct memtrace_entry *dev, + unsigned long start, unsigned long size) +{ + if ((start >= dev->start) && + ((start + size) <= (dev->start + dev->size))) + return true; + + return false; +} + +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + struct memtrace_entry *dev = filp->private_data; + + if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size)) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (remap_pfn_range(vma, vma->vm_start, + vma->vm_pgoff + (dev->start >> PAGE_SHIFT), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static const struct file_operations memtrace_fops = { + .llseek = default_llseek, + .read = memtrace_read, + .mmap = memtrace_mmap, + .open = simple_open, +}; + +static void flush_memory_region(u64 base, u64 size) +{ + unsigned long line_size = ppc64_caches.l1d.size; + u64 end = base + size; + u64 addr; + + base = round_down(base, line_size); + end = round_up(end, line_size); + + for (addr = base; addr < end; addr += line_size) + asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory"); +} + +static int check_memblock_online(struct memory_block *mem, void *arg) +{ + if (mem->state != MEM_ONLINE) + return -1; + + return 0; +} + +static int change_memblock_state(struct memory_block *mem, void *arg) +{ + unsigned long state = (unsigned long)arg; + + mem->state = state; + + return 0; +} + +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) +{ + u64 end_pfn = start_pfn + nr_pages - 1; + + if (walk_memory_range(start_pfn, end_pfn, NULL, + check_memblock_online)) + return false; + + walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE, + change_memblock_state); + + if (offline_pages(start_pfn, nr_pages)) { + walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE, + change_memblock_state); + return false; + } + + walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, + change_memblock_state); + + /* RCU grace period? */ + flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), + nr_pages << PAGE_SHIFT); + + lock_device_hotplug(); + remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + unlock_device_hotplug(); + + return true; +} + +static u64 memtrace_alloc_node(u32 nid, u64 size) +{ + u64 start_pfn, end_pfn, nr_pages; + u64 base_pfn; + + if (!NODE_DATA(nid) || !node_spanned_pages(nid)) + return 0; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + nr_pages = size >> PAGE_SHIFT; + + /* Trace memory needs to be aligned to the size */ + end_pfn = round_down(end_pfn - nr_pages, nr_pages); + + for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { + if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) + return base_pfn << PAGE_SHIFT; + } + + return 0; +} + +static int memtrace_init_regions_runtime(u64 size) +{ + u32 nid; + u64 m; + + memtrace_array = kcalloc(num_online_nodes(), + sizeof(struct memtrace_entry), GFP_KERNEL); + if (!memtrace_array) { + pr_err("Failed to allocate memtrace_array\n"); + return -EINVAL; + } + + for_each_online_node(nid) { + m = memtrace_alloc_node(nid, size); + + /* + * A node might not have any local memory, so warn but + * continue on. + */ + if (!m) { + pr_err("Failed to allocate trace memory on node %d\n", nid); + continue; + } + + pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m); + + memtrace_array[memtrace_array_nr].start = m; + memtrace_array[memtrace_array_nr].size = size; + memtrace_array[memtrace_array_nr].nid = nid; + memtrace_array_nr++; + } + + return 0; +} + +static struct dentry *memtrace_debugfs_dir; + +static int memtrace_init_debugfs(void) +{ + int ret = 0; + int i; + + for (i = 0; i < memtrace_array_nr; i++) { + struct dentry *dir; + struct memtrace_entry *ent = &memtrace_array[i]; + + ent->mem = ioremap(ent->start, ent->size); + /* Warn but continue on */ + if (!ent->mem) { + pr_err("Failed to map trace memory at 0x%llx\n", + ent->start); + ret = -1; + continue; + } + + snprintf(ent->name, 16, "%08x", ent->nid); + dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir); + if (!dir) + return -1; + + ent->dir = dir; + debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops); + debugfs_create_x64("start", 0400, dir, &ent->start); + debugfs_create_x64("size", 0400, dir, &ent->size); + } + + return ret; +} + +static int memtrace_enable_set(void *data, u64 val) +{ + if (memtrace_size) + return -EINVAL; + + if (!val) + return -EINVAL; + + /* Make sure size is aligned to a memory block */ + if (val & (memory_block_size_bytes() - 1)) + return -EINVAL; + + if (memtrace_init_regions_runtime(val)) + return -EINVAL; + + if (memtrace_init_debugfs()) + return -EINVAL; + + memtrace_size = val; + + return 0; +} + +static int memtrace_enable_get(void *data, u64 *val) +{ + *val = memtrace_size; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, + memtrace_enable_set, "0x%016llx\n"); + +static int memtrace_init(void) +{ + memtrace_debugfs_dir = debugfs_create_dir("memtrace", + powerpc_debugfs_root); + if (!memtrace_debugfs_dir) + return -1; + + debugfs_create_file("enable", 0600, memtrace_debugfs_dir, + NULL, &memtrace_init_fops); + + return 0; +} +machine_device_initcall(powernv, memtrace_init); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b5d960d6db3d..2cb6cbea4b3b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -546,6 +546,12 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, unsigned long pid = npu_context->mm->context.id; /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* * Loop over all the NPUs this process is active on and launch * an invalidate. */ @@ -576,12 +582,6 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, } } - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_tlb_mm(npu_context->mm); - mmio_invalidate_wait(mmio_atsd_reg, flush); if (flush) /* Wait for the flush to complete */ @@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, mmio_invalidate(npu_context, 1, address, true); } -static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - - mmio_invalidate(npu_context, 1, address, true); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, .change_pte = pnv_npu2_mn_change_pte, - .invalidate_page = pnv_npu2_mn_invalidate_page, .invalidate_range = pnv_npu2_mn_invalidate_range, }; diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 83bebeec0fea..cf33769a7b72 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -171,8 +171,8 @@ int __init opal_async_comp_init(void) async = of_get_property(opal_node, "opal-msg-async-num", NULL); if (!async) { - pr_err("%s: %s has no opal-msg-async-num\n", - __func__, opal_node->full_name); + pr_err("%s: %pOF has no opal-msg-async-num\n", + __func__, opal_node); err = -ENOENT; goto out_opal_node; } diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index 4ec6219287fc..2fa3ac80cb4e 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -520,7 +520,7 @@ out: * update_flash : Flash new firmware image * */ -static struct bin_attribute image_data_attr = { +static const struct bin_attribute image_data_attr = { .attr = {.name = "image", .mode = 0200}, .size = MAX_IMAGE_SIZE, /* Limit image size */ .write = image_data_write, diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index 88f3c61eec95..d78fed728cdf 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -30,6 +30,8 @@ #include <asm/cputable.h> #include <asm/machdep.h> +#include "powernv.h" + static int opal_hmi_handler_nb_init; struct OpalHmiEvtNode { struct list_head list; @@ -267,8 +269,6 @@ static void hmi_event_handler(struct work_struct *work) spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); if (unrecoverable) { - int ret; - /* Pull all HMI events from OPAL before we panic. */ while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { u32 type; @@ -284,23 +284,7 @@ static void hmi_event_handler(struct work_struct *work) print_hmi_event_info(hmi_evt); } - /* - * Unrecoverable HMI exception. We need to inform BMC/OCC - * about this error so that it can collect relevant data - * for error analysis before rebooting. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable HMI exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * Fall through and panic if opal_cec_reboot2() returns - * OPAL_UNSUPPORTED. - */ - panic("Unrecoverable HMI exception"); + pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception"); } } diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c new file mode 100644 index 000000000000..21f6531fae20 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -0,0 +1,226 @@ +/* + * OPAL IMC interface detection driver + * Supported on POWERNV platform + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ +#include <linux/kernel.h> +#include <linux/platform_device.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/crash_dump.h> +#include <asm/opal.h> +#include <asm/io.h> +#include <asm/imc-pmu.h> +#include <asm/cputhreads.h> + +/* + * imc_get_mem_addr_nest: Function to get nest counter memory region + * for each chip + */ +static int imc_get_mem_addr_nest(struct device_node *node, + struct imc_pmu *pmu_ptr, + u32 offset) +{ + int nr_chips = 0, i; + u64 *base_addr_arr, baddr; + u32 *chipid_arr; + + nr_chips = of_property_count_u32_elems(node, "chip-id"); + if (nr_chips <= 0) + return -ENODEV; + + base_addr_arr = kcalloc(nr_chips, sizeof(u64), GFP_KERNEL); + if (!base_addr_arr) + return -ENOMEM; + + chipid_arr = kcalloc(nr_chips, sizeof(u32), GFP_KERNEL); + if (!chipid_arr) + return -ENOMEM; + + if (of_property_read_u32_array(node, "chip-id", chipid_arr, nr_chips)) + goto error; + + if (of_property_read_u64_array(node, "base-addr", base_addr_arr, + nr_chips)) + goto error; + + pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(struct imc_mem_info), + GFP_KERNEL); + if (!pmu_ptr->mem_info) + goto error; + + for (i = 0; i < nr_chips; i++) { + pmu_ptr->mem_info[i].id = chipid_arr[i]; + baddr = base_addr_arr[i] + offset; + pmu_ptr->mem_info[i].vbase = phys_to_virt(baddr); + } + + pmu_ptr->imc_counter_mmaped = true; + kfree(base_addr_arr); + kfree(chipid_arr); + return 0; + +error: + kfree(pmu_ptr->mem_info); + kfree(base_addr_arr); + kfree(chipid_arr); + return -1; +} + +/* + * imc_pmu_create : Takes the parent device which is the pmu unit, pmu_index + * and domain as the inputs. + * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets + */ +static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +{ + int ret = 0; + struct imc_pmu *pmu_ptr; + u32 offset; + + /* memory for pmu */ + pmu_ptr = kzalloc(sizeof(struct imc_pmu), GFP_KERNEL); + if (!pmu_ptr) + return -ENOMEM; + + /* Set the domain */ + pmu_ptr->domain = domain; + + ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size); + if (ret) { + ret = -EINVAL; + goto free_pmu; + } + + if (!of_property_read_u32(parent, "offset", &offset)) { + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { + ret = -EINVAL; + goto free_pmu; + } + } + + /* Function to register IMC pmu */ + ret = init_imc_pmu(parent, pmu_ptr, pmu_index); + if (ret) + pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name); + + return 0; + +free_pmu: + kfree(pmu_ptr); + return ret; +} + +static void disable_nest_pmu_counters(void) +{ + int nid, cpu; + const struct cpumask *l_cpumask; + + get_online_cpus(); + for_each_online_node(nid) { + l_cpumask = cpumask_of_node(nid); + cpu = cpumask_first(l_cpumask); + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + } + put_online_cpus(); +} + +static void disable_core_pmu_counters(void) +{ + cpumask_t cores_map; + int cpu, rc; + + get_online_cpus(); + /* Disable the IMC Core functions */ + cores_map = cpu_online_cores_map(); + for_each_cpu(cpu, &cores_map) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + if (rc) + pr_err("%s: Failed to stop Core (cpu = %d)\n", + __FUNCTION__, cpu); + } + put_online_cpus(); +} + +static int opal_imc_counters_probe(struct platform_device *pdev) +{ + struct device_node *imc_dev = pdev->dev.of_node; + int pmu_count = 0, domain; + u32 type; + + /* + * Check whether this is kdump kernel. If yes, force the engines to + * stop and return. + */ + if (is_kdump_kernel()) { + disable_nest_pmu_counters(); + disable_core_pmu_counters(); + return -ENODEV; + } + + for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + if (of_property_read_u32(imc_dev, "type", &type)) { + pr_warn("IMC Device without type property\n"); + continue; + } + + switch (type) { + case IMC_TYPE_CHIP: + domain = IMC_DOMAIN_NEST; + break; + case IMC_TYPE_CORE: + domain =IMC_DOMAIN_CORE; + break; + case IMC_TYPE_THREAD: + domain = IMC_DOMAIN_THREAD; + break; + default: + pr_warn("IMC Unknown Device type \n"); + domain = -1; + break; + } + + if (!imc_pmu_create(imc_dev, pmu_count, domain)) + pmu_count++; + } + + return 0; +} + +static void opal_imc_counters_shutdown(struct platform_device *pdev) +{ + /* + * Function only stops the engines which is bare minimum. + * TODO: Need to handle proper memory cleanup and pmu + * unregister. + */ + disable_nest_pmu_counters(); + disable_core_pmu_counters(); +} + +static const struct of_device_id opal_imc_match[] = { + { .compatible = IMC_DTB_COMPAT }, + {}, +}; + +static struct platform_driver opal_imc_driver = { + .driver = { + .name = "opal-imc-counters", + .of_match_table = opal_imc_match, + }, + .probe = opal_imc_counters_probe, + .shutdown = opal_imc_counters_shutdown, +}; + +builtin_platform_driver(opal_imc_driver); diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c new file mode 100644 index 000000000000..badb29bde93f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -0,0 +1,244 @@ +/* + * PowerNV OPAL Powercap interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-powercap: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(powercap_mutex); + +static struct kobject *powercap_kobj; + +struct powercap_attr { + u32 handle; + struct kobj_attribute attr; +}; + +static struct pcap { + struct attribute_group pg; + struct powercap_attr *pattrs; +} *pcaps; + +static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct powercap_attr *pcap_attr = container_of(attr, + struct powercap_attr, attr); + struct opal_msg msg; + u32 pcap; + int ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&powercap_mutex); + if (ret) + goto out_token; + + ret = opal_get_powercap(pcap_attr->handle, token, (u32 *)__pa(&pcap)); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) { + ret = sprintf(buf, "%u\n", be32_to_cpu(pcap)); + if (ret < 0) + ret = -EIO; + } + break; + case OPAL_SUCCESS: + ret = sprintf(buf, "%u\n", be32_to_cpu(pcap)); + if (ret < 0) + ret = -EIO; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&powercap_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static ssize_t powercap_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + struct powercap_attr *pcap_attr = container_of(attr, + struct powercap_attr, attr); + struct opal_msg msg; + u32 pcap; + int ret, token; + + ret = kstrtoint(buf, 0, &pcap); + if (ret) + return ret; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&powercap_mutex); + if (ret) + goto out_token; + + ret = opal_set_powercap(pcap_attr->handle, token, pcap); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&powercap_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static void powercap_add_attr(int handle, const char *name, + struct powercap_attr *attr) +{ + attr->handle = handle; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = name; + attr->attr.attr.mode = 0444; + attr->attr.show = powercap_show; +} + +void __init opal_powercap_init(void) +{ + struct device_node *powercap, *node; + int i = 0; + + powercap = of_find_compatible_node(NULL, NULL, "ibm,opal-powercap"); + if (!powercap) { + pr_devel("Powercap node not found\n"); + return; + } + + pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps), + GFP_KERNEL); + if (!pcaps) + return; + + powercap_kobj = kobject_create_and_add("powercap", opal_kobj); + if (!powercap_kobj) { + pr_warn("Failed to create powercap kobject\n"); + goto out_pcaps; + } + + i = 0; + for_each_child_of_node(powercap, node) { + u32 cur, min, max; + int j = 0; + bool has_cur = false, has_min = false, has_max = false; + + if (!of_property_read_u32(node, "powercap-min", &min)) { + j++; + has_min = true; + } + + if (!of_property_read_u32(node, "powercap-max", &max)) { + j++; + has_max = true; + } + + if (!of_property_read_u32(node, "powercap-current", &cur)) { + j++; + has_cur = true; + } + + pcaps[i].pattrs = kcalloc(j, sizeof(struct powercap_attr), + GFP_KERNEL); + if (!pcaps[i].pattrs) + goto out_pcaps_pattrs; + + pcaps[i].pg.attrs = kcalloc(j + 1, sizeof(struct attribute *), + GFP_KERNEL); + if (!pcaps[i].pg.attrs) { + kfree(pcaps[i].pattrs); + goto out_pcaps_pattrs; + } + + j = 0; + pcaps[i].pg.name = node->name; + if (has_min) { + powercap_add_attr(min, "powercap-min", + &pcaps[i].pattrs[j]); + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (has_max) { + powercap_add_attr(max, "powercap-max", + &pcaps[i].pattrs[j]); + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (has_cur) { + powercap_add_attr(cur, "powercap-current", + &pcaps[i].pattrs[j]); + pcaps[i].pattrs[j].attr.attr.mode |= 0220; + pcaps[i].pattrs[j].attr.store = powercap_store; + pcaps[i].pg.attrs[j] = &pcaps[i].pattrs[j].attr.attr; + j++; + } + + if (sysfs_create_group(powercap_kobj, &pcaps[i].pg)) { + pr_warn("Failed to create powercap attribute group %s\n", + pcaps[i].pg.name); + goto out_pcaps_pattrs; + } + i++; + } + + return; + +out_pcaps_pattrs: + while (--i >= 0) { + kfree(pcaps[i].pattrs); + kfree(pcaps[i].pg.attrs); + } + kobject_put(powercap_kobj); +out_pcaps: + kfree(pcaps); +} diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 2d6ee1c5ad85..de4dd09f4a15 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -241,15 +241,9 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size = be16_to_cpu(hdr.size); - msg = kmalloc(size, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - rc = copy_from_user(msg, buf, size); - if (rc) { - size = -EFAULT; - goto out_free; - } + msg = memdup_user(buf, size); + if (IS_ERR(msg)) + return PTR_ERR(msg); rc = opal_prd_msg(msg); if (rc) { @@ -257,7 +251,6 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size = -EIO; } -out_free: kfree(msg); return size; diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c new file mode 100644 index 000000000000..7313b7fc9071 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-psr.c @@ -0,0 +1,175 @@ +/* + * PowerNV OPAL Power-Shift-Ratio interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-psr: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(psr_mutex); + +static struct kobject *psr_kobj; + +struct psr_attr { + u32 handle; + struct kobj_attribute attr; +} *psr_attrs; + +static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr); + struct opal_msg msg; + int psr, ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&psr_mutex); + if (ret) + goto out_token; + + ret = opal_get_power_shift_ratio(psr_attr->handle, token, + (u32 *)__pa(&psr)); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) { + ret = sprintf(buf, "%u\n", be32_to_cpu(psr)); + if (ret < 0) + ret = -EIO; + } + break; + case OPAL_SUCCESS: + ret = sprintf(buf, "%u\n", be32_to_cpu(psr)); + if (ret < 0) + ret = -EIO; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&psr_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static ssize_t psr_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr); + struct opal_msg msg; + int psr, ret, token; + + ret = kstrtoint(buf, 0, &psr); + if (ret) + return ret; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&psr_mutex); + if (ret) + goto out_token; + + ret = opal_set_power_shift_ratio(psr_attr->handle, token, psr); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&psr_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +void __init opal_psr_init(void) +{ + struct device_node *psr, *node; + int i = 0; + + psr = of_find_compatible_node(NULL, NULL, + "ibm,opal-power-shift-ratio"); + if (!psr) { + pr_devel("Power-shift-ratio node not found\n"); + return; + } + + psr_attrs = kcalloc(of_get_child_count(psr), sizeof(struct psr_attr), + GFP_KERNEL); + if (!psr_attrs) + return; + + psr_kobj = kobject_create_and_add("psr", opal_kobj); + if (!psr_kobj) { + pr_warn("Failed to create psr kobject\n"); + goto out; + } + + for_each_child_of_node(psr, node) { + if (of_property_read_u32(node, "handle", + &psr_attrs[i].handle)) + goto out_kobj; + + sysfs_attr_init(&psr_attrs[i].attr.attr); + if (of_property_read_string(node, "label", + &psr_attrs[i].attr.attr.name)) + goto out_kobj; + psr_attrs[i].attr.attr.mode = 0664; + psr_attrs[i].attr.show = psr_show; + psr_attrs[i].attr.store = psr_store; + if (sysfs_create_file(psr_kobj, &psr_attrs[i].attr.attr)) { + pr_devel("Failed to create psr sysfs file %s\n", + psr_attrs[i].attr.attr.name); + goto out_kobj; + } + i++; + } + + return; +out_kobj: + kobject_put(psr_kobj); +out: + kfree(psr_attrs); +} diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c new file mode 100644 index 000000000000..7e5a235ebf76 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -0,0 +1,212 @@ +/* + * PowerNV OPAL Sensor-groups interface + * + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal-sensor-groups: " fmt + +#include <linux/of.h> +#include <linux/kobject.h> +#include <linux/slab.h> + +#include <asm/opal.h> + +DEFINE_MUTEX(sg_mutex); + +static struct kobject *sg_kobj; + +struct sg_attr { + u32 handle; + struct kobj_attribute attr; +}; + +static struct sensor_group { + char name[20]; + struct attribute_group sg; + struct sg_attr *sgattrs; +} *sgs; + +static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct sg_attr *sattr = container_of(attr, struct sg_attr, attr); + struct opal_msg msg; + u32 data; + int ret, token; + + ret = kstrtoint(buf, 0, &data); + if (ret) + return ret; + + if (data != 1) + return -EINVAL; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_devel("Failed to get token\n"); + return token; + } + + ret = mutex_lock_interruptible(&sg_mutex); + if (ret) + goto out_token; + + ret = opal_sensor_group_clear(sattr->handle, token); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + if (!ret) + ret = count; + break; + case OPAL_SUCCESS: + ret = count; + break; + default: + ret = opal_error_code(ret); + } + +out: + mutex_unlock(&sg_mutex); +out_token: + opal_async_release_token(token); + return ret; +} + +static struct sg_ops_info { + int opal_no; + const char *attr_name; + ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count); +} ops_info[] = { + { OPAL_SENSOR_GROUP_CLEAR, "clear", sg_store }, +}; + +static void add_attr(int handle, struct sg_attr *attr, int index) +{ + attr->handle = handle; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = ops_info[index].attr_name; + attr->attr.attr.mode = 0220; + attr->attr.store = ops_info[index].store; +} + +static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, + u32 handle) +{ + int i, j; + int count = 0; + + for (i = 0; i < len; i++) + for (j = 0; j < ARRAY_SIZE(ops_info); j++) + if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) { + add_attr(handle, &sg->sgattrs[count], j); + sg->sg.attrs[count] = + &sg->sgattrs[count].attr.attr; + count++; + } + + return sysfs_create_group(sg_kobj, &sg->sg); +} + +static int get_nr_attrs(const __be32 *ops, int len) +{ + int i, j; + int nr_attrs = 0; + + for (i = 0; i < len; i++) + for (j = 0; j < ARRAY_SIZE(ops_info); j++) + if (be32_to_cpu(ops[i]) == ops_info[j].opal_no) + nr_attrs++; + + return nr_attrs; +} + +void __init opal_sensor_groups_init(void) +{ + struct device_node *sg, *node; + int i = 0; + + sg = of_find_compatible_node(NULL, NULL, "ibm,opal-sensor-group"); + if (!sg) { + pr_devel("Sensor groups node not found\n"); + return; + } + + sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL); + if (!sgs) + return; + + sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj); + if (!sg_kobj) { + pr_warn("Failed to create sensor group kobject\n"); + goto out_sgs; + } + + for_each_child_of_node(sg, node) { + const __be32 *ops; + u32 sgid, len, nr_attrs, chipid; + + ops = of_get_property(node, "ops", &len); + if (!ops) + continue; + + nr_attrs = get_nr_attrs(ops, len); + if (!nr_attrs) + continue; + + sgs[i].sgattrs = kcalloc(nr_attrs, sizeof(struct sg_attr), + GFP_KERNEL); + if (!sgs[i].sgattrs) + goto out_sgs_sgattrs; + + sgs[i].sg.attrs = kcalloc(nr_attrs + 1, + sizeof(struct attribute *), + GFP_KERNEL); + + if (!sgs[i].sg.attrs) { + kfree(sgs[i].sgattrs); + goto out_sgs_sgattrs; + } + + if (of_property_read_u32(node, "sensor-group-id", &sgid)) { + pr_warn("sensor-group-id property not found\n"); + goto out_sgs_sgattrs; + } + + if (!of_property_read_u32(node, "ibm,chip-id", &chipid)) + sprintf(sgs[i].name, "%s%d", node->name, chipid); + else + sprintf(sgs[i].name, "%s", node->name); + + sgs[i].sg.name = sgs[i].name; + if (add_attr_group(ops, len, &sgs[i], sgid)) { + pr_warn("Failed to create sensor attribute group %s\n", + sgs[i].sg.name); + goto out_sgs_sgattrs; + } + i++; + } + + return; + +out_sgs_sgattrs: + while (--i >= 0) { + kfree(sgs[i].sgattrs); + kfree(sgs[i].sg.attrs); + } + kobject_put(sg_kobj); +out_sgs: + kfree(sgs); +} diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index 3c447002edff..1ab7d26c0a2c 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/percpu.h> #include <linux/jump_label.h> #include <asm/trace.h> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 4ca6c26a56d5..8c1ede2d3f7e 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -27,7 +27,7 @@ .globl opal_tracepoint_refcount opal_tracepoint_refcount: - .llong 0 + .8byte 0 .section ".text" @@ -310,3 +310,12 @@ OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 28651fb25417..81c0a943dea9 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -36,14 +36,14 @@ static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) const __be32 *gcid; if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %s is not a SCOM controller\n", - __func__, dev->full_name); + pr_err("%s: device %pOF is not a SCOM controller\n", + __func__, dev); return SCOM_MAP_INVALID; } gcid = of_get_property(dev, "ibm,chip-id", NULL); if (!gcid) { - pr_err("%s: device %s has no ibm,chip-id\n", - __func__, dev->full_name); + pr_err("%s: device %pOF has no ibm,chip-id\n", + __func__, dev); return SCOM_MAP_INVALID; } m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index cad6b57ce494..65c79ecf5a4d 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -16,6 +16,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_platform.h> +#include <linux/of_address.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/slab.h> @@ -25,11 +26,17 @@ #include <linux/memblock.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/printk.h> +#include <linux/kmsg_dump.h> +#include <linux/console.h> +#include <linux/sched/debug.h> #include <asm/machdep.h> #include <asm/opal.h> #include <asm/firmware.h> #include <asm/mce.h> +#include <asm/imc-pmu.h> +#include <asm/bug.h> #include "powernv.h" @@ -162,12 +169,9 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, sizeof(struct mcheck_recoverable_range); /* - * Allocate a buffer to hold the MC recoverable ranges. We would be - * accessing them in real mode, hence it needs to be within - * RMO region. + * Allocate a buffer to hold the MC recoverable ranges. */ - mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), - ppc64_rma_size)); + mc_recoverable_range =__va(memblock_alloc(size, __alignof__(u64))); memset(mc_recoverable_range, 0, size); for (i = 0; i < mc_recoverable_range_len; i++) { @@ -422,24 +426,88 @@ static int opal_recover_mce(struct pt_regs *regs, /* Fatal machine check */ pr_err("Machine check interrupt is fatal\n"); recovered = 0; - } else if ((evt->severity == MCE_SEV_ERROR_SYNC) && - (user_mode(regs) && !is_global_init(current))) { + } + + if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) { /* - * For now, kill the task if we have received exception when - * in userspace. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. * * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } + return recovered; } +void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) +{ + /* + * This is mostly taken from kernel/panic.c, but tries to do + * relatively minimal work. Don't use delay functions (TB may + * be broken), don't crash dump (need to set a firmware log), + * don't run notifiers. We do want to get some information to + * Linux console. + */ + console_verbose(); + bust_spinlocks(1); + pr_emerg("Hardware platform error: %s\n", msg); + if (regs) + show_regs(regs); + smp_send_stop(); + printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); + console_flush_on_panic(); + + /* + * Don't bother to shut things down because this will + * xstop the system. + */ + if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg) + == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported for %s\n", + OPAL_REBOOT_PLATFORM_ERROR, msg); + } + + /* + * We reached here. There can be three possibilities: + * 1. We are running on a firmware level that do not support + * opal_cec_reboot2() + * 2. We are running on a firmware level that do not support + * OPAL_REBOOT_PLATFORM_ERROR reboot type. + * 3. We are running on FSP based system that does not need + * opal to trigger checkstop explicitly for error analysis. + * The FSP PRD component would have already got notified + * about this error through other channels. + */ + + ppc_md.restart(NULL); +} + int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; - int ret; if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return 0; @@ -455,43 +523,7 @@ int opal_machine_check(struct pt_regs *regs) if (opal_recover_mce(regs, &evt)) return 1; - /* - * Unrecovered machine check, we are heading to panic path. - * - * We may have hit this MCE in very early stage of kernel - * initialization even before opal-prd has started running. If - * this is the case then this MCE error may go un-noticed or - * un-analyzed if we go down panic path. We need to inform - * BMC/OCC about this error so that they can collect relevant - * data for error analysis before rebooting. - * Use opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR) to do so. - * This function may not return on BMC based system. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable Machine Check exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * We reached here. There can be three possibilities: - * 1. We are running on a firmware level that do not support - * opal_cec_reboot2() - * 2. We are running on a firmware level that do not support - * OPAL_REBOOT_PLATFORM_ERROR reboot type. - * 3. We are running on FSP based system that does not need opal - * to trigger checkstop explicitly for error analysis. The FSP - * PRD component would have already got notified about this - * error through other channels. - * - * If hardware marked this as an unrecoverable MCE, we are - * going to panic anyway. Even if it didn't, it's not safe to - * continue at this point, so we should explicitly panic. - */ - - panic("PowerNV Unrecovered Machine Check"); - return 0; + pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception"); } /* Early hmi handler called in real mode. */ @@ -720,6 +752,15 @@ static void opal_pdev_init(const char *compatible) of_platform_device_create(np, NULL, NULL); } +static void __init opal_imc_init_dev(void) +{ + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT); + if (np) + of_platform_device_create(np, NULL, NULL); +} + static int kopald(void *unused) { unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1; @@ -793,6 +834,9 @@ static int __init opal_init(void) /* Setup a heatbeat thread if requested by OPAL */ opal_init_heartbeat(); + /* Detect In-Memory Collection counters and create devices*/ + opal_imc_init_dev(); + /* Create leds platform devices */ leds = of_find_node_by_path("/ibm,opal/leds"); if (leds) { @@ -836,6 +880,15 @@ static int __init opal_init(void) /* Initialise OPAL kmsg dumper for flushing console on panic */ opal_kmsg_init(); + /* Initialise OPAL powercap interface */ + opal_powercap_init(); + + /* Initialise OPAL Power-Shifting-Ratio interface */ + opal_psr_init(); + + /* Initialise OPAL sensor groups */ + opal_sensor_groups_init(); + return 0; } machine_subsys_initcall(powernv, opal_init); @@ -952,6 +1005,7 @@ int opal_error_code(int rc) case OPAL_UNSUPPORTED: return -EIO; case OPAL_HARDWARE: return -EIO; case OPAL_INTERNAL_ERROR: return -EIO; + case OPAL_TIMEOUT: return -ETIMEDOUT; default: pr_err("%s: unexpected OPAL error %d\n", __func__, rc); return -EIO; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b900eb1d5e17..57f9e55f4352 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -444,8 +444,8 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) r = of_get_property(dn, "ibm,opal-m64-window", NULL); if (!r) { - pr_info(" No <ibm,opal-m64-window> on %s\n", - dn->full_name); + pr_info(" No <ibm,opal-m64-window> on %pOF\n", + dn); return; } @@ -1408,7 +1408,6 @@ m64_failed: static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num); -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) { @@ -2402,7 +2401,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, return 0; } -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) +void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -3797,8 +3796,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!of_device_is_available(np)) return; - pr_info("Initializing %s PHB (%s)\n", - pnv_phb_names[ioda_type], of_node_full_name(np)); + pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np); prop64 = of_get_property(np, "ibm,opal-phbid", NULL); if (!prop64) { @@ -3813,8 +3811,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); if (!phb->hose) { - pr_err(" Can't allocate PCI controller for %s\n", - np->full_name); + pr_err(" Can't allocate PCI controller for %pOF\n", + np); memblock_free(__pa(phb), sizeof(struct pnv_phb)); return; } @@ -3825,7 +3823,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, hose->first_busno = be32_to_cpu(prop32[0]); hose->last_busno = be32_to_cpu(prop32[1]); } else { - pr_warn(" Broken <bus-range> on %s\n", np->full_name); + pr_warn(" Broken <bus-range> on %pOF\n", np); hose->first_busno = 0; hose->last_busno = 0xff; } @@ -4046,7 +4044,7 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np) const __be64 *prop64; u64 hub_id; - pr_info("Probing IODA IO-Hub %s\n", np->full_name); + pr_info("Probing IODA IO-Hub %pOF\n", np); prop64 = of_get_property(np, "ibm,opal-hubid", NULL); if (!prop64) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 7905d179d036..5422f4a6317c 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -37,6 +37,8 @@ #include "powernv.h" #include "pci.h" +static DEFINE_MUTEX(p2p_mutex); + int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) { struct device_node *parent = np; @@ -1017,6 +1019,79 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus) } } +int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc) +{ + struct pci_controller *hose; + struct pnv_phb *phb_init, *phb_target; + struct pnv_ioda_pe *pe_init; + int rc; + + if (!opal_check_token(OPAL_PCI_SET_P2P)) + return -ENXIO; + + hose = pci_bus_to_host(initiator->bus); + phb_init = hose->private_data; + + hose = pci_bus_to_host(target->bus); + phb_target = hose->private_data; + + pe_init = pnv_ioda_get_pe(initiator); + if (!pe_init) + return -ENODEV; + + /* + * Configuring the initiator's PHB requires to adjust its + * TVE#1 setting. Since the same device can be an initiator + * several times for different target devices, we need to keep + * a reference count to know when we can restore the default + * bypass setting on its TVE#1 when disabling. Opal is not + * tracking PE states, so we add a reference count on the PE + * in linux. + * + * For the target, the configuration is per PHB, so we keep a + * target reference count on the PHB. + */ + mutex_lock(&p2p_mutex); + + if (desc & OPAL_PCI_P2P_ENABLE) { + /* always go to opal to validate the configuration */ + rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, + desc, pe_init->pe_number); + + if (rc != OPAL_SUCCESS) { + rc = -EIO; + goto out; + } + + pe_init->p2p_initiator_count++; + phb_target->p2p_target_count++; + } else { + if (!pe_init->p2p_initiator_count || + !phb_target->p2p_target_count) { + rc = -EINVAL; + goto out; + } + + if (--pe_init->p2p_initiator_count == 0) + pnv_pci_ioda2_set_bypass(pe_init, true); + + if (--phb_target->p2p_target_count == 0) { + rc = opal_pci_set_p2p(phb_init->opal_id, + phb_target->opal_id, desc, + pe_init->pe_number); + if (rc != OPAL_SUCCESS) { + rc = -EIO; + goto out; + } + } + } + rc = 0; +out: + mutex_unlock(&p2p_mutex); + return rc; +} +EXPORT_SYMBOL_GPL(pnv_pci_set_p2p); + void pnv_pci_shutdown(void) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f16bc403ec03..b47f9406d97e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __POWERNV_PCI_H #define __POWERNV_PCI_H @@ -78,6 +79,9 @@ struct pnv_ioda_pe { struct pnv_ioda_pe *master; struct list_head slaves; + /* PCI peer-to-peer*/ + int p2p_initiator_count; + /* Link in list of PE#s */ struct list_head list; }; @@ -189,6 +193,7 @@ struct pnv_phb { #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif + int p2p_target_count; }; extern struct pci_ops pnv_pci_ops; @@ -229,6 +234,7 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); +extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 6dbc0a1da1f6..94f17ab1374b 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _POWERNV_H #define _POWERNV_H @@ -7,6 +8,8 @@ extern void pnv_smp_init(void); static inline void pnv_smp_init(void) { } #endif +extern void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) __noreturn; + struct pci_dev; #ifdef CONFIG_PCI diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 1a9d84371a4d..718f50ed22f1 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -16,11 +16,13 @@ #include <linux/slab.h> #include <linux/smp.h> #include <asm/archrandom.h> +#include <asm/cputable.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/machdep.h> #include <asm/smp.h> +#define DARN_ERR 0xFFFFFFFFFFFFFFFFul struct powernv_rng { void __iomem *regs; @@ -67,6 +69,41 @@ int powernv_get_random_real_mode(unsigned long *v) return 1; } +int powernv_get_random_darn(unsigned long *v) +{ + unsigned long val; + + /* Using DARN with L=1 - 64-bit conditioned random number */ + asm volatile(PPC_DARN(%0, 1) : "=r"(val)); + + if (val == DARN_ERR) + return 0; + + *v = val; + + return 1; +} + +static int initialise_darn(void) +{ + unsigned long val; + int i; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + for (i = 0; i < 10; i++) { + if (powernv_get_random_darn(&val)) { + ppc_md.get_random_seed = powernv_get_random_darn; + return 0; + } + } + + pr_warn("Unable to use DARN for get_random_seed()\n"); + + return -EIO; +} + int powernv_get_random_long(unsigned long *v) { struct powernv_rng *rng; @@ -88,7 +125,7 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, chip_id = of_get_ibm_chip_id(dn); if (chip_id == -1) - pr_warn("No ibm,chip-id found for %s.\n", dn->full_name); + pr_warn("No ibm,chip-id found for %pOF.\n", dn); for_each_possible_cpu(cpu) { if (per_cpu(powernv_rng, cpu) == NULL || @@ -141,8 +178,8 @@ static __init int rng_init(void) for_each_compatible_node(dn, NULL, "ibm,power-rng") { rc = rng_create(dn); if (rc) { - pr_err("Failed creating rng for %s (%d).\n", - dn->full_name, rc); + pr_err("Failed creating rng for %pOF (%d).\n", + dn, rc); continue; } @@ -150,6 +187,8 @@ static __init int rng_init(void) of_platform_device_create(dn, NULL, NULL); } + initialise_darn(); + return 0; } machine_subsys_initcall(powernv, rng_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..bbb73aa0eb8f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return 1UL * 1024 * 1024 * 1024; + else + return 256UL * 1024 * 1024; } #endif diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 40dae96f7e20..c17f81e433f7 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -57,7 +57,7 @@ static void pnv_smp_setup_cpu(int cpu) static int pnv_smp_kick_cpu(int nr) { - unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned int pcpu; unsigned long start_here = __pa(ppc_function_entry(generic_secondary_smp_init)); long rc; @@ -66,6 +66,7 @@ static int pnv_smp_kick_cpu(int nr) if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; + pcpu = get_hard_smp_processor_id(nr); /* * If we already started or OPAL is not supported, we just * kick the CPU via the PACA @@ -164,12 +165,6 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - /* We don't want to take decrementer interrupts while we are offline, - * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) - * enabled as to let IPIs in. - */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -219,8 +214,6 @@ static void pnv_smp_cpu_kill_self(void) } - /* Re-enable decrementer interrupts */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); DBG("CPU%d coming online...\n", cpu); } diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c new file mode 100644 index 000000000000..5aae845b8cd9 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -0,0 +1,1134 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/log2.h> +#include <linux/rcupdate.h> +#include <linux/cred.h> + +#include "vas.h" +#include "copy-paste.h" + +/* + * Compute the paste address region for the window @window using the + * ->paste_base_addr and ->paste_win_id_shift we got from device tree. + */ +static void compute_paste_address(struct vas_window *window, u64 *addr, int *len) +{ + int winid; + u64 base, shift; + + base = window->vinst->paste_base_addr; + shift = window->vinst->paste_win_id_shift; + winid = window->winid; + + *addr = base + (winid << shift); + if (len) + *len = PAGE_SIZE; + + pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); +} + +static inline void get_hvwc_mmio_bar(struct vas_window *window, + u64 *start, int *len) +{ + u64 pbaddr; + + pbaddr = window->vinst->hvwc_bar_start; + *start = pbaddr + window->winid * VAS_HVWC_SIZE; + *len = VAS_HVWC_SIZE; +} + +static inline void get_uwc_mmio_bar(struct vas_window *window, + u64 *start, int *len) +{ + u64 pbaddr; + + pbaddr = window->vinst->uwc_bar_start; + *start = pbaddr + window->winid * VAS_UWC_SIZE; + *len = VAS_UWC_SIZE; +} + +/* + * Map the paste bus address of the given send window into kernel address + * space. Unlike MMIO regions (map_mmio_region() below), paste region must + * be mapped cache-able and is only applicable to send windows. + */ +static void *map_paste_region(struct vas_window *txwin) +{ + int len; + void *map; + char *name; + u64 start; + + name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id, + txwin->winid); + if (!name) + goto free_name; + + txwin->paste_addr_name = name; + compute_paste_address(txwin, &start, &len); + + if (!request_mem_region(start, len, name)) { + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", + __func__, start, len); + goto free_name; + } + + map = ioremap_cache(start, len); + if (!map) { + pr_devel("%s(): ioremap_cache(0x%llx, %d) failed\n", __func__, + start, len); + goto free_name; + } + + pr_devel("Mapped paste addr 0x%llx to kaddr 0x%p\n", start, map); + return map; + +free_name: + kfree(name); + return ERR_PTR(-ENOMEM); +} + +static void *map_mmio_region(char *name, u64 start, int len) +{ + void *map; + + if (!request_mem_region(start, len, name)) { + pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", + __func__, start, len); + return NULL; + } + + map = ioremap(start, len); + if (!map) { + pr_devel("%s(): ioremap(0x%llx, %d) failed\n", __func__, start, + len); + return NULL; + } + + return map; +} + +static void unmap_region(void *addr, u64 start, int len) +{ + iounmap(addr); + release_mem_region((phys_addr_t)start, len); +} + +/* + * Unmap the paste address region for a window. + */ +static void unmap_paste_region(struct vas_window *window) +{ + int len; + u64 busaddr_start; + + if (window->paste_kaddr) { + compute_paste_address(window, &busaddr_start, &len); + unmap_region(window->paste_kaddr, busaddr_start, len); + window->paste_kaddr = NULL; + kfree(window->paste_addr_name); + window->paste_addr_name = NULL; + } +} + +/* + * Unmap the MMIO regions for a window. + */ +static void unmap_winctx_mmio_bars(struct vas_window *window) +{ + int len; + u64 busaddr_start; + + if (window->hvwc_map) { + get_hvwc_mmio_bar(window, &busaddr_start, &len); + unmap_region(window->hvwc_map, busaddr_start, len); + window->hvwc_map = NULL; + } + + if (window->uwc_map) { + get_uwc_mmio_bar(window, &busaddr_start, &len); + unmap_region(window->uwc_map, busaddr_start, len); + window->uwc_map = NULL; + } +} + +/* + * Find the Hypervisor Window Context (HVWC) MMIO Base Address Region and the + * OS/User Window Context (UWC) MMIO Base Address Region for the given window. + * Map these bus addresses and save the mapped kernel addresses in @window. + */ +int map_winctx_mmio_bars(struct vas_window *window) +{ + int len; + u64 start; + + get_hvwc_mmio_bar(window, &start, &len); + window->hvwc_map = map_mmio_region("HVWCM_Window", start, len); + + get_uwc_mmio_bar(window, &start, &len); + window->uwc_map = map_mmio_region("UWCM_Window", start, len); + + if (!window->hvwc_map || !window->uwc_map) { + unmap_winctx_mmio_bars(window); + return -1; + } + + return 0; +} + +/* + * Reset all valid registers in the HV and OS/User Window Contexts for + * the window identified by @window. + * + * NOTE: We cannot really use a for loop to reset window context. Not all + * offsets in a window context are valid registers and the valid + * registers are not sequential. And, we can only write to offsets + * with valid registers. + */ +void reset_window_regs(struct vas_window *window) +{ + write_hvwc_reg(window, VREG(LPID), 0ULL); + write_hvwc_reg(window, VREG(PID), 0ULL); + write_hvwc_reg(window, VREG(XLATE_MSR), 0ULL); + write_hvwc_reg(window, VREG(XLATE_LPCR), 0ULL); + write_hvwc_reg(window, VREG(XLATE_CTL), 0ULL); + write_hvwc_reg(window, VREG(AMR), 0ULL); + write_hvwc_reg(window, VREG(SEIDR), 0ULL); + write_hvwc_reg(window, VREG(FAULT_TX_WIN), 0ULL); + write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL); + write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), 0ULL); + write_hvwc_reg(window, VREG(PSWID), 0ULL); + write_hvwc_reg(window, VREG(LFIFO_BAR), 0ULL); + write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), 0ULL); + write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), 0ULL); + write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL); + write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LRX_WCRED), 0ULL); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(LFIFO_SIZE), 0ULL); + write_hvwc_reg(window, VREG(WINCTL), 0ULL); + write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), 0ULL); + write_hvwc_reg(window, VREG(TX_RSVD_BUF_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_CTL), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_PID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_LPID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_TID), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), 0ULL); + write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL); + + /* Skip read-only registers: NX_UTIL and NX_UTIL_SE */ + + /* + * The send and receive window credit adder registers are also + * accessible from HVWC and have been initialized above. We don't + * need to initialize from the OS/User Window Context, so skip + * following calls: + * + * write_uwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + * write_uwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + */ +} + +/* + * Initialize window context registers related to Address Translation. + * These registers are common to send/receive windows although they + * differ for user/kernel windows. As we resolve the TODOs we may + * want to add fields to vas_winctx and move the initialization to + * init_vas_winctx_regs(). + */ +static void init_xlate_regs(struct vas_window *window, bool user_win) +{ + u64 lpcr, val; + + /* + * MSR_TA, MSR_US are false for both kernel and user. + * MSR_DR and MSR_PR are false for kernel. + */ + val = 0ULL; + val = SET_FIELD(VAS_XLATE_MSR_HV, val, 1); + val = SET_FIELD(VAS_XLATE_MSR_SF, val, 1); + if (user_win) { + val = SET_FIELD(VAS_XLATE_MSR_DR, val, 1); + val = SET_FIELD(VAS_XLATE_MSR_PR, val, 1); + } + write_hvwc_reg(window, VREG(XLATE_MSR), val); + + lpcr = mfspr(SPRN_LPCR); + val = 0ULL; + /* + * NOTE: From Section 5.7.8.1 Segment Lookaside Buffer of the + * Power ISA, v3.0B, Page size encoding is 0 = 4KB, 5 = 64KB. + * + * NOTE: From Section 1.3.1, Address Translation Context of the + * Nest MMU Workbook, LPCR_SC should be 0 for Power9. + */ + val = SET_FIELD(VAS_XLATE_LPCR_PAGE_SIZE, val, 5); + val = SET_FIELD(VAS_XLATE_LPCR_ISL, val, lpcr & LPCR_ISL); + val = SET_FIELD(VAS_XLATE_LPCR_TC, val, lpcr & LPCR_TC); + val = SET_FIELD(VAS_XLATE_LPCR_SC, val, 0); + write_hvwc_reg(window, VREG(XLATE_LPCR), val); + + /* + * Section 1.3.1 (Address translation Context) of NMMU workbook. + * 0b00 Hashed Page Table mode + * 0b01 Reserved + * 0b10 Radix on HPT + * 0b11 Radix on Radix + */ + val = 0ULL; + val = SET_FIELD(VAS_XLATE_MODE, val, radix_enabled() ? 3 : 2); + write_hvwc_reg(window, VREG(XLATE_CTL), val); + + /* + * TODO: Can we mfspr(AMR) even for user windows? + */ + val = 0ULL; + val = SET_FIELD(VAS_AMR, val, mfspr(SPRN_AMR)); + write_hvwc_reg(window, VREG(AMR), val); + + val = 0ULL; + val = SET_FIELD(VAS_SEIDR, val, 0); + write_hvwc_reg(window, VREG(SEIDR), val); +} + +/* + * Initialize Reserved Send Buffer Count for the send window. It involves + * writing to the register, reading it back to confirm that the hardware + * has enough buffers to reserve. See section 1.3.1.2.1 of VAS workbook. + * + * Since we can only make a best-effort attempt to fulfill the request, + * we don't return any errors if we cannot. + * + * TODO: Reserved (aka dedicated) send buffers are not supported yet. + */ +static void init_rsvd_tx_buf_count(struct vas_window *txwin, + struct vas_winctx *winctx) +{ + write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL); +} + +/* + * init_winctx_regs() + * Initialize window context registers for a receive window. + * Except for caching control and marking window open, the registers + * are initialized in the order listed in Section 3.1.4 (Window Context + * Cache Register Details) of the VAS workbook although they don't need + * to be. + * + * Design note: For NX receive windows, NX allocates the FIFO buffer in OPAL + * (so that it can get a large contiguous area) and passes that buffer + * to kernel via device tree. We now write that buffer address to the + * FIFO BAR. Would it make sense to do this all in OPAL? i.e have OPAL + * write the per-chip RX FIFO addresses to the windows during boot-up + * as a one-time task? That could work for NX but what about other + * receivers? Let the receivers tell us the rx-fifo buffers for now. + */ +int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) +{ + u64 val; + int fifo_size; + + reset_window_regs(window); + + val = 0ULL; + val = SET_FIELD(VAS_LPID, val, winctx->lpid); + write_hvwc_reg(window, VREG(LPID), val); + + val = 0ULL; + val = SET_FIELD(VAS_PID_ID, val, winctx->pidr); + write_hvwc_reg(window, VREG(PID), val); + + init_xlate_regs(window, winctx->user_win); + + val = 0ULL; + val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0); + write_hvwc_reg(window, VREG(FAULT_TX_WIN), val); + + /* In PowerNV, interrupts go to HV. */ + write_hvwc_reg(window, VREG(OSU_INTR_SRC_RA), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_HV_INTR_SRC_RA, val, winctx->irq_port); + write_hvwc_reg(window, VREG(HV_INTR_SRC_RA), val); + + val = 0ULL; + val = SET_FIELD(VAS_PSWID_EA_HANDLE, val, winctx->pswid); + write_hvwc_reg(window, VREG(PSWID), val); + + write_hvwc_reg(window, VREG(SPARE1), 0ULL); + write_hvwc_reg(window, VREG(SPARE2), 0ULL); + write_hvwc_reg(window, VREG(SPARE3), 0ULL); + + /* + * NOTE: VAS expects the FIFO address to be copied into the LFIFO_BAR + * register as is - do NOT shift the address into VAS_LFIFO_BAR + * bit fields! Ok to set the page migration select fields - + * VAS ignores the lower 10+ bits in the address anyway, because + * the minimum FIFO size is 1K? + * + * See also: Design note in function header. + */ + val = __pa(winctx->rx_fifo); + val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0); + write_hvwc_reg(window, VREG(LFIFO_BAR), val); + + val = 0ULL; + val = SET_FIELD(VAS_LDATA_STAMP, val, winctx->data_stamp); + write_hvwc_reg(window, VREG(LDATA_STAMP_CTL), val); + + val = 0ULL; + val = SET_FIELD(VAS_LDMA_TYPE, val, winctx->dma_type); + val = SET_FIELD(VAS_LDMA_FIFO_DISABLE, val, winctx->fifo_disable); + write_hvwc_reg(window, VREG(LDMA_CACHE_CTL), val); + + write_hvwc_reg(window, VREG(LRFIFO_PUSH), 0ULL); + write_hvwc_reg(window, VREG(CURR_MSG_COUNT), 0ULL); + write_hvwc_reg(window, VREG(LNOTIFY_AFTER_COUNT), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_LRX_WCRED, val, winctx->wcreds_max); + write_hvwc_reg(window, VREG(LRX_WCRED), val); + + val = 0ULL; + val = SET_FIELD(VAS_TX_WCRED, val, winctx->wcreds_max); + write_hvwc_reg(window, VREG(TX_WCRED), val); + + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), 0ULL); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), 0ULL); + + fifo_size = winctx->rx_fifo_size / 1024; + + val = 0ULL; + val = SET_FIELD(VAS_LFIFO_SIZE, val, ilog2(fifo_size)); + write_hvwc_reg(window, VREG(LFIFO_SIZE), val); + + /* Update window control and caching control registers last so + * we mark the window open only after fully initializing it and + * pushing context to cache. + */ + + write_hvwc_reg(window, VREG(WIN_STATUS), 0ULL); + + init_rsvd_tx_buf_count(window, winctx); + + /* for a send window, point to the matching receive window */ + val = 0ULL; + val = SET_FIELD(VAS_LRX_WIN_ID, val, winctx->rx_win_id); + write_hvwc_reg(window, VREG(LRFIFO_WIN_PTR), val); + + write_hvwc_reg(window, VREG(SPARE4), 0ULL); + + val = 0ULL; + val = SET_FIELD(VAS_NOTIFY_DISABLE, val, winctx->notify_disable); + val = SET_FIELD(VAS_INTR_DISABLE, val, winctx->intr_disable); + val = SET_FIELD(VAS_NOTIFY_EARLY, val, winctx->notify_early); + val = SET_FIELD(VAS_NOTIFY_OSU_INTR, val, winctx->notify_os_intr_reg); + write_hvwc_reg(window, VREG(LNOTIFY_CTL), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_PID, val, winctx->lnotify_pid); + write_hvwc_reg(window, VREG(LNOTIFY_PID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_LPID, val, winctx->lnotify_lpid); + write_hvwc_reg(window, VREG(LNOTIFY_LPID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_TID, val, winctx->lnotify_tid); + write_hvwc_reg(window, VREG(LNOTIFY_TID), val); + + val = 0ULL; + val = SET_FIELD(VAS_LNOTIFY_MIN_SCOPE, val, winctx->min_scope); + val = SET_FIELD(VAS_LNOTIFY_MAX_SCOPE, val, winctx->max_scope); + write_hvwc_reg(window, VREG(LNOTIFY_SCOPE), val); + + /* Skip read-only registers NX_UTIL and NX_UTIL_SE */ + + write_hvwc_reg(window, VREG(SPARE5), 0ULL); + write_hvwc_reg(window, VREG(NX_UTIL_ADDER), 0ULL); + write_hvwc_reg(window, VREG(SPARE6), 0ULL); + + /* Finally, push window context to memory and... */ + val = 0ULL; + val = SET_FIELD(VAS_PUSH_TO_MEM, val, 1); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); + + /* ... mark the window open for business */ + val = 0ULL; + val = SET_FIELD(VAS_WINCTL_REJ_NO_CREDIT, val, winctx->rej_no_credit); + val = SET_FIELD(VAS_WINCTL_PIN, val, winctx->pin_win); + val = SET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val, winctx->tx_wcred_mode); + val = SET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val, winctx->rx_wcred_mode); + val = SET_FIELD(VAS_WINCTL_TX_WORD_MODE, val, winctx->tx_word_mode); + val = SET_FIELD(VAS_WINCTL_RX_WORD_MODE, val, winctx->rx_word_mode); + val = SET_FIELD(VAS_WINCTL_FAULT_WIN, val, winctx->fault_win); + val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 1); + write_hvwc_reg(window, VREG(WINCTL), val); + + return 0; +} + +static DEFINE_SPINLOCK(vas_ida_lock); + +static void vas_release_window_id(struct ida *ida, int winid) +{ + spin_lock(&vas_ida_lock); + ida_remove(ida, winid); + spin_unlock(&vas_ida_lock); +} + +static int vas_assign_window_id(struct ida *ida) +{ + int rc, winid; + + do { + rc = ida_pre_get(ida, GFP_KERNEL); + if (!rc) + return -EAGAIN; + + spin_lock(&vas_ida_lock); + rc = ida_get_new(ida, &winid); + spin_unlock(&vas_ida_lock); + } while (rc == -EAGAIN); + + if (rc) + return rc; + + if (winid > VAS_WINDOWS_PER_CHIP) { + pr_err("Too many (%d) open windows\n", winid); + vas_release_window_id(ida, winid); + return -EAGAIN; + } + + return winid; +} + +static void vas_window_free(struct vas_window *window) +{ + int winid = window->winid; + struct vas_instance *vinst = window->vinst; + + unmap_winctx_mmio_bars(window); + kfree(window); + + vas_release_window_id(&vinst->ida, winid); +} + +static struct vas_window *vas_window_alloc(struct vas_instance *vinst) +{ + int winid; + struct vas_window *window; + + winid = vas_assign_window_id(&vinst->ida); + if (winid < 0) + return ERR_PTR(winid); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_free; + + window->vinst = vinst; + window->winid = winid; + + if (map_winctx_mmio_bars(window)) + goto out_free; + + return window; + +out_free: + kfree(window); + vas_release_window_id(&vinst->ida, winid); + return ERR_PTR(-ENOMEM); +} + +static void put_rx_win(struct vas_window *rxwin) +{ + /* Better not be a send window! */ + WARN_ON_ONCE(rxwin->tx_win); + + atomic_dec(&rxwin->num_txwins); +} + +/* + * Get the VAS receive window associated with NX engine identified + * by @cop and if applicable, @pswid. + * + * See also function header of set_vinst_win(). + */ +static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, + enum vas_cop_type cop, u32 pswid) +{ + struct vas_window *rxwin; + + mutex_lock(&vinst->mutex); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + else + rxwin = ERR_PTR(-EINVAL); + + if (!IS_ERR(rxwin)) + atomic_inc(&rxwin->num_txwins); + + mutex_unlock(&vinst->mutex); + + return rxwin; +} + +/* + * We have two tables of windows in a VAS instance. The first one, + * ->windows[], contains all the windows in the instance and allows + * looking up a window by its id. It is used to look up send windows + * during fault handling and receive windows when pairing user space + * send/receive windows. + * + * The second table, ->rxwin[], contains receive windows that are + * associated with NX engines. This table has VAS_COP_TYPE_MAX + * entries and is used to look up a receive window by its + * coprocessor type. + * + * Here, we save @window in the ->windows[] table. If it is a receive + * window, we also save the window in the ->rxwin[] table. + */ +static void set_vinst_win(struct vas_instance *vinst, + struct vas_window *window) +{ + int id = window->winid; + + mutex_lock(&vinst->mutex); + + /* + * There should only be one receive window for a coprocessor type + * unless its a user (FTW) window. + */ + if (!window->user_win && !window->tx_win) { + WARN_ON_ONCE(vinst->rxwin[window->cop]); + vinst->rxwin[window->cop] = window; + } + + WARN_ON_ONCE(vinst->windows[id] != NULL); + vinst->windows[id] = window; + + mutex_unlock(&vinst->mutex); +} + +/* + * Clear this window from the table(s) of windows for this VAS instance. + * See also function header of set_vinst_win(). + */ +static void clear_vinst_win(struct vas_window *window) +{ + int id = window->winid; + struct vas_instance *vinst = window->vinst; + + mutex_lock(&vinst->mutex); + + if (!window->user_win && !window->tx_win) { + WARN_ON_ONCE(!vinst->rxwin[window->cop]); + vinst->rxwin[window->cop] = NULL; + } + + WARN_ON_ONCE(vinst->windows[id] != window); + vinst->windows[id] = NULL; + + mutex_unlock(&vinst->mutex); +} + +static void init_winctx_for_rxwin(struct vas_window *rxwin, + struct vas_rx_win_attr *rxattr, + struct vas_winctx *winctx) +{ + /* + * We first zero (memset()) all fields and only set non-zero fields. + * Following fields are 0/false but maybe deserve a comment: + * + * ->notify_os_intr_reg In powerNV, send intrs to HV + * ->notify_disable False for NX windows + * ->intr_disable False for Fault Windows + * ->xtra_write False for NX windows + * ->notify_early NA for NX windows + * ->rsvd_txbuf_count NA for Rx windows + * ->lpid, ->pid, ->tid NA for Rx windows + */ + + memset(winctx, 0, sizeof(struct vas_winctx)); + + winctx->rx_fifo = rxattr->rx_fifo; + winctx->rx_fifo_size = rxattr->rx_fifo_size; + winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->pin_win = rxattr->pin_win; + + winctx->nx_win = rxattr->nx_win; + winctx->fault_win = rxattr->fault_win; + winctx->rx_word_mode = rxattr->rx_win_ord_mode; + winctx->tx_word_mode = rxattr->tx_win_ord_mode; + winctx->rx_wcred_mode = rxattr->rx_wcred_mode; + winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + + if (winctx->nx_win) { + winctx->data_stamp = true; + winctx->intr_disable = true; + winctx->pin_win = true; + + WARN_ON_ONCE(winctx->fault_win); + WARN_ON_ONCE(!winctx->rx_word_mode); + WARN_ON_ONCE(!winctx->tx_word_mode); + WARN_ON_ONCE(winctx->notify_after_count); + } else if (winctx->fault_win) { + winctx->notify_disable = true; + } else if (winctx->user_win) { + /* + * Section 1.8.1 Low Latency Core-Core Wake up of + * the VAS workbook: + * + * - disable credit checks ([tr]x_wcred_mode = false) + * - disable FIFO writes + * - enable ASB_Notify, disable interrupt + */ + winctx->fifo_disable = true; + winctx->intr_disable = true; + winctx->rx_fifo = NULL; + } + + winctx->lnotify_lpid = rxattr->lnotify_lpid; + winctx->lnotify_pid = rxattr->lnotify_pid; + winctx->lnotify_tid = rxattr->lnotify_tid; + winctx->pswid = rxattr->pswid; + winctx->dma_type = VAS_DMA_TYPE_INJECT; + winctx->tc_mode = rxattr->tc_mode; + + winctx->min_scope = VAS_SCOPE_LOCAL; + winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; +} + +static bool rx_win_args_valid(enum vas_cop_type cop, + struct vas_rx_win_attr *attr) +{ + dump_rx_win_attr(attr); + + if (cop >= VAS_COP_TYPE_MAX) + return false; + + if (cop != VAS_COP_TYPE_FTW && + attr->rx_fifo_size < VAS_RX_FIFO_SIZE_MIN) + return false; + + if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) + return false; + + if (attr->nx_win) { + /* cannot be fault or user window if it is nx */ + if (attr->fault_win || attr->user_win) + return false; + /* + * Section 3.1.4.32: NX Windows must not disable notification, + * and must not enable interrupts or early notification. + */ + if (attr->notify_disable || !attr->intr_disable || + attr->notify_early) + return false; + } else if (attr->fault_win) { + /* cannot be both fault and user window */ + if (attr->user_win) + return false; + + /* + * Section 3.1.4.32: Fault windows must disable notification + * but not interrupts. + */ + if (!attr->notify_disable || attr->intr_disable) + return false; + + } else if (attr->user_win) { + /* + * User receive windows are only for fast-thread-wakeup + * (FTW). They don't need a FIFO and must disable interrupts + */ + if (attr->rx_fifo || attr->rx_fifo_size || !attr->intr_disable) + return false; + } else { + /* Rx window must be one of NX or Fault or User window. */ + return false; + } + + return true; +} + +void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) +{ + memset(rxattr, 0, sizeof(*rxattr)); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + rxattr->pin_win = true; + rxattr->nx_win = true; + rxattr->fault_win = false; + rxattr->intr_disable = true; + rxattr->rx_wcred_mode = true; + rxattr->tx_wcred_mode = true; + rxattr->rx_win_ord_mode = true; + rxattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FAULT) { + rxattr->pin_win = true; + rxattr->fault_win = true; + rxattr->notify_disable = true; + rxattr->rx_wcred_mode = true; + rxattr->tx_wcred_mode = true; + rxattr->rx_win_ord_mode = true; + rxattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FTW) { + rxattr->user_win = true; + rxattr->intr_disable = true; + + /* + * As noted in the VAS Workbook we disable credit checks. + * If we enable credit checks in the future, we must also + * implement a mechanism to return the user credits or new + * paste operations will fail. + */ + } +} +EXPORT_SYMBOL_GPL(vas_init_rx_win_attr); + +struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, + struct vas_rx_win_attr *rxattr) +{ + struct vas_window *rxwin; + struct vas_winctx winctx; + struct vas_instance *vinst; + + if (!rx_win_args_valid(cop, rxattr)) + return ERR_PTR(-EINVAL); + + vinst = find_vas_instance(vasid); + if (!vinst) { + pr_devel("vasid %d not found!\n", vasid); + return ERR_PTR(-EINVAL); + } + pr_devel("Found instance %d\n", vasid); + + rxwin = vas_window_alloc(vinst); + if (IS_ERR(rxwin)) { + pr_devel("Unable to allocate memory for Rx window\n"); + return rxwin; + } + + rxwin->tx_win = false; + rxwin->nx_win = rxattr->nx_win; + rxwin->user_win = rxattr->user_win; + rxwin->cop = cop; + if (rxattr->user_win) + rxwin->pid = task_pid_vnr(current); + + init_winctx_for_rxwin(rxwin, rxattr, &winctx); + init_winctx_regs(rxwin, &winctx); + + set_vinst_win(vinst, rxwin); + + return rxwin; +} +EXPORT_SYMBOL_GPL(vas_rx_win_open); + +void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop) +{ + memset(txattr, 0, sizeof(*txattr)); + + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + txattr->rej_no_credit = false; + txattr->rx_wcred_mode = true; + txattr->tx_wcred_mode = true; + txattr->rx_win_ord_mode = true; + txattr->tx_win_ord_mode = true; + } else if (cop == VAS_COP_TYPE_FTW) { + txattr->user_win = true; + } +} +EXPORT_SYMBOL_GPL(vas_init_tx_win_attr); + +static void init_winctx_for_txwin(struct vas_window *txwin, + struct vas_tx_win_attr *txattr, + struct vas_winctx *winctx) +{ + /* + * We first zero all fields and only set non-zero ones. Following + * are some fields set to 0/false for the stated reason: + * + * ->notify_os_intr_reg In powernv, send intrs to HV + * ->rsvd_txbuf_count Not supported yet. + * ->notify_disable False for NX windows + * ->xtra_write False for NX windows + * ->notify_early NA for NX windows + * ->lnotify_lpid NA for Tx windows + * ->lnotify_pid NA for Tx windows + * ->lnotify_tid NA for Tx windows + * ->tx_win_cred_mode Ignore for now for NX windows + * ->rx_win_cred_mode Ignore for now for NX windows + */ + memset(winctx, 0, sizeof(struct vas_winctx)); + + winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + + winctx->user_win = txattr->user_win; + winctx->nx_win = txwin->rxwin->nx_win; + winctx->pin_win = txattr->pin_win; + + winctx->rx_wcred_mode = txattr->rx_wcred_mode; + winctx->tx_wcred_mode = txattr->tx_wcred_mode; + winctx->rx_word_mode = txattr->rx_win_ord_mode; + winctx->tx_word_mode = txattr->tx_win_ord_mode; + + if (winctx->nx_win) { + winctx->data_stamp = true; + winctx->intr_disable = true; + } + + winctx->lpid = txattr->lpid; + winctx->pidr = txattr->pidr; + winctx->rx_win_id = txwin->rxwin->winid; + + winctx->dma_type = VAS_DMA_TYPE_INJECT; + winctx->tc_mode = txattr->tc_mode; + winctx->min_scope = VAS_SCOPE_LOCAL; + winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + + winctx->pswid = 0; +} + +static bool tx_win_args_valid(enum vas_cop_type cop, + struct vas_tx_win_attr *attr) +{ + if (attr->tc_mode != VAS_THRESH_DISABLED) + return false; + + if (cop > VAS_COP_TYPE_MAX) + return false; + + if (attr->user_win && + (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) + return false; + + return true; +} + +struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, + struct vas_tx_win_attr *attr) +{ + int rc; + struct vas_window *txwin; + struct vas_window *rxwin; + struct vas_winctx winctx; + struct vas_instance *vinst; + + if (!tx_win_args_valid(cop, attr)) + return ERR_PTR(-EINVAL); + + vinst = find_vas_instance(vasid); + if (!vinst) { + pr_devel("vasid %d not found!\n", vasid); + return ERR_PTR(-EINVAL); + } + + rxwin = get_vinst_rxwin(vinst, cop, attr->pswid); + if (IS_ERR(rxwin)) { + pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop); + return rxwin; + } + + txwin = vas_window_alloc(vinst); + if (IS_ERR(txwin)) { + rc = PTR_ERR(txwin); + goto put_rxwin; + } + + txwin->tx_win = 1; + txwin->rxwin = rxwin; + txwin->nx_win = txwin->rxwin->nx_win; + txwin->pid = attr->pid; + txwin->user_win = attr->user_win; + + init_winctx_for_txwin(txwin, attr, &winctx); + + init_winctx_regs(txwin, &winctx); + + /* + * If its a kernel send window, map the window address into the + * kernel's address space. For user windows, user must issue an + * mmap() to map the window into their address space. + * + * NOTE: If kernel ever resubmits a user CRB after handling a page + * fault, we will need to map this into kernel as well. + */ + if (!txwin->user_win) { + txwin->paste_kaddr = map_paste_region(txwin); + if (IS_ERR(txwin->paste_kaddr)) { + rc = PTR_ERR(txwin->paste_kaddr); + goto free_window; + } + } + + set_vinst_win(vinst, txwin); + + return txwin; + +free_window: + vas_window_free(txwin); + +put_rxwin: + put_rx_win(rxwin); + return ERR_PTR(rc); + +} +EXPORT_SYMBOL_GPL(vas_tx_win_open); + +int vas_copy_crb(void *crb, int offset) +{ + return vas_copy(crb, offset); +} +EXPORT_SYMBOL_GPL(vas_copy_crb); + +#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53) +int vas_paste_crb(struct vas_window *txwin, int offset, bool re) +{ + int rc; + void *addr; + uint64_t val; + + /* + * Only NX windows are supported for now and hardware assumes + * report-enable flag is set for NX windows. Ensure software + * complies too. + */ + WARN_ON_ONCE(txwin->nx_win && !re); + + addr = txwin->paste_kaddr; + if (re) { + /* + * Set the REPORT_ENABLE bit (equivalent to writing + * to 1K offset of the paste address) + */ + val = SET_FIELD(RMA_LSMP_REPORT_ENABLE, 0ULL, 1); + addr += val; + } + + /* + * Map the raw CR value from vas_paste() to an error code (there + * is just pass or fail for now though). + */ + rc = vas_paste(addr, offset); + if (rc == 2) + rc = 0; + else + rc = -EINVAL; + + print_fifo_msg_count(txwin); + + return rc; +} +EXPORT_SYMBOL_GPL(vas_paste_crb); + +static void poll_window_busy_state(struct vas_window *window) +{ + int busy; + u64 val; + +retry: + /* + * Poll Window Busy flag + */ + val = read_hvwc_reg(window, VREG(WIN_STATUS)); + busy = GET_FIELD(VAS_WIN_BUSY, val); + if (busy) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + goto retry; + } +} + +static void poll_window_castout(struct vas_window *window) +{ + int cached; + u64 val; + + /* Cast window context out of the cache */ +retry: + val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); + cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); + if (cached) { + val = 0ULL; + val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); + val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); + write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + goto retry; + } +} + +/* + * Close a window. + * + * See Section 1.12.1 of VAS workbook v1.05 for details on closing window: + * - Disable new paste operations (unmap paste address) + * - Poll for the "Window Busy" bit to be cleared + * - Clear the Open/Enable bit for the Window. + * - Poll for return of window Credits (implies FIFO empty for Rx win?) + * - Unpin and cast window context out of cache + * + * Besides the hardware, kernel has some bookkeeping of course. + */ +int vas_win_close(struct vas_window *window) +{ + u64 val; + + if (!window) + return 0; + + if (!window->tx_win && atomic_read(&window->num_txwins) != 0) { + pr_devel("Attempting to close an active Rx window!\n"); + WARN_ON_ONCE(1); + return -EBUSY; + } + + unmap_paste_region(window); + + clear_vinst_win(window); + + poll_window_busy_state(window); + + /* Unpin window from cache and close it */ + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); + + poll_window_castout(window); + + /* if send window, drop reference to matching receive window */ + if (window->tx_win) + put_rx_win(window->rxwin); + + vas_window_free(window); + + return 0; +} +EXPORT_SYMBOL_GPL(vas_win_close); diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c new file mode 100644 index 000000000000..565a4878fefa --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas.c @@ -0,0 +1,151 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/platform_device.h> +#include <linux/of_platform.h> +#include <linux/of_address.h> +#include <linux/of.h> + +#include "vas.h" + +static DEFINE_MUTEX(vas_mutex); +static LIST_HEAD(vas_instances); + +static int init_vas_instance(struct platform_device *pdev) +{ + int rc, vasid; + struct resource *res; + struct vas_instance *vinst; + struct device_node *dn = pdev->dev.of_node; + + rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); + if (rc) { + pr_err("No ibm,vas-id property for %s?\n", pdev->name); + return -ENODEV; + } + + if (pdev->num_resources != 4) { + pr_err("Unexpected DT configuration for [%s, %d]\n", + pdev->name, vasid); + return -ENODEV; + } + + vinst = kzalloc(sizeof(*vinst), GFP_KERNEL); + if (!vinst) + return -ENOMEM; + + INIT_LIST_HEAD(&vinst->node); + ida_init(&vinst->ida); + mutex_init(&vinst->mutex); + vinst->vas_id = vasid; + vinst->pdev = pdev; + + res = &pdev->resource[0]; + vinst->hvwc_bar_start = res->start; + + res = &pdev->resource[1]; + vinst->uwc_bar_start = res->start; + + res = &pdev->resource[2]; + vinst->paste_base_addr = res->start; + + res = &pdev->resource[3]; + if (res->end > 62) { + pr_err("Bad 'paste_win_id_shift' in DT, %llx\n", res->end); + goto free_vinst; + } + + vinst->paste_win_id_shift = 63 - res->end; + + pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " + "paste_win_id_shift 0x%llx\n", pdev->name, vasid, + vinst->paste_base_addr, vinst->paste_win_id_shift); + + mutex_lock(&vas_mutex); + list_add(&vinst->node, &vas_instances); + mutex_unlock(&vas_mutex); + + dev_set_drvdata(&pdev->dev, vinst); + + return 0; + +free_vinst: + kfree(vinst); + return -ENODEV; + +} + +/* + * Although this is read/used multiple times, it is written to only + * during initialization. + */ +struct vas_instance *find_vas_instance(int vasid) +{ + struct list_head *ent; + struct vas_instance *vinst; + + mutex_lock(&vas_mutex); + list_for_each(ent, &vas_instances) { + vinst = list_entry(ent, struct vas_instance, node); + if (vinst->vas_id == vasid) { + mutex_unlock(&vas_mutex); + return vinst; + } + } + mutex_unlock(&vas_mutex); + + pr_devel("Instance %d not found\n", vasid); + return NULL; +} + +static int vas_probe(struct platform_device *pdev) +{ + return init_vas_instance(pdev); +} + +static const struct of_device_id powernv_vas_match[] = { + { .compatible = "ibm,vas",}, + {}, +}; + +static struct platform_driver vas_driver = { + .driver = { + .name = "vas", + .of_match_table = powernv_vas_match, + }, + .probe = vas_probe, +}; + +static int __init vas_init(void) +{ + int found = 0; + struct device_node *dn; + + platform_driver_register(&vas_driver); + + for_each_compatible_node(dn, NULL, "ibm,vas") { + of_platform_device_create(dn, NULL, NULL); + found++; + } + + if (!found) + return -ENODEV; + + pr_devel("Found %d instances\n", found); + + return 0; +} +device_initcall(vas_init); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h new file mode 100644 index 000000000000..38dee5d50f31 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas.h @@ -0,0 +1,467 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _VAS_H +#define _VAS_H +#include <linux/atomic.h> +#include <linux/idr.h> +#include <asm/vas.h> +#include <linux/io.h> + +/* + * Overview of Virtual Accelerator Switchboard (VAS). + * + * VAS is a hardware "switchboard" that allows senders and receivers to + * exchange messages with _minimal_ kernel involvment. The receivers are + * typically NX coprocessor engines that perform compression or encryption + * in hardware, but receivers can also be other software threads. + * + * Senders are user/kernel threads that submit compression/encryption or + * other requests to the receivers. Senders must format their messages as + * Coprocessor Request Blocks (CRB)s and submit them using the "copy" and + * "paste" instructions which were introduced in Power9. + * + * A Power node can have (upto?) 8 Power chips. There is one instance of + * VAS in each Power9 chip. Each instance of VAS has 64K windows or ports, + * Senders and receivers must each connect to a separate window before they + * can exchange messages through the switchboard. + * + * Each window is described by two types of window contexts: + * + * Hypervisor Window Context (HVWC) of size VAS_HVWC_SIZE bytes + * + * OS/User Window Context (UWC) of size VAS_UWC_SIZE bytes. + * + * A window context can be viewed as a set of 64-bit registers. The settings + * in these registers configure/control/determine the behavior of the VAS + * hardware when messages are sent/received through the window. The registers + * in the HVWC are configured by the kernel while the registers in the UWC can + * be configured by the kernel or by the user space application that is using + * the window. + * + * The HVWCs for all windows on a specific instance of VAS are in a contiguous + * range of hardware addresses or Base address region (BAR) referred to as the + * HVWC BAR for the instance. Similarly the UWCs for all windows on an instance + * are referred to as the UWC BAR for the instance. + * + * The two BARs for each instance are defined Power9 MMIO Ranges spreadsheet + * and available to the kernel in the VAS node's "reg" property in the device + * tree: + * + * /proc/device-tree/vasm@.../reg + * + * (see vas_probe() for details on the reg property). + * + * The kernel maps the HVWC and UWC BAR regions into the kernel address + * space (hvwc_map and uwc_map). The kernel can then access the window + * contexts of a specific window using: + * + * hvwc = hvwc_map + winid * VAS_HVWC_SIZE. + * uwc = uwc_map + winid * VAS_UWC_SIZE. + * + * where winid is the window index (0..64K). + * + * As mentioned, a window context is used to "configure" a window. Besides + * this configuration address, each _send_ window also has a unique hardware + * "paste" address that is used to submit requests/CRBs (see vas_paste_crb()). + * + * The hardware paste address for a window is computed using the "paste + * base address" and "paste win id shift" reg properties in the VAS device + * tree node using: + * + * paste_addr = paste_base + ((winid << paste_win_id_shift)) + * + * (again, see vas_probe() for ->paste_base_addr and ->paste_win_id_shift). + * + * The kernel maps this hardware address into the sender's address space + * after which they can use the 'paste' instruction (new in Power9) to + * send a message (submit a request aka CRB) to the coprocessor. + * + * NOTE: In the initial version, senders can only in-kernel drivers/threads. + * Support for user space threads will be added in follow-on patches. + * + * TODO: Do we need to map the UWC into user address space so they can return + * credits? Its NA for NX but may be needed for other receive windows. + * + */ + +#define VAS_WINDOWS_PER_CHIP (64 << 10) + +/* + * Hypervisor and OS/USer Window Context sizes + */ +#define VAS_HVWC_SIZE 512 +#define VAS_UWC_SIZE PAGE_SIZE + +/* + * Initial per-process credits. + * Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED) + * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) + * + * TODO: Needs tuning for per-process credits + */ +#define VAS_WCREDS_MIN 16 +#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_WCREDS_DEFAULT (1 << 10) + +/* + * VAS Window Context Register Offsets and bitmasks. + * See Section 3.1.4 of VAS Work book + */ +#define VAS_LPID_OFFSET 0x010 +#define VAS_LPID PPC_BITMASK(0, 11) + +#define VAS_PID_OFFSET 0x018 +#define VAS_PID_ID PPC_BITMASK(0, 19) + +#define VAS_XLATE_MSR_OFFSET 0x020 +#define VAS_XLATE_MSR_DR PPC_BIT(0) +#define VAS_XLATE_MSR_TA PPC_BIT(1) +#define VAS_XLATE_MSR_PR PPC_BIT(2) +#define VAS_XLATE_MSR_US PPC_BIT(3) +#define VAS_XLATE_MSR_HV PPC_BIT(4) +#define VAS_XLATE_MSR_SF PPC_BIT(5) + +#define VAS_XLATE_LPCR_OFFSET 0x028 +#define VAS_XLATE_LPCR_PAGE_SIZE PPC_BITMASK(0, 2) +#define VAS_XLATE_LPCR_ISL PPC_BIT(3) +#define VAS_XLATE_LPCR_TC PPC_BIT(4) +#define VAS_XLATE_LPCR_SC PPC_BIT(5) + +#define VAS_XLATE_CTL_OFFSET 0x030 +#define VAS_XLATE_MODE PPC_BITMASK(0, 1) + +#define VAS_AMR_OFFSET 0x040 +#define VAS_AMR PPC_BITMASK(0, 63) + +#define VAS_SEIDR_OFFSET 0x048 +#define VAS_SEIDR PPC_BITMASK(0, 63) + +#define VAS_FAULT_TX_WIN_OFFSET 0x050 +#define VAS_FAULT_TX_WIN PPC_BITMASK(48, 63) + +#define VAS_OSU_INTR_SRC_RA_OFFSET 0x060 +#define VAS_OSU_INTR_SRC_RA PPC_BITMASK(8, 63) + +#define VAS_HV_INTR_SRC_RA_OFFSET 0x070 +#define VAS_HV_INTR_SRC_RA PPC_BITMASK(8, 63) + +#define VAS_PSWID_OFFSET 0x078 +#define VAS_PSWID_EA_HANDLE PPC_BITMASK(0, 31) + +#define VAS_SPARE1_OFFSET 0x080 +#define VAS_SPARE2_OFFSET 0x088 +#define VAS_SPARE3_OFFSET 0x090 +#define VAS_SPARE4_OFFSET 0x130 +#define VAS_SPARE5_OFFSET 0x160 +#define VAS_SPARE6_OFFSET 0x188 + +#define VAS_LFIFO_BAR_OFFSET 0x0A0 +#define VAS_LFIFO_BAR PPC_BITMASK(8, 53) +#define VAS_PAGE_MIGRATION_SELECT PPC_BITMASK(54, 56) + +#define VAS_LDATA_STAMP_CTL_OFFSET 0x0A8 +#define VAS_LDATA_STAMP PPC_BITMASK(0, 1) +#define VAS_XTRA_WRITE PPC_BIT(2) + +#define VAS_LDMA_CACHE_CTL_OFFSET 0x0B0 +#define VAS_LDMA_TYPE PPC_BITMASK(0, 1) +#define VAS_LDMA_FIFO_DISABLE PPC_BIT(2) + +#define VAS_LRFIFO_PUSH_OFFSET 0x0B8 +#define VAS_LRFIFO_PUSH PPC_BITMASK(0, 15) + +#define VAS_CURR_MSG_COUNT_OFFSET 0x0C0 +#define VAS_CURR_MSG_COUNT PPC_BITMASK(0, 7) + +#define VAS_LNOTIFY_AFTER_COUNT_OFFSET 0x0C8 +#define VAS_LNOTIFY_AFTER_COUNT PPC_BITMASK(0, 7) + +#define VAS_LRX_WCRED_OFFSET 0x0E0 +#define VAS_LRX_WCRED PPC_BITMASK(0, 15) + +#define VAS_LRX_WCRED_ADDER_OFFSET 0x190 +#define VAS_LRX_WCRED_ADDER PPC_BITMASK(0, 15) + +#define VAS_TX_WCRED_OFFSET 0x0F0 +#define VAS_TX_WCRED PPC_BITMASK(4, 15) + +#define VAS_TX_WCRED_ADDER_OFFSET 0x1A0 +#define VAS_TX_WCRED_ADDER PPC_BITMASK(4, 15) + +#define VAS_LFIFO_SIZE_OFFSET 0x100 +#define VAS_LFIFO_SIZE PPC_BITMASK(0, 3) + +#define VAS_WINCTL_OFFSET 0x108 +#define VAS_WINCTL_OPEN PPC_BIT(0) +#define VAS_WINCTL_REJ_NO_CREDIT PPC_BIT(1) +#define VAS_WINCTL_PIN PPC_BIT(2) +#define VAS_WINCTL_TX_WCRED_MODE PPC_BIT(3) +#define VAS_WINCTL_RX_WCRED_MODE PPC_BIT(4) +#define VAS_WINCTL_TX_WORD_MODE PPC_BIT(5) +#define VAS_WINCTL_RX_WORD_MODE PPC_BIT(6) +#define VAS_WINCTL_RSVD_TXBUF PPC_BIT(7) +#define VAS_WINCTL_THRESH_CTL PPC_BITMASK(8, 9) +#define VAS_WINCTL_FAULT_WIN PPC_BIT(10) +#define VAS_WINCTL_NX_WIN PPC_BIT(11) + +#define VAS_WIN_STATUS_OFFSET 0x110 +#define VAS_WIN_BUSY PPC_BIT(1) + +#define VAS_WIN_CTX_CACHING_CTL_OFFSET 0x118 +#define VAS_CASTOUT_REQ PPC_BIT(0) +#define VAS_PUSH_TO_MEM PPC_BIT(1) +#define VAS_WIN_CACHE_STATUS PPC_BIT(4) + +#define VAS_TX_RSVD_BUF_COUNT_OFFSET 0x120 +#define VAS_RXVD_BUF_COUNT PPC_BITMASK(58, 63) + +#define VAS_LRFIFO_WIN_PTR_OFFSET 0x128 +#define VAS_LRX_WIN_ID PPC_BITMASK(0, 15) + +/* + * Local Notification Control Register controls what happens in _response_ + * to a paste command and hence applies only to receive windows. + */ +#define VAS_LNOTIFY_CTL_OFFSET 0x138 +#define VAS_NOTIFY_DISABLE PPC_BIT(0) +#define VAS_INTR_DISABLE PPC_BIT(1) +#define VAS_NOTIFY_EARLY PPC_BIT(2) +#define VAS_NOTIFY_OSU_INTR PPC_BIT(3) + +#define VAS_LNOTIFY_PID_OFFSET 0x140 +#define VAS_LNOTIFY_PID PPC_BITMASK(0, 19) + +#define VAS_LNOTIFY_LPID_OFFSET 0x148 +#define VAS_LNOTIFY_LPID PPC_BITMASK(0, 11) + +#define VAS_LNOTIFY_TID_OFFSET 0x150 +#define VAS_LNOTIFY_TID PPC_BITMASK(0, 15) + +#define VAS_LNOTIFY_SCOPE_OFFSET 0x158 +#define VAS_LNOTIFY_MIN_SCOPE PPC_BITMASK(0, 1) +#define VAS_LNOTIFY_MAX_SCOPE PPC_BITMASK(2, 3) + +#define VAS_NX_UTIL_OFFSET 0x1B0 +#define VAS_NX_UTIL PPC_BITMASK(0, 63) + +/* SE: Side effects */ +#define VAS_NX_UTIL_SE_OFFSET 0x1B8 +#define VAS_NX_UTIL_SE PPC_BITMASK(0, 63) + +#define VAS_NX_UTIL_ADDER_OFFSET 0x180 +#define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) + +/* + * Local Notify Scope Control Register. (Receive windows only). + */ +enum vas_notify_scope { + VAS_SCOPE_LOCAL, + VAS_SCOPE_GROUP, + VAS_SCOPE_VECTORED_GROUP, + VAS_SCOPE_UNUSED, +}; + +/* + * Local DMA Cache Control Register (Receive windows only). + */ +enum vas_dma_type { + VAS_DMA_TYPE_INJECT, + VAS_DMA_TYPE_WRITE, +}; + +/* + * Local Notify Scope Control Register. (Receive windows only). + * Not applicable to NX receive windows. + */ +enum vas_notify_after_count { + VAS_NOTIFY_AFTER_256 = 0, + VAS_NOTIFY_NONE, + VAS_NOTIFY_AFTER_2 +}; + +/* + * One per instance of VAS. Each instance will have a separate set of + * receive windows, one per coprocessor type. + * + * See also function header of set_vinst_win() for details on ->windows[] + * and ->rxwin[] tables. + */ +struct vas_instance { + int vas_id; + struct ida ida; + struct list_head node; + struct platform_device *pdev; + + u64 hvwc_bar_start; + u64 uwc_bar_start; + u64 paste_base_addr; + u64 paste_win_id_shift; + + struct mutex mutex; + struct vas_window *rxwin[VAS_COP_TYPE_MAX]; + struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; +}; + +/* + * In-kernel state a VAS window. One per window. + */ +struct vas_window { + /* Fields common to send and receive windows */ + struct vas_instance *vinst; + int winid; + bool tx_win; /* True if send window */ + bool nx_win; /* True if NX window */ + bool user_win; /* True if user space window */ + void *hvwc_map; /* HV window context */ + void *uwc_map; /* OS/User window context */ + pid_t pid; /* Linux process id of owner */ + + /* Fields applicable only to send windows */ + void *paste_kaddr; + char *paste_addr_name; + struct vas_window *rxwin; + + /* Feilds applicable only to receive windows */ + enum vas_cop_type cop; + atomic_t num_txwins; +}; + +/* + * Container for the hardware state of a window. One per-window. + * + * A VAS Window context is a 512-byte area in the hardware that contains + * a set of 64-bit registers. Individual bit-fields in these registers + * determine the configuration/operation of the hardware. struct vas_winctx + * is a container for the register fields in the window context. + */ +struct vas_winctx { + void *rx_fifo; + int rx_fifo_size; + int wcreds_max; + int rsvd_txbuf_count; + + bool user_win; + bool nx_win; + bool fault_win; + bool rsvd_txbuf_enable; + bool pin_win; + bool rej_no_credit; + bool tx_wcred_mode; + bool rx_wcred_mode; + bool tx_word_mode; + bool rx_word_mode; + bool data_stamp; + bool xtra_write; + bool notify_disable; + bool intr_disable; + bool fifo_disable; + bool notify_early; + bool notify_os_intr_reg; + + int lpid; + int pidr; /* value from SPRN_PID, not linux pid */ + int lnotify_lpid; + int lnotify_pid; + int lnotify_tid; + u32 pswid; + int rx_win_id; + int fault_win_id; + int tc_mode; + + u64 irq_port; + + enum vas_dma_type dma_type; + enum vas_notify_scope min_scope; + enum vas_notify_scope max_scope; + enum vas_notify_after_count notify_after_count; +}; + +extern struct vas_instance *find_vas_instance(int vasid); + +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + +#ifdef vas_debug +static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) +{ + pr_err("fault %d, notify %d, intr %d early %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early); + + pr_err("rx_fifo_size %d, max value %d\n", + attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); +} + +static inline void vas_log_write(struct vas_window *win, char *name, + void *regptr, u64 val) +{ + if (val) + pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + win->tx_win ? "Tx" : "Rx", win->winid, name, + regptr, val); +} + +#else /* vas_debug */ + +#define vas_log_write(win, name, reg, val) +#define dump_rx_win_attr(attr) + +#endif /* vas_debug */ + +static inline void write_uwc_reg(struct vas_window *win, char *name, + s32 reg, u64 val) +{ + void *regptr; + + regptr = win->uwc_map + reg; + vas_log_write(win, name, regptr, val); + + out_be64(regptr, val); +} + +static inline void write_hvwc_reg(struct vas_window *win, char *name, + s32 reg, u64 val) +{ + void *regptr; + + regptr = win->hvwc_map + reg; + vas_log_write(win, name, regptr, val); + + out_be64(regptr, val); +} + +static inline u64 read_hvwc_reg(struct vas_window *win, + char *name __maybe_unused, s32 reg) +{ + return in_be64(win->hvwc_map+reg); +} + +#ifdef vas_debug + +static void print_fifo_msg_count(struct vas_window *txwin) +{ + uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); + pr_devel("Winid %d, Msg count %llu\n", txwin->winid, + (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); +} +#else /* vas_debug */ + +#define print_fifo_msg_count(window) + +#endif /* vas_debug */ + +#endif /* _VAS_H */ |