diff options
author | Peter Maydell | 2022-02-20 16:05:41 +0100 |
---|---|---|
committer | Peter Maydell | 2022-02-20 16:05:41 +0100 |
commit | e670f6d825d4dee248b311197fd4048469d6772b (patch) | |
tree | 1addb755097a04198bdf18ff79d1b2223397bd79 /hw | |
parent | Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-2022... (diff) | |
parent | target/ppc: Move common SPR functions out of cpu_init (diff) | |
download | qemu-e670f6d825d4dee248b311197fd4048469d6772b.tar.gz qemu-e670f6d825d4dee248b311197fd4048469d6772b.tar.xz qemu-e670f6d825d4dee248b311197fd4048469d6772b.zip |
Merge remote-tracking branch 'remotes/legoater/tags/pull-ppc-20220218' into staging
ppc-7.0 queue
* target/ppc: SPR registration cleanups (Fabiano)
* ppc: nested KVM HV for spapr virtual hypervisor (Nicholas)
* spapr: nvdimm: Introduce spapr-nvdimm device (Shivaprasad)
# gpg: Signature made Fri 18 Feb 2022 07:59:29 GMT
# gpg: using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@kaod.org>" [undefined]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B 0B60 51A3 43C7 CFFB ECA1
* remotes/legoater/tags/pull-ppc-20220218: (39 commits)
target/ppc: Move common SPR functions out of cpu_init
target/ppc: cpu_init: Move check_pow and QOM macros to a header
target/ppc: cpu_init: Move SPR registration macros to a header
target/ppc: cpu_init: Expose some SPR registration helpers
target/ppc: Rename spr_tcg.h to spr_common.h
target/ppc: cpu_init: Remove register_usprg3_sprs
target/ppc: cpu_init: Rename register_ne_601_sprs
target/ppc: cpu_init: Reuse init_proc_745 for the 755
target/ppc: cpu_init: Reuse init_proc_604 for the 604e
target/ppc: cpu_init: Reuse init_proc_603 for the e300
target/ppc: cpu_init: Move 604e SPR registration into a function
target/ppc: cpu_init: Move e300 SPR registration into a function
target/ppc: cpu_init: Move 755 L2 cache SPRs into a function
target/ppc: cpu_init: Deduplicate 7xx SPR registration
target/ppc: cpu_init: Deduplicate 745/755 SPR registration
target/ppc: cpu_init: Deduplicate 604 SPR registration
target/ppc: cpu_init: Deduplicate 603 SPR registration
target/ppc: cpu_init: Deduplicate 440 SPR registration
target/ppc: cpu_init: Decouple 74xx SPR registration from 7xx
target/ppc: cpu_init: Decouple G2 SPR registration from 755
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/mem/nvdimm.c | 16 | ||||
-rw-r--r-- | hw/mem/pc-dimm.c | 5 | ||||
-rw-r--r-- | hw/ppc/pegasos2.c | 6 | ||||
-rw-r--r-- | hw/ppc/ppc.c | 23 | ||||
-rw-r--r-- | hw/ppc/spapr.c | 48 | ||||
-rw-r--r-- | hw/ppc/spapr_caps.c | 14 | ||||
-rw-r--r-- | hw/ppc/spapr_cpu_core.c | 6 | ||||
-rw-r--r-- | hw/ppc/spapr_hcall.c | 333 | ||||
-rw-r--r-- | hw/ppc/spapr_nvdimm.c | 392 |
9 files changed, 830 insertions, 13 deletions
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7397b67156..59959d5563 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -181,10 +181,25 @@ static MemoryRegion *nvdimm_md_get_memory_region(MemoryDeviceState *md, static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) { NVDIMMDevice *nvdimm = NVDIMM(dimm); + NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); if (!nvdimm->nvdimm_mr) { nvdimm_prepare_memory_region(nvdimm, errp); } + + if (ndc->realize) { + ndc->realize(nvdimm, errp); + } +} + +static void nvdimm_unrealize(PCDIMMDevice *dimm) +{ + NVDIMMDevice *nvdimm = NVDIMM(dimm); + NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); + + if (ndc->unrealize) { + ndc->unrealize(nvdimm); + } } /* @@ -240,6 +255,7 @@ static void nvdimm_class_init(ObjectClass *oc, void *data) DeviceClass *dc = DEVICE_CLASS(oc); ddc->realize = nvdimm_realize; + ddc->unrealize = nvdimm_unrealize; mdc->get_memory_region = nvdimm_md_get_memory_region; device_class_set_props(dc, nvdimm_properties); diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 48b913aba6..03bd0dd60e 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -216,6 +216,11 @@ static void pc_dimm_realize(DeviceState *dev, Error **errp) static void pc_dimm_unrealize(DeviceState *dev) { PCDIMMDevice *dimm = PC_DIMM(dev); + PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); + + if (ddc->unrealize) { + ddc->unrealize(dimm); + } host_memory_backend_set_mapped(dimm->hostmem, false); } diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c index 298e6b93e2..d45008ac71 100644 --- a/hw/ppc/pegasos2.c +++ b/hw/ppc/pegasos2.c @@ -449,6 +449,11 @@ static target_ulong pegasos2_rtas(PowerPCCPU *cpu, Pegasos2MachineState *pm, } } +static bool pegasos2_cpu_in_nested(PowerPCCPU *cpu) +{ + return false; +} + static void pegasos2_hypercall(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) { Pegasos2MachineState *pm = PEGASOS2_MACHINE(vhyp); @@ -504,6 +509,7 @@ static void pegasos2_machine_class_init(ObjectClass *oc, void *data) mc->default_ram_id = "pegasos2.ram"; mc->default_ram_size = 512 * MiB; + vhc->cpu_in_nested = pegasos2_cpu_in_nested; vhc->hypercall = pegasos2_hypercall; vhc->cpu_exec_enter = vhyp_nop; vhc->cpu_exec_exit = vhyp_nop; diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index ba7fa0f3b5..9e99625ea9 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -1072,7 +1072,7 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq) } /* Create new timer */ tb_env->decr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_decr_cb, cpu); - if (env->has_hv_mode) { + if (env->has_hv_mode && !cpu->vhyp) { tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_hdecr_cb, cpu); } else { @@ -1083,6 +1083,27 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq) return &cpu_ppc_set_tb_clk; } +/* cpu_ppc_hdecr_init may be used if the timer is not used by HDEC emulation */ +void cpu_ppc_hdecr_init(CPUPPCState *env) +{ + PowerPCCPU *cpu = env_archcpu(env); + + assert(env->tb_env->hdecr_timer == NULL); + + env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + &cpu_ppc_hdecr_cb, cpu); +} + +void cpu_ppc_hdecr_exit(CPUPPCState *env) +{ + PowerPCCPU *cpu = env_archcpu(env); + + timer_free(env->tb_env->hdecr_timer); + env->tb_env->hdecr_timer = NULL; + + cpu_ppc_hdecr_lower(cpu); +} + /*****************************************************************************/ /* PowerPC 40x timers */ diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 3d6ec309dd..f0b75b22bb 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1270,6 +1270,8 @@ static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp, /* The TCG path should also be holding the BQL at this point */ g_assert(qemu_mutex_iothread_locked()); + g_assert(!vhyp_cpu_in_nested(cpu)); + if (msr_pr) { hcall_dprintf("Hypercall made with MSR[PR]=1\n"); env->gpr[3] = H_PRIVILEGE; @@ -1309,13 +1311,40 @@ void spapr_set_all_lpcrs(target_ulong value, target_ulong mask) } } -static void spapr_get_pate(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry) +static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu, + target_ulong lpid, ppc_v3_pate_t *entry) { SpaprMachineState *spapr = SPAPR_MACHINE(vhyp); + SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); + + if (!spapr_cpu->in_nested) { + assert(lpid == 0); + + /* Copy PATE1:GR into PATE0:HR */ + entry->dw0 = spapr->patb_entry & PATE0_HR; + entry->dw1 = spapr->patb_entry; + + } else { + uint64_t patb, pats; + + assert(lpid != 0); + + patb = spapr->nested_ptcr & PTCR_PATB; + pats = spapr->nested_ptcr & PTCR_PATS; + + /* Calculate number of entries */ + pats = 1ull << (pats + 12 - 4); + if (pats <= lpid) { + return false; + } - /* Copy PATE1:GR into PATE0:HR */ - entry->dw0 = spapr->patb_entry & PATE0_HR; - entry->dw1 = spapr->patb_entry; + /* Grab entry */ + patb += 16 * lpid; + entry->dw0 = ldq_phys(CPU(cpu)->as, patb); + entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8); + } + + return true; } #define HPTE(_table, _i) (void *)(((uint64_t *)(_table)) + ((_i) * 2)) @@ -1634,6 +1663,8 @@ static void spapr_machine_reset(MachineState *machine) spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); } + spapr_nvdimm_finish_flushes(); + /* DRC reset may cause a device to be unplugged. This will cause troubles * if this device is used by another device (eg, a running vhost backend * will crash QEMU if the DIMM holding the vring goes away). To avoid such @@ -4465,6 +4496,13 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id) return NULL; } +static bool spapr_cpu_in_nested(PowerPCCPU *cpu) +{ + SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); + + return spapr_cpu->in_nested; +} + static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) { SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); @@ -4573,6 +4611,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) fwc->get_dev_path = spapr_get_fw_dev_path; nc->nmi_monitor_handler = spapr_nmi; smc->phb_placement = spapr_phb_placement; + vhc->cpu_in_nested = spapr_cpu_in_nested; + vhc->deliver_hv_excp = spapr_exit_nested; vhc->hypercall = emulate_spapr_hypercall; vhc->hpt_mask = spapr_hpt_mask; vhc->map_hptes = spapr_map_hptes; diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index ed7c077a0d..6167431271 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -444,19 +444,23 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState *spapr, { ERRP_GUARD(); PowerPCCPU *cpu = POWERPC_CPU(first_cpu); + CPUPPCState *env = &cpu->env; if (!val) { /* capability disabled by default */ return; } - if (tcg_enabled()) { - error_setg(errp, "No Nested KVM-HV support in TCG"); + if (!(env->insns_flags2 & PPC2_ISA300)) { + error_setg(errp, "Nested-HV only supported on POWER9 and later"); error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n"); - } else if (kvm_enabled()) { + return; + } + + if (kvm_enabled()) { if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, spapr->max_compat_pvr)) { - error_setg(errp, "Nested KVM-HV only supported on POWER9"); + error_setg(errp, "Nested-HV only supported on POWER9 and later"); error_append_hint(errp, "Try appending -machine max-cpu-compat=power9\n"); return; @@ -464,7 +468,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState *spapr, if (!kvmppc_has_cap_nested_kvm_hv()) { error_setg(errp, - "KVM implementation does not support Nested KVM-HV"); + "KVM implementation does not support Nested-HV"); error_append_hint(errp, "Try appending -machine cap-nested-hv=off\n"); } else if (kvmppc_set_cap_nested_kvm_hv(val) < 0) { diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index a781e97f8d..ed84713960 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -261,12 +261,12 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, return false; } - /* Set time-base frequency to 512 MHz */ - cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ); - cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr)); kvmppc_set_papr(cpu); + /* Set time-base frequency to 512 MHz. vhyp must be set first. */ + cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ); + if (spapr_irq_cpu_intc_create(spapr, cpu, errp) < 0) { qdev_unrealize(DEVICE(cpu)); return false; diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 222c1b6bbd..f008290787 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -9,6 +9,7 @@ #include "qemu/error-report.h" #include "exec/exec-all.h" #include "helper_regs.h" +#include "hw/ppc/ppc.h" #include "hw/ppc/spapr.h" #include "hw/ppc/spapr_cpu_core.h" #include "mmu-hash64.h" @@ -1497,6 +1498,333 @@ static void hypercall_register_softmmu(void) } #endif +/* TCG only */ +#define PRTS_MASK 0x1f + +static target_ulong h_set_ptbl(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + target_ulong ptcr = args[0]; + + if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) { + return H_FUNCTION; + } + + if ((ptcr & PRTS_MASK) + 12 - 4 > 12) { + return H_PARAMETER; + } + + spapr->nested_ptcr = ptcr; /* Save new partition table */ + + return H_SUCCESS; +} + +static target_ulong h_tlb_invalidate(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + /* + * The spapr virtual hypervisor nested HV implementation retains no L2 + * translation state except for TLB. And the TLB is always invalidated + * across L1<->L2 transitions, so nothing is required here. + */ + + return H_SUCCESS; +} + +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + /* + * This HCALL is not required, L1 KVM will take a slow path and walk the + * page tables manually to do the data copy. + */ + return H_FUNCTION; +} + +/* + * When this handler returns, the environment is switched to the L2 guest + * and TCG begins running that. spapr_exit_nested() performs the switch from + * L2 back to L1 and returns from the H_ENTER_NESTED hcall. + */ +static target_ulong h_enter_nested(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); + CPUState *cs = CPU(cpu); + CPUPPCState *env = &cpu->env; + SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); + target_ulong hv_ptr = args[0]; + target_ulong regs_ptr = args[1]; + target_ulong hdec, now = cpu_ppc_load_tbl(env); + target_ulong lpcr, lpcr_mask; + struct kvmppc_hv_guest_state *hvstate; + struct kvmppc_hv_guest_state hv_state; + struct kvmppc_pt_regs *regs; + hwaddr len; + uint64_t cr; + int i; + + if (spapr->nested_ptcr == 0) { + return H_NOT_AVAILABLE; + } + + len = sizeof(*hvstate); + hvstate = address_space_map(CPU(cpu)->as, hv_ptr, &len, false, + MEMTXATTRS_UNSPECIFIED); + if (len != sizeof(*hvstate)) { + address_space_unmap(CPU(cpu)->as, hvstate, len, 0, false); + return H_PARAMETER; + } + + memcpy(&hv_state, hvstate, len); + + address_space_unmap(CPU(cpu)->as, hvstate, len, len, false); + + /* + * We accept versions 1 and 2. Version 2 fields are unused because TCG + * does not implement DAWR*. + */ + if (hv_state.version > HV_GUEST_STATE_VERSION) { + return H_PARAMETER; + } + + spapr_cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState)); + if (!spapr_cpu->nested_host_state) { + return H_NO_MEM; + } + + memcpy(spapr_cpu->nested_host_state, env, sizeof(CPUPPCState)); + + len = sizeof(*regs); + regs = address_space_map(CPU(cpu)->as, regs_ptr, &len, false, + MEMTXATTRS_UNSPECIFIED); + if (!regs || len != sizeof(*regs)) { + address_space_unmap(CPU(cpu)->as, regs, len, 0, false); + g_free(spapr_cpu->nested_host_state); + return H_P2; + } + + len = sizeof(env->gpr); + assert(len == sizeof(regs->gpr)); + memcpy(env->gpr, regs->gpr, len); + + env->lr = regs->link; + env->ctr = regs->ctr; + cpu_write_xer(env, regs->xer); + + cr = regs->ccr; + for (i = 7; i >= 0; i--) { + env->crf[i] = cr & 15; + cr >>= 4; + } + + env->msr = regs->msr; + env->nip = regs->nip; + + address_space_unmap(CPU(cpu)->as, regs, len, len, false); + + env->cfar = hv_state.cfar; + + assert(env->spr[SPR_LPIDR] == 0); + env->spr[SPR_LPIDR] = hv_state.lpid; + + lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER; + lpcr = (env->spr[SPR_LPCR] & ~lpcr_mask) | (hv_state.lpcr & lpcr_mask); + lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE; + lpcr &= ~LPCR_LPES0; + env->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask; + + env->spr[SPR_PCR] = hv_state.pcr; + /* hv_state.amor is not used */ + env->spr[SPR_DPDES] = hv_state.dpdes; + env->spr[SPR_HFSCR] = hv_state.hfscr; + hdec = hv_state.hdec_expiry - now; + spapr_cpu->nested_tb_offset = hv_state.tb_offset; + /* TCG does not implement DAWR*, CIABR, PURR, SPURR, IC, VTB, HEIR SPRs*/ + env->spr[SPR_SRR0] = hv_state.srr0; + env->spr[SPR_SRR1] = hv_state.srr1; + env->spr[SPR_SPRG0] = hv_state.sprg[0]; + env->spr[SPR_SPRG1] = hv_state.sprg[1]; + env->spr[SPR_SPRG2] = hv_state.sprg[2]; + env->spr[SPR_SPRG3] = hv_state.sprg[3]; + env->spr[SPR_BOOKS_PID] = hv_state.pidr; + env->spr[SPR_PPR] = hv_state.ppr; + + cpu_ppc_hdecr_init(env); + cpu_ppc_store_hdecr(env, hdec); + + /* + * The hv_state.vcpu_token is not needed. It is used by the KVM + * implementation to remember which L2 vCPU last ran on which physical + * CPU so as to invalidate process scope translations if it is moved + * between physical CPUs. For now TLBs are always flushed on L1<->L2 + * transitions so this is not a problem. + * + * Could validate that the same vcpu_token does not attempt to run on + * different L1 vCPUs at the same time, but that would be a L1 KVM bug + * and it's not obviously worth a new data structure to do it. + */ + + env->tb_env->tb_offset += spapr_cpu->nested_tb_offset; + spapr_cpu->in_nested = true; + + hreg_compute_hflags(env); + tlb_flush(cs); + env->reserve_addr = -1; /* Reset the reservation */ + + /* + * The spapr hcall helper sets env->gpr[3] to the return value, but at + * this point the L1 is not returning from the hcall but rather we + * start running the L2, so r3 must not be clobbered, so return env->gpr[3] + * to leave it unchanged. + */ + return env->gpr[3]; +} + +void spapr_exit_nested(PowerPCCPU *cpu, int excp) +{ + CPUState *cs = CPU(cpu); + CPUPPCState *env = &cpu->env; + SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu); + target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value */ + target_ulong hv_ptr = spapr_cpu->nested_host_state->gpr[4]; + target_ulong regs_ptr = spapr_cpu->nested_host_state->gpr[5]; + struct kvmppc_hv_guest_state *hvstate; + struct kvmppc_pt_regs *regs; + hwaddr len; + uint64_t cr; + int i; + + assert(spapr_cpu->in_nested); + + cpu_ppc_hdecr_exit(env); + + len = sizeof(*hvstate); + hvstate = address_space_map(CPU(cpu)->as, hv_ptr, &len, true, + MEMTXATTRS_UNSPECIFIED); + if (len != sizeof(*hvstate)) { + address_space_unmap(CPU(cpu)->as, hvstate, len, 0, true); + r3_return = H_PARAMETER; + goto out_restore_l1; + } + + hvstate->cfar = env->cfar; + hvstate->lpcr = env->spr[SPR_LPCR]; + hvstate->pcr = env->spr[SPR_PCR]; + hvstate->dpdes = env->spr[SPR_DPDES]; + hvstate->hfscr = env->spr[SPR_HFSCR]; + + if (excp == POWERPC_EXCP_HDSI) { + hvstate->hdar = env->spr[SPR_HDAR]; + hvstate->hdsisr = env->spr[SPR_HDSISR]; + hvstate->asdr = env->spr[SPR_ASDR]; + } else if (excp == POWERPC_EXCP_HISI) { + hvstate->asdr = env->spr[SPR_ASDR]; + } + + /* HEIR should be implemented for HV mode and saved here. */ + hvstate->srr0 = env->spr[SPR_SRR0]; + hvstate->srr1 = env->spr[SPR_SRR1]; + hvstate->sprg[0] = env->spr[SPR_SPRG0]; + hvstate->sprg[1] = env->spr[SPR_SPRG1]; + hvstate->sprg[2] = env->spr[SPR_SPRG2]; + hvstate->sprg[3] = env->spr[SPR_SPRG3]; + hvstate->pidr = env->spr[SPR_BOOKS_PID]; + hvstate->ppr = env->spr[SPR_PPR]; + + /* Is it okay to specify write length larger than actual data written? */ + address_space_unmap(CPU(cpu)->as, hvstate, len, len, true); + + len = sizeof(*regs); + regs = address_space_map(CPU(cpu)->as, regs_ptr, &len, true, + MEMTXATTRS_UNSPECIFIED); + if (!regs || len != sizeof(*regs)) { + address_space_unmap(CPU(cpu)->as, regs, len, 0, true); + r3_return = H_P2; + goto out_restore_l1; + } + + len = sizeof(env->gpr); + assert(len == sizeof(regs->gpr)); + memcpy(regs->gpr, env->gpr, len); + + regs->link = env->lr; + regs->ctr = env->ctr; + regs->xer = cpu_read_xer(env); + + cr = 0; + for (i = 0; i < 8; i++) { + cr |= (env->crf[i] & 15) << (4 * (7 - i)); + } + regs->ccr = cr; + + if (excp == POWERPC_EXCP_MCHECK || + excp == POWERPC_EXCP_RESET || + excp == POWERPC_EXCP_SYSCALL) { + regs->nip = env->spr[SPR_SRR0]; + regs->msr = env->spr[SPR_SRR1] & env->msr_mask; + } else { + regs->nip = env->spr[SPR_HSRR0]; + regs->msr = env->spr[SPR_HSRR1] & env->msr_mask; + } + + /* Is it okay to specify write length larger than actual data written? */ + address_space_unmap(CPU(cpu)->as, regs, len, len, true); + +out_restore_l1: + memcpy(env->gpr, spapr_cpu->nested_host_state->gpr, sizeof(env->gpr)); + env->lr = spapr_cpu->nested_host_state->lr; + env->ctr = spapr_cpu->nested_host_state->ctr; + memcpy(env->crf, spapr_cpu->nested_host_state->crf, sizeof(env->crf)); + env->cfar = spapr_cpu->nested_host_state->cfar; + env->xer = spapr_cpu->nested_host_state->xer; + env->so = spapr_cpu->nested_host_state->so; + env->ov = spapr_cpu->nested_host_state->ov; + env->ov32 = spapr_cpu->nested_host_state->ov32; + env->ca32 = spapr_cpu->nested_host_state->ca32; + env->msr = spapr_cpu->nested_host_state->msr; + env->nip = spapr_cpu->nested_host_state->nip; + + assert(env->spr[SPR_LPIDR] != 0); + env->spr[SPR_LPCR] = spapr_cpu->nested_host_state->spr[SPR_LPCR]; + env->spr[SPR_LPIDR] = spapr_cpu->nested_host_state->spr[SPR_LPIDR]; + env->spr[SPR_PCR] = spapr_cpu->nested_host_state->spr[SPR_PCR]; + env->spr[SPR_DPDES] = 0; + env->spr[SPR_HFSCR] = spapr_cpu->nested_host_state->spr[SPR_HFSCR]; + env->spr[SPR_SRR0] = spapr_cpu->nested_host_state->spr[SPR_SRR0]; + env->spr[SPR_SRR1] = spapr_cpu->nested_host_state->spr[SPR_SRR1]; + env->spr[SPR_SPRG0] = spapr_cpu->nested_host_state->spr[SPR_SPRG0]; + env->spr[SPR_SPRG1] = spapr_cpu->nested_host_state->spr[SPR_SPRG1]; + env->spr[SPR_SPRG2] = spapr_cpu->nested_host_state->spr[SPR_SPRG2]; + env->spr[SPR_SPRG3] = spapr_cpu->nested_host_state->spr[SPR_SPRG3]; + env->spr[SPR_BOOKS_PID] = spapr_cpu->nested_host_state->spr[SPR_BOOKS_PID]; + env->spr[SPR_PPR] = spapr_cpu->nested_host_state->spr[SPR_PPR]; + + /* + * Return the interrupt vector address from H_ENTER_NESTED to the L1 + * (or error code). + */ + env->gpr[3] = r3_return; + + env->tb_env->tb_offset -= spapr_cpu->nested_tb_offset; + spapr_cpu->in_nested = false; + + hreg_compute_hflags(env); + tlb_flush(cs); + env->reserve_addr = -1; /* Reset the reservation */ + + g_free(spapr_cpu->nested_host_state); + spapr_cpu->nested_host_state = NULL; +} + static void hypercall_register_types(void) { hypercall_register_softmmu(); @@ -1552,6 +1880,11 @@ static void hypercall_register_types(void) spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support); spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt); + + spapr_register_hypercall(KVMPPC_H_SET_PARTITION_TABLE, h_set_ptbl); + spapr_register_hypercall(KVMPPC_H_ENTER_NESTED, h_enter_nested); + spapr_register_hypercall(KVMPPC_H_TLB_INVALIDATE, h_tlb_invalidate); + spapr_register_hypercall(KVMPPC_H_COPY_TOFROM_GUEST, h_copy_tofrom_guest); } type_init(hypercall_register_types) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 91de1052f2..c4c97da5de 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "hw/ppc/spapr_drc.h" #include "hw/ppc/spapr_nvdimm.h" @@ -30,6 +31,10 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +#include "block/thread-pool.h" +#include "migration/vmstate.h" +#include "qemu/pmem.h" +#include "hw/qdev-properties.h" /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */ /* SCM device is unable to persist memory contents */ @@ -47,11 +52,25 @@ /* Have an explicit check for alignment */ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); +#define TYPE_SPAPR_NVDIMM "spapr-nvdimm" +OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM) + +struct SPAPRNVDIMMClass { + /* private */ + NVDIMMClass parent_class; + + /* public */ + void (*realize)(NVDIMMDevice *dimm, Error **errp); + void (*unrealize)(NVDIMMDevice *dimm, Error **errp); +}; + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); const MachineState *ms = MACHINE(hotplug_dev); + PCDIMMDevice *dimm = PC_DIMM(nvdimm); + MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem); g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; @@ -89,6 +108,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, return false; } + if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) && + (memory_region_get_fd(mr) < 0)) { + error_setg(errp, "spapr-nvdimm device requires the " + "memdev %s to be of memory-backend-file type", + object_get_canonical_path_component(OBJECT(dimm->hostmem))); + return false; + } + return true; } @@ -160,6 +187,20 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, "operating-system"))); _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0)); + if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) { + bool is_pmem = false, pmem_override = false; + PCDIMMDevice *dimm = PC_DIMM(nvdimm); + HostMemoryBackend *hostmem = dimm->hostmem; + + is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL); + pmem_override = object_property_get_bool(OBJECT(nvdimm), + "pmem-override", NULL); + if (!is_pmem || pmem_override) { + _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required", + NULL, 0)); + } + } + return child_offset; } @@ -375,6 +416,293 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +typedef struct SpaprNVDIMMDeviceFlushState { + uint64_t continue_token; + int64_t hcall_ret; + uint32_t drcidx; + + QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node; +} SpaprNVDIMMDeviceFlushState; + +typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice; +struct SpaprNVDIMMDevice { + /* private */ + NVDIMMDevice parent_obj; + + bool hcall_flush_required; + uint64_t nvdimm_flush_token; + QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states; + QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states; + + /* public */ + + /* + * The 'on' value for this property forced the qemu to enable the hcall + * flush for the nvdimm device even if the backend is a pmem + */ + bool pmem_override; +}; + +static int flush_worker_cb(void *opaque) +{ + SpaprNVDIMMDeviceFlushState *state = opaque; + SpaprDrc *drc = spapr_drc_by_index(state->drcidx); + PCDIMMDevice *dimm = PC_DIMM(drc->dev); + HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem); + int backend_fd = memory_region_get_fd(&backend->mr); + + if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) { + MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem); + void *ptr = memory_region_get_ram_ptr(mr); + size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP, + NULL); + + /* flush pmem backend */ + pmem_persist(ptr, size); + } else { + /* flush raw backing image */ + if (qemu_fdatasync(backend_fd) < 0) { + error_report("papr_scm: Could not sync nvdimm to backend file: %s", + strerror(errno)); + return H_HARDWARE; + } + } + + return H_SUCCESS; +} + +static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret) +{ + SpaprNVDIMMDeviceFlushState *state = opaque; + SpaprDrc *drc = spapr_drc_by_index(state->drcidx); + SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(drc->dev); + + state->hcall_ret = hcall_ret; + QLIST_REMOVE(state, node); + QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node); +} + +static int spapr_nvdimm_flush_post_load(void *opaque, int version_id) +{ + SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque; + SpaprNVDIMMDeviceFlushState *state; + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); + HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem); + bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL); + bool pmem_override = object_property_get_bool(OBJECT(s_nvdimm), + "pmem-override", NULL); + bool dest_hcall_flush_required = pmem_override || !is_pmem; + + if (!s_nvdimm->hcall_flush_required && dest_hcall_flush_required) { + error_report("The file backend for the spapr-nvdimm device %s at " + "source is a pmem, use pmem=on and pmem-override=off to " + "continue.", DEVICE(s_nvdimm)->id); + return -EINVAL; + } + if (s_nvdimm->hcall_flush_required && !dest_hcall_flush_required) { + error_report("The guest expects hcall-flush support for the " + "spapr-nvdimm device %s, use pmem_override=on to " + "continue.", DEVICE(s_nvdimm)->id); + return -EINVAL; + } + + QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) { + thread_pool_submit_aio(pool, flush_worker_cb, state, + spapr_nvdimm_flush_completion_cb, state); + } + + return 0; +} + +static const VMStateDescription vmstate_spapr_nvdimm_flush_state = { + .name = "spapr_nvdimm_flush_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState), + VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState), + VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState), + VMSTATE_END_OF_LIST() + }, +}; + +const VMStateDescription vmstate_spapr_nvdimm_states = { + .name = "spapr_nvdimm_states", + .version_id = 1, + .minimum_version_id = 1, + .post_load = spapr_nvdimm_flush_post_load, + .fields = (VMStateField[]) { + VMSTATE_BOOL(hcall_flush_required, SpaprNVDIMMDevice), + VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice), + VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1, + vmstate_spapr_nvdimm_flush_state, + SpaprNVDIMMDeviceFlushState, node), + VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1, + vmstate_spapr_nvdimm_flush_state, + SpaprNVDIMMDeviceFlushState, node), + VMSTATE_END_OF_LIST() + }, +}; + +/* + * Assign a token and reserve it for the new flush state. + */ +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state( + SpaprNVDIMMDevice *spapr_nvdimm) +{ + SpaprNVDIMMDeviceFlushState *state; + + state = g_malloc0(sizeof(*state)); + + spapr_nvdimm->nvdimm_flush_token++; + /* Token zero is presumed as no job pending. Assert on overflow to zero */ + g_assert(spapr_nvdimm->nvdimm_flush_token != 0); + + state->continue_token = spapr_nvdimm->nvdimm_flush_token; + + QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node); + + return state; +} + +/* + * spapr_nvdimm_finish_flushes + * Waits for all pending flush requests to complete + * their execution and free the states + */ +void spapr_nvdimm_finish_flushes(void) +{ + SpaprNVDIMMDeviceFlushState *state, *next; + GSList *list, *nvdimms; + + /* + * Called on reset path, the main loop thread which calls + * the pending BHs has gotten out running in the reset path, + * finally reaching here. Other code path being guest + * h_client_architecture_support, thats early boot up. + */ + nvdimms = nvdimm_get_device_list(); + for (list = nvdimms; list; list = list->next) { + NVDIMMDevice *nvdimm = list->data; + if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) { + SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm); + while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) { + aio_poll(qemu_get_aio_context(), true); + } + + QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states, + node, next) { + QLIST_REMOVE(state, node); + g_free(state); + } + } + } + g_slist_free(nvdimms); +} + +/* + * spapr_nvdimm_get_flush_status + * Fetches the status of the hcall worker and returns + * H_LONG_BUSY_ORDER_10_MSEC if the worker is still running. + */ +static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm, + uint64_t token) +{ + SpaprNVDIMMDeviceFlushState *state, *node; + + QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) { + if (state->continue_token == token) { + return H_LONG_BUSY_ORDER_10_MSEC; + } + } + + QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states, + node, node) { + if (state->continue_token == token) { + int ret = state->hcall_ret; + QLIST_REMOVE(state, node); + g_free(state); + return ret; + } + } + + /* If not found in complete list too, invalid token */ + return H_P2; +} + +/* + * H_SCM_FLUSH + * Input: drc_index, continue-token + * Out: continue-token + * Return Value: H_SUCCESS, H_Parameter, H_P2, H_LONG_BUSY_ORDER_10_MSEC, + * H_UNSUPPORTED + * + * Given a DRC Index Flush the data to backend NVDIMM device. The hcall returns + * H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer time and the hcall + * needs to be issued multiple times in order to be completely serviced. The + * continue-token from the output to be passed in the argument list of + * subsequent hcalls until the hcall is completely serviced at which point + * H_SUCCESS or other error is returned. + */ +static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong opcode, target_ulong *args) +{ + int ret; + uint32_t drc_index = args[0]; + uint64_t continue_token = args[1]; + SpaprDrc *drc = spapr_drc_by_index(drc_index); + PCDIMMDevice *dimm; + HostMemoryBackend *backend = NULL; + SpaprNVDIMMDeviceFlushState *state; + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); + int fd; + + if (!drc || !drc->dev || + spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { + return H_PARAMETER; + } + + dimm = PC_DIMM(drc->dev); + if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) { + return H_PARAMETER; + } + if (continue_token == 0) { + bool is_pmem = false, pmem_override = false; + backend = MEMORY_BACKEND(dimm->hostmem); + fd = memory_region_get_fd(&backend->mr); + + if (fd < 0) { + return H_UNSUPPORTED; + } + + is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL); + pmem_override = object_property_get_bool(OBJECT(dimm), + "pmem-override", NULL); + if (is_pmem && !pmem_override) { + return H_UNSUPPORTED; + } + + state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm)); + if (!state) { + return H_HARDWARE; + } + + state->drcidx = drc_index; + + thread_pool_submit_aio(pool, flush_worker_cb, state, + spapr_nvdimm_flush_completion_cb, state); + + continue_token = state->continue_token; + } + + ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token); + if (H_IS_LONG_BUSY(ret)) { + args[0] = continue_token; + } + + return ret; +} + static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, target_ulong *args) { @@ -523,6 +851,70 @@ static void spapr_scm_register_types(void) spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem); spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all); spapr_register_hypercall(H_SCM_HEALTH, h_scm_health); + spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush); } type_init(spapr_scm_register_types) + +static void spapr_nvdimm_realize(NVDIMMDevice *dimm, Error **errp) +{ + SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(dimm); + HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(dimm)->hostmem); + bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL); + bool pmem_override = object_property_get_bool(OBJECT(dimm), "pmem-override", + NULL); + if (!is_pmem || pmem_override) { + s_nvdimm->hcall_flush_required = true; + } + + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, + &vmstate_spapr_nvdimm_states, dimm); +} + +static void spapr_nvdimm_unrealize(NVDIMMDevice *dimm) +{ + vmstate_unregister(NULL, &vmstate_spapr_nvdimm_states, dimm); +} + +static Property spapr_nvdimm_properties[] = { +#ifdef CONFIG_LIBPMEM + DEFINE_PROP_BOOL("pmem-override", SpaprNVDIMMDevice, pmem_override, false), +#endif + DEFINE_PROP_END_OF_LIST(), +}; + +static void spapr_nvdimm_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + NVDIMMClass *nvc = NVDIMM_CLASS(oc); + + nvc->realize = spapr_nvdimm_realize; + nvc->unrealize = spapr_nvdimm_unrealize; + + device_class_set_props(dc, spapr_nvdimm_properties); +} + +static void spapr_nvdimm_init(Object *obj) +{ + SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(obj); + + s_nvdimm->hcall_flush_required = false; + QLIST_INIT(&s_nvdimm->pending_nvdimm_flush_states); + QLIST_INIT(&s_nvdimm->completed_nvdimm_flush_states); +} + +static TypeInfo spapr_nvdimm_info = { + .name = TYPE_SPAPR_NVDIMM, + .parent = TYPE_NVDIMM, + .class_init = spapr_nvdimm_class_init, + .class_size = sizeof(SPAPRNVDIMMClass), + .instance_size = sizeof(SpaprNVDIMMDevice), + .instance_init = spapr_nvdimm_init, +}; + +static void spapr_nvdimm_register_types(void) +{ + type_register_static(&spapr_nvdimm_info); +} + +type_init(spapr_nvdimm_register_types) |