summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c10
-rw-r--r--arch/powerpc/kernel/entry_64.S193
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S284
-rw-r--r--arch/powerpc/kernel/fadump.c196
-rw-r--r--arch/powerpc/kernel/idle_book3s.S188
-rw-r--r--arch/powerpc/kernel/irq.c62
-rw-r--r--arch/powerpc/kernel/kprobes.c8
-rw-r--r--arch/powerpc/kernel/mce.c1
-rw-r--r--arch/powerpc/kernel/mce_power.c3
-rw-r--r--arch/powerpc/kernel/misc_32.S6
-rw-r--r--arch/powerpc/kernel/optprobes.c53
-rw-r--r--arch/powerpc/kernel/process.c45
-rw-r--r--arch/powerpc/kernel/setup-common.c4
-rw-r--r--arch/powerpc/kernel/smp.c7
-rw-r--r--arch/powerpc/kernel/time.c96
-rw-r--r--arch/powerpc/kernel/tm.S4
-rw-r--r--arch/powerpc/kernel/traps.c3
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S61
19 files changed, 823 insertions, 403 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index e132902e1f14..0845eebc5af3 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
-# timers used by tracing
-CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
endif
obj-y := cputable.o ptrace.o syscalls.o \
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index ae8e89e0d083..6e95c2c19a7e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -100,12 +100,12 @@ int main(void)
OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]);
#endif
OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode);
- OFFSET(THREAD_FPSTATE, thread_struct, fp_state);
+ OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr);
OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area);
OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr);
OFFSET(THREAD_LOAD_FP, thread_struct, load_fp);
#ifdef CONFIG_ALTIVEC
- OFFSET(THREAD_VRSTATE, thread_struct, vr_state);
+ OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr);
OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area);
OFFSET(THREAD_VRSAVE, thread_struct, vrsave);
OFFSET(THREAD_USED_VR, thread_struct, used_vr);
@@ -145,9 +145,9 @@ int main(void)
OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr);
OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr);
OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs);
- OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state);
+ OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr);
OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave);
- OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state);
+ OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr);
/* Local pt_regs on stack for Transactional Memory funcs. */
DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
sizeof(struct pt_regs) + 16);
@@ -745,9 +745,11 @@ int main(void)
OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
+ OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
#endif
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+ DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);
#ifdef CONFIG_PPC_8xx
DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index bfbad08a1207..49d8422767b4 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -57,7 +57,7 @@ system_call_common:
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
- bne tabort_syscall
+ bne .Ltabort_syscall
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
andi. r10,r12,MSR_PR
@@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
mtmsrd r11,1
#endif /* CONFIG_PPC_BOOK3E */
+system_call: /* label this so stack traces look sane */
/* We do need to set SOFTE in the stack frame or the return
* from interrupt will be painful
*/
@@ -152,11 +153,11 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
CURRENT_THREAD_INFO(r11, r1)
ld r10,TI_FLAGS(r11)
andi. r11,r10,_TIF_SYSCALL_DOTRACE
- bne syscall_dotrace /* does not return */
+ bne .Lsyscall_dotrace /* does not return */
cmpldi 0,r0,NR_syscalls
- bge- syscall_enosys
+ bge- .Lsyscall_enosys
-system_call: /* label this so stack traces look sane */
+.Lsyscall:
/*
* Need to vector to 32 Bit or default sys_call_table here,
* based on caller's run-mode / personality.
@@ -185,8 +186,20 @@ system_call: /* label this so stack traces look sane */
#ifdef CONFIG_PPC_BOOK3S
/* No MSR:RI on BookE */
andi. r10,r8,MSR_RI
- beq- unrecov_restore
+ beq- .Lunrecov_restore
#endif
+
+/*
+ * This is a few instructions into the actual syscall exit path (which actually
+ * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the
+ * number of visible symbols for profiling purposes.
+ *
+ * We can probe from system_call until this point as MSR_RI is set. But once it
+ * is cleared below, we won't be able to take a trap.
+ *
+ * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL().
+ */
+system_call_exit:
/*
* Disable interrupts so current_thread_info()->flags can't change,
* and so that we don't get interrupted after loading SRR0/1.
@@ -208,31 +221,21 @@ system_call: /* label this so stack traces look sane */
ld r9,TI_FLAGS(r12)
li r11,-MAX_ERRNO
andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
- bne- syscall_exit_work
+ bne- .Lsyscall_exit_work
- andi. r0,r8,MSR_FP
- beq 2f
+ /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */
+ li r7,MSR_FP
#ifdef CONFIG_ALTIVEC
- andis. r0,r8,MSR_VEC@h
- bne 3f
-#endif
-2: addi r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
- li r10,MSR_RI
- mtmsrd r10,1 /* Restore RI */
-#endif
- bl restore_math
-#ifdef CONFIG_PPC_BOOK3S
- li r11,0
- mtmsrd r11,1
+ oris r7,r7,MSR_VEC@h
#endif
- ld r8,_MSR(r1)
- ld r3,RESULT(r1)
- li r11,-MAX_ERRNO
+ and r0,r8,r7
+ cmpd r0,r7
+ bne .Lsyscall_restore_math
+.Lsyscall_restore_math_cont:
-3: cmpld r3,r11
+ cmpld r3,r11
ld r5,_CCR(r1)
- bge- syscall_error
+ bge- .Lsyscall_error
.Lsyscall_error_cont:
ld r7,_NIP(r1)
BEGIN_FTR_SECTION
@@ -258,14 +261,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
RFI
b . /* prevent speculative execution */
-syscall_error:
+.Lsyscall_error:
oris r5,r5,0x1000 /* Set SO bit in CR */
neg r3,r3
std r5,_CCR(r1)
b .Lsyscall_error_cont
-
+
+.Lsyscall_restore_math:
+ /*
+ * Some initial tests from restore_math to avoid the heavyweight
+ * C code entry and MSR manipulations.
+ */
+ LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK)
+ and. r0,r0,r8
+ bne 1f
+
+ ld r7,PACACURRENT(r13)
+ lbz r0,THREAD+THREAD_LOAD_FP(r7)
+#ifdef CONFIG_ALTIVEC
+ lbz r6,THREAD+THREAD_LOAD_VEC(r7)
+ add r0,r0,r6
+#endif
+ cmpdi r0,0
+ beq .Lsyscall_restore_math_cont
+
+1: addi r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+ li r10,MSR_RI
+ mtmsrd r10,1 /* Restore RI */
+#endif
+ bl restore_math
+#ifdef CONFIG_PPC_BOOK3S
+ li r11,0
+ mtmsrd r11,1
+#endif
+ /* Restore volatiles, reload MSR from updated one */
+ ld r8,_MSR(r1)
+ ld r3,RESULT(r1)
+ li r11,-MAX_ERRNO
+ b .Lsyscall_restore_math_cont
+
/* Traced system call support */
-syscall_dotrace:
+.Lsyscall_dotrace:
bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
bl do_syscall_trace_enter
@@ -286,23 +323,23 @@ syscall_dotrace:
ld r7,GPR7(r1)
ld r8,GPR8(r1)
- /* Repopulate r9 and r10 for the system_call path */
+ /* Repopulate r9 and r10 for the syscall path */
addi r9,r1,STACK_FRAME_OVERHEAD
CURRENT_THREAD_INFO(r10, r1)
ld r10,TI_FLAGS(r10)
cmpldi r0,NR_syscalls
- blt+ system_call
+ blt+ .Lsyscall
/* Return code is already in r3 thanks to do_syscall_trace_enter() */
b .Lsyscall_exit
-syscall_enosys:
+.Lsyscall_enosys:
li r3,-ENOSYS
b .Lsyscall_exit
-syscall_exit_work:
+.Lsyscall_exit_work:
#ifdef CONFIG_PPC_BOOK3S
li r10,MSR_RI
mtmsrd r10,1 /* Restore RI */
@@ -362,7 +399,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
b ret_from_except
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-tabort_syscall:
+.Ltabort_syscall:
/* Firstly we need to enable TM in the kernel */
mfmsr r10
li r9, 1
@@ -388,6 +425,8 @@ tabort_syscall:
rfid
b . /* prevent speculative execution */
#endif
+_ASM_NOKPROBE_SYMBOL(system_call_common);
+_ASM_NOKPROBE_SYMBOL(system_call_exit);
/* Save non-volatile GPRs, if not already saved. */
_GLOBAL(save_nvgprs)
@@ -398,6 +437,7 @@ _GLOBAL(save_nvgprs)
clrrdi r0,r11,1
std r0,_TRAP(r1)
blr
+_ASM_NOKPROBE_SYMBOL(save_nvgprs);
/*
@@ -488,33 +528,30 @@ _GLOBAL(_switch)
std r23,_CCR(r1)
std r1,KSP(r3) /* Set old stack pointer */
-#ifdef CONFIG_SMP
- /* We need a sync somewhere here to make sure that if the
- * previous task gets rescheduled on another CPU, it sees all
- * stores it has performed on this one.
+ /*
+ * On SMP kernels, care must be taken because a task may be
+ * scheduled off CPUx and on to CPUy. Memory ordering must be
+ * considered.
+ *
+ * Cacheable stores on CPUx will be visible when the task is
+ * scheduled on CPUy by virtue of the core scheduler barriers
+ * (see "Notes on Program-Order guarantees on SMP systems." in
+ * kernel/sched/core.c).
+ *
+ * Uncacheable stores in the case of involuntary preemption must
+ * be taken care of. The smp_mb__before_spin_lock() in __schedule()
+ * is implemented as hwsync on powerpc, which orders MMIO too. So
+ * long as there is an hwsync in the context switch path, it will
+ * be executed on the source CPU after the task has performed
+ * all MMIO ops on that CPU, and on the destination CPU before the
+ * task performs any MMIO ops there.
*/
- sync
-#endif /* CONFIG_SMP */
/*
- * If we optimise away the clear of the reservation in system
- * calls because we know the CPU tracks the address of the
- * reservation, then we need to clear it here to cover the
- * case that the kernel context switch path has no larx
- * instructions.
+ * The kernel context switch path must contain a spin_lock,
+ * which contains larx/stcx, which will clear any reservation
+ * of the task being switched.
*/
-BEGIN_FTR_SECTION
- ldarx r6,0,r1
-END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
-
-BEGIN_FTR_SECTION
-/*
- * A cp_abort (copy paste abort) here ensures that when context switching, a
- * copy from one process can't leak into the paste of another.
- */
- PPC_CP_ABORT
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
#ifdef CONFIG_PPC_BOOK3S
/* Cancel all explict user streams as they will have no use after context
* switch and will stop the HW from creating streams itself
@@ -583,6 +620,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
top of the kernel stack. */
addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE
+ /*
+ * PMU interrupts in radix may come in here. They will use r1, not
+ * PACAKSAVE, so this stack switch will not cause a problem. They
+ * will store to the process stack, which may then be migrated to
+ * another CPU. However the rq lock release on this CPU paired with
+ * the rq lock acquire on the new CPU before the stack becomes
+ * active on the new CPU, will order those stores.
+ */
mr r1,r8 /* start using new stack pointer */
std r7,PACAKSAVE(r13)
@@ -763,11 +808,11 @@ restore:
ld r5,SOFTE(r1)
lbz r6,PACASOFTIRQEN(r13)
cmpwi cr0,r5,0
- beq restore_irq_off
+ beq .Lrestore_irq_off
/* We are enabling, were we already enabled ? Yes, just return */
cmpwi cr0,r6,1
- beq cr0,do_restore
+ beq cr0,.Ldo_restore
/*
* We are about to soft-enable interrupts (we are hard disabled
@@ -776,14 +821,14 @@ restore:
*/
lbz r0,PACAIRQHAPPENED(r13)
cmpwi cr0,r0,0
- bne- restore_check_irq_replay
+ bne- .Lrestore_check_irq_replay
/*
* Get here when nothing happened while soft-disabled, just
* soft-enable and move-on. We will hard-enable as a side
* effect of rfi
*/
-restore_no_replay:
+.Lrestore_no_replay:
TRACE_ENABLE_INTS
li r0,1
stb r0,PACASOFTIRQEN(r13);
@@ -791,7 +836,7 @@ restore_no_replay:
/*
* Final return path. BookE is handled in a different file
*/
-do_restore:
+.Ldo_restore:
#ifdef CONFIG_PPC_BOOK3E
b exception_return_book3e
#else
@@ -825,7 +870,7 @@ fast_exception_return:
REST_8GPRS(5, r1)
andi. r0,r3,MSR_RI
- beq- unrecov_restore
+ beq- .Lunrecov_restore
/* Load PPR from thread struct before we clear MSR:RI */
BEGIN_FTR_SECTION
@@ -883,7 +928,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
* make sure that in this case, we also clear PACA_IRQ_HARD_DIS
* or that bit can get out of sync and bad things will happen
*/
-restore_irq_off:
+.Lrestore_irq_off:
ld r3,_MSR(r1)
lbz r7,PACAIRQHAPPENED(r13)
andi. r0,r3,MSR_EE
@@ -893,13 +938,13 @@ restore_irq_off:
1: li r0,0
stb r0,PACASOFTIRQEN(r13);
TRACE_DISABLE_INTS
- b do_restore
+ b .Ldo_restore
/*
* Something did happen, check if a re-emit is needed
* (this also clears paca->irq_happened)
*/
-restore_check_irq_replay:
+.Lrestore_check_irq_replay:
/* XXX: We could implement a fast path here where we check
* for irq_happened being just 0x01, in which case we can
* clear it and return. That means that we would potentially
@@ -909,7 +954,7 @@ restore_check_irq_replay:
*/
bl __check_irq_replay
cmpwi cr0,r3,0
- beq restore_no_replay
+ beq .Lrestore_no_replay
/*
* We need to re-emit an interrupt. We do so by re-using our
@@ -958,10 +1003,18 @@ restore_check_irq_replay:
#endif /* CONFIG_PPC_DOORBELL */
1: b ret_from_except /* What else to do here ? */
-unrecov_restore:
+.Lunrecov_restore:
addi r3,r1,STACK_FRAME_OVERHEAD
bl unrecoverable_exception
- b unrecov_restore
+ b .Lunrecov_restore
+
+_ASM_NOKPROBE_SYMBOL(ret_from_except);
+_ASM_NOKPROBE_SYMBOL(ret_from_except_lite);
+_ASM_NOKPROBE_SYMBOL(resume_kernel);
+_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq);
+_ASM_NOKPROBE_SYMBOL(restore);
+_ASM_NOKPROBE_SYMBOL(fast_exception_return);
+
#ifdef CONFIG_PPC_RTAS
/*
@@ -1038,6 +1091,8 @@ _GLOBAL(enter_rtas)
rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
andc r6,r0,r9
+
+__enter_rtas:
sync /* disable interrupts so SRR0/1 */
mtmsrd r0 /* don't get trashed */
@@ -1074,6 +1129,8 @@ rtas_return_loc:
mtspr SPRN_SRR1,r4
rfid
b . /* prevent speculative execution */
+_ASM_NOKPROBE_SYMBOL(__enter_rtas)
+_ASM_NOKPROBE_SYMBOL(rtas_return_loc)
.align 3
1: .llong rtas_restore_regs
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b886795060fd..4c18a5fbb4bb 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -99,7 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
#ifdef CONFIG_PPC_P7_NAP
/*
* If running native on arch 2.06 or later, check if we are waking up
- * from nap/sleep/winkle, and branch to idle handler.
+ * from nap/sleep/winkle, and branch to idle handler. This tests SRR1
+ * bits 46:47. A non-0 value indicates that we are coming from a power
+ * saving state. The idle wakeup handler initially runs in real mode,
+ * but we branch to the 0xc000... address so we can turn on relocation
+ * with mtmsr.
*/
#define IDLETEST(n) \
BEGIN_FTR_SECTION ; \
@@ -107,7 +111,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
rlwinm. r10,r10,47-31,30,31 ; \
beq- 1f ; \
cmpwi cr3,r10,2 ; \
- BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \
+ BRANCH_TO_C000(r10, system_reset_idle_common) ; \
1: \
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#else
@@ -128,6 +132,7 @@ EXC_VIRT_NONE(0x4100, 0x100)
#ifdef CONFIG_PPC_P7_NAP
EXC_COMMON_BEGIN(system_reset_idle_common)
+ mfspr r12,SPRN_SRR1
b pnv_powersave_wakeup
#endif
@@ -507,46 +512,22 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_DAR
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crset 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- /*
- * We can't just use a direct branch to slb_miss_realmode
- * because the distance from here to there depends on where
- * the kernel ends up being put.
- */
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_REAL_END(data_access_slb, 0x380, 0x80)
EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_DAR
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crset 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- /*
- * We can't just use a direct branch to slb_miss_realmode
- * because the distance from here to there depends on where
- * the kernel ends up being put.
- */
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
@@ -575,88 +556,82 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crclr 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crclr 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
TRAMP_KVM(PACA_EXSLB, 0x480)
-/* This handler is used by both 0x380 and 0x480 slb miss interrupts */
-EXC_COMMON_BEGIN(slb_miss_realmode)
+/*
+ * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as
+ * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled.
+ */
+EXC_COMMON_BEGIN(slb_miss_common)
/*
* r13 points to the PACA, r9 contains the saved CR,
- * r12 contain the saved SRR1, SRR0 is still ready for return
+ * r12 contains the saved r3,
+ * r11 contain the saved SRR1, SRR0 is still ready for return
* r3 has the faulting address
* r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
* cr6.eq is set for a D-SLB miss, clear for a I-SLB miss
* We assume we aren't going to take any exceptions during this
* procedure.
*/
mflr r10
-#ifdef CONFIG_RELOCATABLE
- mtctr r11
-#endif
-
stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
std r10,PACA_EXSLB+EX_LR(r13) /* save LR */
- std r3,PACA_EXSLB+EX_DAR(r13)
+
+ /*
+ * Test MSR_RI before calling slb_allocate_realmode, because the
+ * MSR in r11 gets clobbered. However we still want to allocate
+ * SLB in case MSR_RI=0, to minimise the risk of getting stuck in
+ * recursive SLB faults. So use cr5 for this, which is preserved.
+ */
+ andi. r11,r11,MSR_RI /* check for unrecoverable exception */
+ cmpdi cr5,r11,MSR_RI
crset 4*cr0+eq
#ifdef CONFIG_PPC_STD_MMU_64
BEGIN_MMU_FTR_SECTION
- bl slb_allocate_realmode
+ bl slb_allocate
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
#endif
ld r10,PACA_EXSLB+EX_LR(r13)
- ld r3,PACA_EXSLB+EX_R3(r13)
lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
mtlr r10
- beq 8f /* if bad address, make full stack frame */
+ beq- 8f /* if bad address, make full stack frame */
- andi. r10,r12,MSR_RI /* check for unrecoverable exception */
- beq- 2f
+ bne- cr5,2f /* if unrecoverable exception, oops */
/* All done -- return from exception. */
.machine push
.machine "power4"
mtcrf 0x80,r9
+ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */
mtcrf 0x02,r9 /* I/D indication is in cr6 */
mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
.machine pop
+ RESTORE_CTR(r9, PACA_EXSLB)
RESTORE_PPR_PACA(PACA_EXSLB, r9)
+ mr r3,r12
ld r9,PACA_EXSLB+EX_R9(r13)
ld r10,PACA_EXSLB+EX_R10(r13)
ld r11,PACA_EXSLB+EX_R11(r13)
@@ -665,7 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
rfid
b . /* prevent speculative execution */
-2: mfspr r11,SPRN_SRR0
+2: std r3,PACA_EXSLB+EX_DAR(r13)
+ mr r3,r12
+ mfspr r11,SPRN_SRR0
+ mfspr r12,SPRN_SRR1
LOAD_HANDLER(r10,unrecov_slb)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
@@ -673,7 +651,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
rfid
b .
-8: mfspr r11,SPRN_SRR0
+8: std r3,PACA_EXSLB+EX_DAR(r13)
+ mr r3,r12
+ mfspr r11,SPRN_SRR0
+ mfspr r12,SPRN_SRR1
LOAD_HANDLER(r10,bad_addr_slb)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
@@ -821,46 +802,80 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00)
TRAMP_KVM(PACA_EXGEN, 0xb00)
EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
+/*
+ * system call / hypercall (0xc00, 0x4c00)
+ *
+ * The system call exception is invoked with "sc 0" and does not alter HV bit.
+ * There is support for kernel code to invoke system calls but there are no
+ * in-tree users.
+ *
+ * The hypercall is invoked with "sc 1" and sets HV=1.
+ *
+ * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to
+ * 0x4c00 virtual mode.
+ *
+ * Call convention:
+ *
+ * syscall register convention is in Documentation/powerpc/syscall64-abi.txt
+ *
+ * For hypercalls, the register convention is as follows:
+ * r0 volatile
+ * r1-2 nonvolatile
+ * r3 volatile parameter and return value for status
+ * r4-r10 volatile input and output value
+ * r11 volatile hypercall number and output value
+ * r12 volatile
+ * r13-r31 nonvolatile
+ * LR nonvolatile
+ * CTR volatile
+ * XER volatile
+ * CR0-1 CR5-7 volatile
+ * CR2-4 nonvolatile
+ * Other registers nonvolatile
+ *
+ * The intersection of volatile registers that don't contain possible
+ * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs
+ * upon entry without saving.
+ */
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
- /*
- * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
- * that support it) before changing to HMT_MEDIUM. That allows the KVM
- * code to save that value into the guest state (it is the guest's PPR
- * value). Otherwise just change to HMT_MEDIUM as userspace has
- * already saved the PPR.
- */
+ /*
+ * There is a little bit of juggling to get syscall and hcall
+ * working well. Save r10 in ctr to be restored in case it is a
+ * hcall.
+ *
+ * Userspace syscalls have already saved the PPR, hcalls must save
+ * it before setting HMT_MEDIUM.
+ */
#define SYSCALL_KVMTEST \
- SET_SCRATCH0(r13); \
+ mr r12,r13; \
GET_PACA(r13); \
- std r9,PACA_EXGEN+EX_R9(r13); \
- OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \
+ mtctr r10; \
+ KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \
HMT_MEDIUM; \
- std r10,PACA_EXGEN+EX_R10(r13); \
- OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \
- mfcr r9; \
- KVMTEST_PR(0xc00); \
- GET_SCRATCH0(r13)
+ mr r9,r12; \
#else
#define SYSCALL_KVMTEST \
- HMT_MEDIUM
+ HMT_MEDIUM; \
+ mr r9,r13; \
+ GET_PACA(r13);
#endif
#define LOAD_SYSCALL_HANDLER(reg) \
__LOAD_HANDLER(reg, system_call_common)
-/* Syscall routine is used twice, in reloc-off and reloc-on paths */
-#define SYSCALL_PSERIES_1 \
+#define SYSCALL_FASTENDIAN_TEST \
BEGIN_FTR_SECTION \
cmpdi r0,0x1ebe ; \
beq- 1f ; \
END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
- mr r9,r13 ; \
- GET_PACA(r13) ; \
- mfspr r11,SPRN_SRR0 ; \
-0:
-#define SYSCALL_PSERIES_2_RFID \
+/*
+ * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9,
+ * and HMT_MEDIUM.
+ */
+#define SYSCALL_REAL \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
LOAD_SYSCALL_HANDLER(r10) ; \
mtspr SPRN_SRR0,r10 ; \
@@ -869,11 +884,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
rfid ; \
b . ; /* prevent speculative execution */
-#define SYSCALL_PSERIES_3 \
+#define SYSCALL_FASTENDIAN \
/* Fast LE/BE switch system call */ \
1: mfspr r12,SPRN_SRR1 ; \
xori r12,r12,MSR_LE ; \
mtspr SPRN_SRR1,r12 ; \
+ mr r13,r9 ; \
rfid ; /* return to userspace */ \
b . ; /* prevent speculative execution */
@@ -882,16 +898,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
* We can't branch directly so we do it via the CTR which
* is volatile across system calls.
*/
-#define SYSCALL_PSERIES_2_DIRECT \
- LOAD_SYSCALL_HANDLER(r12) ; \
- mtctr r12 ; \
+#define SYSCALL_VIRT \
+ LOAD_SYSCALL_HANDLER(r10) ; \
+ mtctr r10 ; \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
li r10,MSR_RI ; \
mtmsrd r10,1 ; \
bctr ;
#else
/* We can branch directly */
-#define SYSCALL_PSERIES_2_DIRECT \
+#define SYSCALL_VIRT \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
li r10,MSR_RI ; \
mtmsrd r10,1 ; /* Set RI (EE=0) */ \
@@ -899,20 +917,43 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
#endif
EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
- SYSCALL_KVMTEST
- SYSCALL_PSERIES_1
- SYSCALL_PSERIES_2_RFID
- SYSCALL_PSERIES_3
+ SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+ SYSCALL_FASTENDIAN_TEST
+ SYSCALL_REAL
+ SYSCALL_FASTENDIAN
EXC_REAL_END(system_call, 0xc00, 0x100)
EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
- SYSCALL_KVMTEST
- SYSCALL_PSERIES_1
- SYSCALL_PSERIES_2_DIRECT
- SYSCALL_PSERIES_3
+ SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+ SYSCALL_FASTENDIAN_TEST
+ SYSCALL_VIRT
+ SYSCALL_FASTENDIAN
EXC_VIRT_END(system_call, 0x4c00, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0xc00)
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+ /*
+ * This is a hcall, so register convention is as above, with these
+ * differences:
+ * r13 = PACA
+ * r12 = orig r13
+ * ctr = orig r10
+ */
+TRAMP_KVM_BEGIN(do_kvm_0xc00)
+ /*
+ * Save the PPR (on systems that support it) before changing to
+ * HMT_MEDIUM. That allows the KVM code to save that value into the
+ * guest state (it is the guest's PPR value).
+ */
+ OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR)
+ HMT_MEDIUM
+ OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR)
+ mfctr r10
+ SET_SCRATCH0(r12)
+ std r9,PACA_EXGEN+EX_R9(r13)
+ mfcr r9
+ std r10,PACA_EXGEN+EX_R10(r13)
+ KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+#endif
EXC_REAL(single_step, 0xd00, 0x100)
@@ -1553,6 +1594,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
1: addi r3,r1,STACK_FRAME_OVERHEAD
bl kernel_bad_stack
b 1b
+_ASM_NOKPROBE_SYMBOL(bad_stack);
+
+/*
+ * When doorbell is triggered from system reset wakeup, the message is
+ * not cleared, so it would fire again when EE is enabled.
+ *
+ * When coming from local_irq_enable, there may be the same problem if
+ * we were hard disabled.
+ *
+ * Execute msgclr to clear pending exceptions before handling it.
+ */
+h_doorbell_common_msgclr:
+ LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+ PPC_MSGCLR(3)
+ b h_doorbell_common
+
+doorbell_super_common_msgclr:
+ LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+ PPC_MSGCLRP(3)
+ b doorbell_super_common
/*
* Called from arch_local_irq_enable when an interrupt needs
@@ -1563,6 +1624,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
* Note: While MSR:EE is off, we need to make sure that _MSR
* in the generated frame has EE set to 1 or the exception
* handler will not properly re-enable them.
+ *
+ * Note that we don't specify LR as the NIP (return address) for
+ * the interrupt because that would unbalance the return branch
+ * predictor.
*/
_GLOBAL(__replay_interrupt)
/* We are going to jump to the exception common code which
@@ -1570,7 +1635,7 @@ _GLOBAL(__replay_interrupt)
* we don't give a damn about, so we don't bother storing them.
*/
mfmsr r12
- mflr r11
+ LOAD_REG_ADDR(r11, 1f)
mfcr r9
ori r12,r12,MSR_EE
cmpwi r3,0x900
@@ -1579,13 +1644,16 @@ _GLOBAL(__replay_interrupt)
beq hardware_interrupt_common
BEGIN_FTR_SECTION
cmpwi r3,0xe80
- beq h_doorbell_common
+ beq h_doorbell_common_msgclr
cmpwi r3,0xea0
beq h_virt_irq_common
cmpwi r3,0xe60
beq hmi_exception_common
FTR_SECTION_ELSE
cmpwi r3,0xa00
- beq doorbell_super_common
+ beq doorbell_super_common_msgclr
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+1:
blr
+
+_ASM_NOKPROBE_SYMBOL(__replay_interrupt)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 466569e26278..3079518f2245 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -113,11 +113,55 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
return 1;
}
+/*
+ * If fadump is registered, check if the memory provided
+ * falls within boot memory area.
+ */
+int is_fadump_boot_memory_area(u64 addr, ulong size)
+{
+ if (!fw_dump.dump_registered)
+ return 0;
+
+ return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
+}
+
int is_fadump_active(void)
{
return fw_dump.dump_active;
}
+/*
+ * Returns 1, if there are no holes in boot memory area,
+ * 0 otherwise.
+ */
+static int is_boot_memory_area_contiguous(void)
+{
+ struct memblock_region *reg;
+ unsigned long tstart, tend;
+ unsigned long start_pfn = PHYS_PFN(RMA_START);
+ unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size);
+ unsigned int ret = 0;
+
+ for_each_memblock(memory, reg) {
+ tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+ tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+ if (tstart < tend) {
+ /* Memory hole from start_pfn to tstart */
+ if (tstart > start_pfn)
+ break;
+
+ if (tend == end_pfn) {
+ ret = 1;
+ break;
+ }
+
+ start_pfn = tend + 1;
+ }
+ }
+
+ return ret;
+}
+
/* Print firmware assisted dump configurations for debugging purpose. */
static void fadump_show_config(void)
{
@@ -212,20 +256,46 @@ static inline unsigned long fadump_calculate_reserve_size(void)
int ret;
unsigned long long base, size;
+ if (fw_dump.reserve_bootvar)
+ pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
+
/*
* Check if the size is specified through crashkernel= cmdline
- * option. If yes, then use that but ignore base as fadump
- * reserves memory at end of RAM.
+ * option. If yes, then use that but ignore base as fadump reserves
+ * memory at a predefined offset.
*/
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&size, &base);
if (ret == 0 && size > 0) {
+ unsigned long max_size;
+
+ if (fw_dump.reserve_bootvar)
+ pr_info("Using 'crashkernel=' parameter for memory reservation.\n");
+
fw_dump.reserve_bootvar = (unsigned long)size;
+
+ /*
+ * Adjust if the boot memory size specified is above
+ * the upper limit.
+ */
+ max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO;
+ if (fw_dump.reserve_bootvar > max_size) {
+ fw_dump.reserve_bootvar = max_size;
+ pr_info("Adjusted boot memory size to %luMB\n",
+ (fw_dump.reserve_bootvar >> 20));
+ }
+
+ return fw_dump.reserve_bootvar;
+ } else if (fw_dump.reserve_bootvar) {
+ /*
+ * 'fadump_reserve_mem=' is being used to reserve memory
+ * for firmware-assisted dump.
+ */
return fw_dump.reserve_bootvar;
}
/* divide by 20 to get 5% of value */
- size = memblock_end_of_DRAM() / 20;
+ size = memblock_phys_mem_size() / 20;
/* round it down in multiples of 256 */
size = size & ~0x0FFFFFFFUL;
@@ -377,9 +447,22 @@ static int __init early_fadump_param(char *p)
}
early_param("fadump", early_fadump_param);
-static void register_fw_dump(struct fadump_mem_struct *fdm)
+/*
+ * Look for fadump_reserve_mem= cmdline option
+ * TODO: Remove references to 'fadump_reserve_mem=' parameter,
+ * the sooner 'crashkernel=' parameter is accustomed to.
+ */
+static int __init early_fadump_reserve_mem(char *p)
+{
+ if (p)
+ fw_dump.reserve_bootvar = memparse(p, &p);
+ return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static int register_fw_dump(struct fadump_mem_struct *fdm)
{
- int rc;
+ int rc, err;
unsigned int wait_time;
pr_debug("Registering for firmware-assisted kernel dump...\n");
@@ -396,26 +479,38 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
} while (wait_time);
+ err = -EIO;
switch (rc) {
+ default:
+ pr_err("Failed to register. Unknown Error(%d).\n", rc);
+ break;
case -1:
printk(KERN_ERR "Failed to register firmware-assisted kernel"
" dump. Hardware Error(%d).\n", rc);
break;
case -3:
+ if (!is_boot_memory_area_contiguous())
+ pr_err("Can't have holes in boot memory area while "
+ "registering fadump\n");
+
printk(KERN_ERR "Failed to register firmware-assisted kernel"
" dump. Parameter Error(%d).\n", rc);
+ err = -EINVAL;
break;
case -9:
printk(KERN_ERR "firmware-assisted kernel dump is already "
" registered.");
fw_dump.dump_registered = 1;
+ err = -EEXIST;
break;
case 0:
printk(KERN_INFO "firmware-assisted kernel dump registration"
" is successful\n");
fw_dump.dump_registered = 1;
+ err = 0;
break;
}
+ return err;
}
void crash_fadump(struct pt_regs *regs, const char *str)
@@ -831,8 +926,19 @@ static void fadump_setup_crash_memory_ranges(void)
for_each_memblock(memory, reg) {
start = (unsigned long long)reg->base;
end = start + (unsigned long long)reg->size;
- if (start == RMA_START && end >= fw_dump.boot_memory_size)
- start = fw_dump.boot_memory_size;
+
+ /*
+ * skip the first memory chunk that is already added (RMA_START
+ * through boot_memory_size). This logic needs a relook if and
+ * when RMA_START changes to a non-zero value.
+ */
+ BUILD_BUG_ON(RMA_START != 0);
+ if (start < fw_dump.boot_memory_size) {
+ if (end > fw_dump.boot_memory_size)
+ start = fw_dump.boot_memory_size;
+ else
+ continue;
+ }
/* add this range excluding the reserved dump area. */
fadump_exclude_reserved_area(start, end);
@@ -956,7 +1062,7 @@ static unsigned long init_fadump_header(unsigned long addr)
return addr;
}
-static void register_fadump(void)
+static int register_fadump(void)
{
unsigned long addr;
void *vaddr;
@@ -966,7 +1072,7 @@ static void register_fadump(void)
* assisted dump.
*/
if (!fw_dump.reserve_dump_area_size)
- return;
+ return -ENODEV;
fadump_setup_crash_memory_ranges();
@@ -979,7 +1085,7 @@ static void register_fadump(void)
fadump_create_elfcore_headers(vaddr);
/* register the future kernel dump with firmware. */
- register_fw_dump(&fdm);
+ return register_fw_dump(&fdm);
}
static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
@@ -1046,28 +1152,71 @@ void fadump_cleanup(void)
}
}
+static void fadump_free_reserved_memory(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ unsigned long time_limit = jiffies + HZ;
+
+ pr_info("freeing reserved memory (0x%llx - 0x%llx)\n",
+ PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ free_reserved_page(pfn_to_page(pfn));
+
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
+ }
+}
+
+/*
+ * Skip memory holes and free memory that was actually reserved.
+ */
+static void fadump_release_reserved_area(unsigned long start, unsigned long end)
+{
+ struct memblock_region *reg;
+ unsigned long tstart, tend;
+ unsigned long start_pfn = PHYS_PFN(start);
+ unsigned long end_pfn = PHYS_PFN(end);
+
+ for_each_memblock(memory, reg) {
+ tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+ tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+ if (tstart < tend) {
+ fadump_free_reserved_memory(tstart, tend);
+
+ if (tend == end_pfn)
+ break;
+
+ start_pfn = tend + 1;
+ }
+ }
+}
+
/*
* Release the memory that was reserved in early boot to preserve the memory
* contents. The released memory will be available for general use.
*/
static void fadump_release_memory(unsigned long begin, unsigned long end)
{
- unsigned long addr;
unsigned long ra_start, ra_end;
ra_start = fw_dump.reserve_dump_area_start;
ra_end = ra_start + fw_dump.reserve_dump_area_size;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- /*
- * exclude the dump reserve area. Will reuse it for next
- * fadump registration.
- */
- if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
- continue;
-
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
- }
+ /*
+ * exclude the dump reserve area. Will reuse it for next
+ * fadump registration.
+ */
+ if (begin < ra_end && end > ra_start) {
+ if (begin < ra_start)
+ fadump_release_reserved_area(begin, ra_start);
+ if (end > ra_end)
+ fadump_release_reserved_area(ra_end, end);
+ } else
+ fadump_release_reserved_area(begin, end);
}
static void fadump_invalidate_release_mem(void)
@@ -1161,7 +1310,6 @@ static ssize_t fadump_register_store(struct kobject *kobj,
switch (buf[0]) {
case '0':
if (fw_dump.dump_registered == 0) {
- ret = -EINVAL;
goto unlock_out;
}
/* Un-register Firmware-assisted dump */
@@ -1169,11 +1317,11 @@ static ssize_t fadump_register_store(struct kobject *kobj,
break;
case '1':
if (fw_dump.dump_registered == 1) {
- ret = -EINVAL;
+ ret = -EEXIST;
goto unlock_out;
}
/* Register Firmware-assisted dump */
- register_fadump();
+ ret = register_fadump();
break;
default:
ret = -EINVAL;
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 4898d676dcae..5adb390e773b 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -31,6 +31,7 @@
* registers for winkle support.
*/
#define _SDR1 GPR3
+#define _PTCR GPR3
#define _RPR GPR4
#define _SPURR GPR5
#define _PURR GPR6
@@ -39,7 +40,7 @@
#define _AMOR GPR9
#define _WORT GPR10
#define _WORC GPR11
-#define _PTCR GPR12
+#define _LPCR GPR12
#define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16
@@ -55,12 +56,14 @@ save_sprs_to_stack:
* here since any thread in the core might wake up first
*/
BEGIN_FTR_SECTION
- mfspr r3,SPRN_PTCR
- std r3,_PTCR(r1)
/*
* Note - SDR1 is dropped in Power ISA v3. Hence not restoring
* SDR1 here
*/
+ mfspr r3,SPRN_PTCR
+ std r3,_PTCR(r1)
+ mfspr r3,SPRN_LPCR
+ std r3,_LPCR(r1)
FTR_SECTION_ELSE
mfspr r3,SPRN_SDR1
std r3,_SDR1(r1)
@@ -106,13 +109,9 @@ core_idle_lock_held:
/*
* Pass requested state in r3:
* r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
- * - Requested STOP state in POWER9
- *
- * To check IRQ_HAPPENED in r4
- * 0 - don't check
- * 1 - check
+ * - Requested PSSCR value in POWER9
*
- * Address to 'rfid' to in r5
+ * Address of idle handler to branch to in realmode in r4
*/
pnv_powersave_common:
/* Use r3 to pass state nap/sleep/winkle */
@@ -122,37 +121,14 @@ pnv_powersave_common:
* need to save PC, some CR bits and the NV GPRs,
* but for now an interrupt frame will do.
*/
+ mtctr r4
+
mflr r0
std r0,16(r1)
stdu r1,-INT_FRAME_SIZE(r1)
std r0,_LINK(r1)
std r0,_NIP(r1)
- /* Hard disable interrupts */
- mfmsr r9
- rldicl r9,r9,48,1
- rotldi r9,r9,16
- mtmsrd r9,1 /* hard-disable interrupts */
-
- /* Check if something happened while soft-disabled */
- lbz r0,PACAIRQHAPPENED(r13)
- andi. r0,r0,~PACA_IRQ_HARD_DIS@l
- beq 1f
- cmpwi cr0,r4,0
- beq 1f
- addi r1,r1,INT_FRAME_SIZE
- ld r0,16(r1)
- li r3,0 /* Return 0 (no nap) */
- mtlr r0
- blr
-
-1: /* We mark irqs hard disabled as this is the state we'll
- * be in when returning and we need to tell arch_local_irq_restore()
- * about it
- */
- li r0,PACA_IRQ_HARD_DIS
- stb r0,PACAIRQHAPPENED(r13)
-
/* We haven't lost state ... yet */
li r0,0
stb r0,PACA_NAPSTATELOST(r13)
@@ -160,9 +136,8 @@ pnv_powersave_common:
/* Continue saving state */
SAVE_GPR(2, r1)
SAVE_NVGPRS(r1)
- mfcr r4
- std r4,_CCR(r1)
- std r9,_MSR(r1)
+ mfcr r5
+ std r5,_CCR(r1)
std r1,PACAR1(r13)
/*
@@ -172,12 +147,8 @@ pnv_powersave_common:
* the MMU context to the guest.
*/
LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
- li r6, MSR_RI
- andc r6, r9, r6
- mtmsrd r6, 1 /* clear RI before setting SRR0/1 */
- mtspr SPRN_SRR0, r5
- mtspr SPRN_SRR1, r7
- rfid
+ mtmsrd r7,0
+ bctr
.globl pnv_enter_arch207_idle_mode
pnv_enter_arch207_idle_mode:
@@ -285,6 +256,19 @@ power_enter_stop:
bne .Lhandle_esl_ec_set
IDLE_STATE_ENTER_SEQ(PPC_STOP)
li r3,0 /* Since we didn't lose state, return 0 */
+
+ /*
+ * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
+ * it can determine if the wakeup reason is an HMI in
+ * CHECK_HMI_INTERRUPT.
+ *
+ * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
+ * reason, so there is no point setting r12 to SRR1.
+ *
+ * Further, we clear r12 here, so that we don't accidentally enter the
+ * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
+ */
+ li r12, 0
b pnv_wakeup_noloss
.Lhandle_esl_ec_set:
@@ -319,45 +303,23 @@ lwarx_loop_stop:
IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
-_GLOBAL(power7_idle)
+/*
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE).
+ */
+_GLOBAL(power7_idle_insn)
/* Now check if user or arch enabled NAP mode */
- LOAD_REG_ADDRBASE(r3,powersave_nap)
- lwz r4,ADDROFF(powersave_nap)(r3)
- cmpwi 0,r4,0
- beqlr
- li r3, 1
- /* fall through */
-
-_GLOBAL(power7_nap)
- mr r4,r3
- li r3,PNV_THREAD_NAP
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
- b pnv_powersave_common
- /* No return */
-
-_GLOBAL(power7_sleep)
- li r3,PNV_THREAD_SLEEP
- li r4,1
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
- b pnv_powersave_common
- /* No return */
-
-_GLOBAL(power7_winkle)
- li r3,PNV_THREAD_WINKLE
- li r4,1
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
+ LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode)
b pnv_powersave_common
- /* No return */
#define CHECK_HMI_INTERRUPT \
- mfspr r0,SPRN_SRR1; \
BEGIN_FTR_SECTION_NESTED(66); \
- rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \
+ rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \
FTR_SECTION_ELSE_NESTED(66); \
- rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \
+ rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \
ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
cmpwi r0,0xa; /* Hypervisor maintenance ? */ \
- bne 20f; \
+ bne+ 20f; \
/* Invoke opal call to handle hmi */ \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
@@ -369,16 +331,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
20: nop;
/*
- * r3 - The PSSCR value corresponding to the stop state.
- * r4 - The PSSCR mask corrresonding to the stop state.
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired PSSCR register value.
*/
_GLOBAL(power9_idle_stop)
- mfspr r5,SPRN_PSSCR
- andc r5,r5,r4
- or r3,r3,r5
+ std r3, PACA_REQ_PSSCR(r13)
mtspr SPRN_PSSCR,r3
- LOAD_REG_ADDR(r5,power_enter_stop)
- li r4,1
+ LOAD_REG_ADDR(r4,power_enter_stop)
b pnv_powersave_common
/* No return */
@@ -436,17 +395,17 @@ pnv_powersave_wakeup_mce:
/*
* Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
- * reason into SRR1, which allows reuse of the system reset wakeup
+ * reason into r12, which allows reuse of the system reset wakeup
* code without being mistaken for another type of wakeup.
*/
- oris r3,r3,SRR1_WAKEMCE_RESVD@h
- mtspr SPRN_SRR1,r3
+ oris r12,r3,SRR1_WAKEMCE_RESVD@h
b pnv_powersave_wakeup
/*
* Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ * r12 - SRR1
*/
.global pnv_powersave_wakeup
pnv_powersave_wakeup:
@@ -464,6 +423,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
li r0,PNV_THREAD_RUNNING
stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
+ mr r3,r12
+
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
li r0,KVM_HWTHREAD_IN_KERNEL
stb r0,HSTATE_HWTHREAD_STATE(r13)
@@ -477,7 +438,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
#endif
/* Return SRR1 from power7_nap() */
- mfspr r3,SPRN_SRR1
blt cr3,pnv_wakeup_noloss
b pnv_wakeup_loss
@@ -489,18 +449,35 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
*/
pnv_restore_hyp_resource_arch300:
/*
+ * Workaround for POWER9, if we lost resources, the ERAT
+ * might have been mixed up and needs flushing.
+ */
+ blt cr3,1f
+ PPC_INVALIDATE_ERAT
+1:
+ /*
* POWER ISA 3. Use PSSCR to determine if we
* are waking up from deep idle state
*/
LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
ld r4,ADDROFF(pnv_first_deep_stop_state)(r5)
- mfspr r5,SPRN_PSSCR
+BEGIN_FTR_SECTION_NESTED(71)
+ /*
+ * Assume that we are waking up from the state
+ * same as the Requested Level (RL) in the PSSCR
+ * which are Bits 60-63
+ */
+ ld r5,PACA_REQ_PSSCR(r13)
+ rldicl r5,r5,0,60
+FTR_SECTION_ELSE_NESTED(71)
/*
* 0-3 bits correspond to Power-Saving Level Status
* which indicates the idle state we are waking up from
*/
+ mfspr r5, SPRN_PSSCR
rldicl r5,r5,4,60
+ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71)
cmpd cr4,r5,r4
bge cr4,pnv_wakeup_tb_loss /* returns to caller */
@@ -567,9 +544,9 @@ pnv_wakeup_tb_loss:
* is required to return back to reset vector after hypervisor state
* restore is complete.
*/
+ mr r19,r12
mr r18,r4
mflr r17
- mfspr r16,SPRN_SRR1
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
@@ -731,13 +708,14 @@ timebase_resync:
* Use cr3 which indicates that we are waking up with atleast partial
* hypervisor state loss to determine if TIMEBASE RESYNC is needed.
*/
- ble cr3,clear_lock
+ ble cr3,.Ltb_resynced
/* Time base re-sync */
bl opal_resync_timebase;
/*
- * If waking up from sleep, per core state is not lost, skip to
- * clear_lock.
+ * If waking up from sleep (POWER8), per core state
+ * is not lost, skip to clear_lock.
*/
+.Ltb_resynced:
blt cr4,clear_lock
/*
@@ -812,9 +790,13 @@ no_segments:
mtctr r12
bctrl
+BEGIN_FTR_SECTION
+ ld r4,_LPCR(r1)
+ mtspr SPRN_LPCR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
hypervisor_state_restored:
- mtspr SPRN_SRR1,r16
+ mr r12,r19
mtlr r17
blr /* return to pnv_powersave_wakeup */
@@ -827,6 +809,7 @@ fastsleep_workaround_at_exit:
/*
* R3 here contains the value that will be returned to the caller
* of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
*/
.global pnv_wakeup_loss
pnv_wakeup_loss:
@@ -836,32 +819,33 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
REST_NVGPRS(r1)
REST_GPR(2, r1)
+ ld r4,PACAKMSR(r13)
+ ld r5,_LINK(r1)
ld r6,_CCR(r1)
- ld r4,_MSR(r1)
- ld r5,_NIP(r1)
addi r1,r1,INT_FRAME_SIZE
+ mtlr r5
mtcr r6
- mtspr SPRN_SRR1,r4
- mtspr SPRN_SRR0,r5
- rfid
+ mtmsrd r4
+ blr
/*
* R3 here contains the value that will be returned to the caller
* of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
*/
pnv_wakeup_noloss:
lbz r0,PACA_NAPSTATELOST(r13)
cmpwi r0,0
bne pnv_wakeup_loss
+ ld r1,PACAR1(r13)
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
- ld r1,PACAR1(r13)
- ld r6,_CCR(r1)
- ld r4,_MSR(r1)
+ ld r4,PACAKMSR(r13)
ld r5,_NIP(r1)
+ ld r6,_CCR(r1)
addi r1,r1,INT_FRAME_SIZE
+ mtlr r5
mtcr r6
- mtspr SPRN_SRR1,r4
- mtspr SPRN_SRR0,r5
- rfid
+ mtmsrd r4
+ blr
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5c291df30fe3..0bcec745a672 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -322,7 +322,8 @@ bool prep_irq_for_idle(void)
* First we need to hard disable to ensure no interrupt
* occurs before we effectively enter the low power state
*/
- hard_irq_disable();
+ __hard_irq_disable();
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
/*
* If anything happened while we were soft-disabled,
@@ -347,6 +348,65 @@ bool prep_irq_for_idle(void)
return true;
}
+#ifdef CONFIG_PPC_BOOK3S
+/*
+ * This is for idle sequences that return with IRQs off, but the
+ * idle state itself wakes on interrupt. Tell the irq tracer that
+ * IRQs are enabled for the duration of idle so it does not get long
+ * off times. Must be paired with fini_irq_for_idle_irqsoff.
+ */
+bool prep_irq_for_idle_irqsoff(void)
+{
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * First we need to hard disable to ensure no interrupt
+ * occurs before we effectively enter the low power state
+ */
+ __hard_irq_disable();
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+ /*
+ * If anything happened while we were soft-disabled,
+ * we return now and do not enter the low power state.
+ */
+ if (lazy_irq_pending())
+ return false;
+
+ /* Tell lockdep we are about to re-enable */
+ trace_hardirqs_on();
+
+ return true;
+}
+
+/*
+ * Take the SRR1 wakeup reason, index into this table to find the
+ * appropriate irq_happened bit.
+ */
+static const u8 srr1_to_lazyirq[0x10] = {
+ 0, 0, 0,
+ PACA_IRQ_DBELL,
+ 0,
+ PACA_IRQ_DBELL,
+ PACA_IRQ_DEC,
+ 0,
+ PACA_IRQ_EE,
+ PACA_IRQ_EE,
+ PACA_IRQ_HMI,
+ 0, 0, 0, 0, 0 };
+
+void irq_set_pending_from_srr1(unsigned long srr1)
+{
+ unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
+
+ /*
+ * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
+ * so this can be called unconditionally with srr1 wake reason.
+ */
+ local_paca->irq_happened |= srr1_to_lazyirq[idx];
+}
+#endif /* CONFIG_PPC_BOOK3S */
+
/*
* Force a replay of the external interrupt handler on this CPU.
*/
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01addfb0ed0a..45f1ff721c32 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe);
void arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ patch_instruction(p->addr, BREAKPOINT_INSTRUCTION);
}
NOKPROBE_SYMBOL(arch_arm_kprobe);
void arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ patch_instruction(p->addr, p->opcode);
}
NOKPROBE_SYMBOL(arch_disarm_kprobe);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a9bfa49f3698..e0e131e662ed 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -268,6 +268,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
+ "Instruction fetch (foreign)",
"Page table walk ifetch (bad)",
"Page table walk ifetch (foreign)",
"Load (bad)",
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index f913139bb0c2..d24e689e893f 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -236,6 +236,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = {
{ 0x00000000081c0000, 0x0000000000180000, true,
MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
{ 0x00000000081c0000, 0x0000000008000000, true,
MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 84db14e435f5..3f7a9a2d2435 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr)
*/
_GLOBAL(real_readb)
mfmsr r7
- ori r0,r7,MSR_DR
- xori r0,r0,MSR_DR
+ rlwinm r0,r7,0,~MSR_DR
sync
mtmsr r0
sync
@@ -262,8 +261,7 @@ _GLOBAL(real_readb)
*/
_GLOBAL(real_writeb)
mfmsr r7
- ori r0,r7,MSR_DR
- xori r0,r0,MSR_DR
+ rlwinm r0,r7,0,~MSR_DR
sync
mtmsr r0
sync
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index ec60ed0d4aad..6f8273f5e988 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
{
/* addis r4,0,(insn)@h */
- *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) |
- ((val >> 16) & 0xffff);
+ patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) |
+ ((val >> 16) & 0xffff));
+ addr++;
/* ori r4,r4,(insn)@l */
- *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) |
- (val & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) |
+ ___PPC_RS(4) | (val & 0xffff));
}
/*
@@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr)
{
/* lis r3,(op)@highest */
- *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) |
- ((val >> 48) & 0xffff);
+ patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) |
+ ((val >> 48) & 0xffff));
+ addr++;
/* ori r3,r3,(op)@higher */
- *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
- ((val >> 32) & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+ ___PPC_RS(3) | ((val >> 32) & 0xffff));
+ addr++;
/* rldicr r3,r3,32,31 */
- *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) |
- __PPC_SH64(32) | __PPC_ME64(31);
+ patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) |
+ ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31));
+ addr++;
/* oris r3,r3,(op)@h */
- *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) |
- ((val >> 16) & 0xffff);
+ patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) |
+ ___PPC_RS(3) | ((val >> 16) & 0xffff));
+ addr++;
/* ori r3,r3,(op)@l */
- *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
- (val & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+ ___PPC_RS(3) | (val & 0xffff));
}
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
@@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step;
kprobe_opcode_t *op_callback_addr, *emulate_step_addr;
long b_offset;
- unsigned long nip;
+ unsigned long nip, size;
+ int rc, i;
kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
@@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
goto error;
/* Setup template */
- memcpy(buff, optprobe_template_entry,
- TMPL_END_IDX * sizeof(kprobe_opcode_t));
+ /* We can optimize this via patch_instruction_window later */
+ size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);
+ pr_devel("Copying template to %p, size %lu\n", buff, size);
+ for (i = 0; i < size; i++) {
+ rc = patch_instruction(buff + i, *(optprobe_template_entry + i));
+ if (rc < 0)
+ goto error;
+ }
/*
* Fixup the template with instructions to:
@@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
if (!branch_op_callback || !branch_emulate_step)
goto error;
- buff[TMPL_CALL_HDLR_IDX] = branch_op_callback;
- buff[TMPL_EMULATE_IDX] = branch_emulate_step;
+ patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback);
+ patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step);
/*
* 3. load instruction to be emulated into relevant register, and
@@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
/*
* 4. branch back from trampoline
*/
- buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX,
- (unsigned long)nip, 0);
+ patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0);
flush_icache_range((unsigned long)buff,
(unsigned long)(&buff[TMPL_END_IDX]));
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 2ad725ef4368..9f3e2c932dcc 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs)
{
unsigned long msr;
+ /*
+ * Syscall exit makes a similar initial check before branching
+ * to restore_math. Keep them in synch.
+ */
if (!msr_tm_active(regs->msr) &&
!current->thread.load_fp && !loadvec(current->thread))
return;
@@ -1133,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread,
#endif
}
+#ifdef CONFIG_PPC_BOOK3S_64
+#define CP_SIZE 128
+static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE)));
+#endif
+
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *new)
{
@@ -1195,12 +1204,14 @@ struct task_struct *__switch_to(struct task_struct *prev,
__switch_to_tm(prev, new);
- /*
- * We can't take a PMU exception inside _switch() since there is a
- * window where the kernel stack SLB and the kernel stack are out
- * of sync. Hard disable here.
- */
- hard_irq_disable();
+ if (!radix_enabled()) {
+ /*
+ * We can't take a PMU exception inside _switch() since there
+ * is a window where the kernel stack SLB and the kernel stack
+ * are out of sync. Hard disable here.
+ */
+ hard_irq_disable();
+ }
/*
* Call restore_sprs() before calling _switch(). If we move it after
@@ -1220,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev,
batch->active = 1;
}
- if (current_thread_info()->task->thread.regs)
+ if (current_thread_info()->task->thread.regs) {
restore_math(current_thread_info()->task->thread.regs);
+
+ /*
+ * The copy-paste buffer can only store into foreign real
+ * addresses, so unprivileged processes can not see the
+ * data or use it in any way unless they have foreign real
+ * mappings. We don't have a VAS driver that allocates those
+ * yet, so no cpabort is required.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ /*
+ * DD1 allows paste into normal system memory, so we
+ * do an unpaired copy here to clear the buffer and
+ * prevent a covert channel being set up.
+ *
+ * cpabort is not used because it is quite expensive.
+ */
+ asm volatile(PPC_COPY(%0, %1)
+ : : "r"(dummy_copy_buffer), "r"(0));
+ }
+ }
#endif /* CONFIG_PPC_STD_MMU_64 */
return last;
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 857129acf960..94a948207cd2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
maj = ((pvr >> 8) & 0xFF) - 1;
min = pvr & 0xFF;
break;
+ case 0x004e: /* POWER9 bits 12-15 give chip type */
+ maj = (pvr >> 8) & 0x0F;
+ min = pvr & 0xFF;
+ break;
default:
maj = (pvr >> 8) & 0xFF;
min = pvr & 0xFF;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 1069f74fca47..c6b8bace1766 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -33,6 +33,7 @@
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/profile.h>
+#include <linux/processor.h>
#include <asm/ptrace.h>
#include <linux/atomic.h>
@@ -112,7 +113,8 @@ int smp_generic_cpu_bootable(unsigned int nr)
#ifdef CONFIG_PPC64
int smp_generic_kick_cpu(int nr)
{
- BUG_ON(nr < 0 || nr >= NR_CPUS);
+ if (nr < 0 || nr >= nr_cpu_ids)
+ return -EINVAL;
/*
* The processor is currently spinning, waiting for the
@@ -766,8 +768,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
smp_ops->give_timebase();
/* Wait until cpu puts itself in the online & active maps */
- while (!cpu_online(cpu))
- cpu_relax();
+ spin_until_cond(cpu_online(cpu));
return 0;
}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2b33cfaac7b8..fe6f3a285455 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -59,10 +59,10 @@
#include <linux/suspend.h>
#include <linux/rtc.h>
#include <linux/sched/cputime.h>
+#include <linux/processor.h>
#include <asm/trace.h>
#include <asm/io.h>
-#include <asm/processor.h>
#include <asm/nvram.h>
#include <asm/cache.h>
#include <asm/machdep.h>
@@ -442,6 +442,7 @@ void __delay(unsigned long loops)
unsigned long start;
int diff;
+ spin_begin();
if (__USE_RTC()) {
start = get_rtcl();
do {
@@ -449,13 +450,14 @@ void __delay(unsigned long loops)
diff = get_rtcl() - start;
if (diff < 0)
diff += 1000000000;
+ spin_cpu_relax();
} while (diff < loops);
} else {
start = get_tbl();
while (get_tbl() - start < loops)
- HMT_low();
- HMT_medium();
+ spin_cpu_relax();
}
+ spin_end();
}
EXPORT_SYMBOL(__delay);
@@ -675,7 +677,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
* the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
* are 64-bit unsigned numbers.
*/
-unsigned long long sched_clock(void)
+notrace unsigned long long sched_clock(void)
{
if (__USE_RTC())
return get_rtc();
@@ -739,12 +741,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
static void start_cpu_decrementer(void)
{
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+ unsigned int tcr;
+
/* Clear any pending timer interrupts */
mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
- /* Enable decrementer interrupt */
- mtspr(SPRN_TCR, TCR_DIE);
-#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */
+ tcr = mfspr(SPRN_TCR);
+ /*
+ * The watchdog may have already been enabled by u-boot. So leave
+ * TRC[WP] (Watchdog Period) alone.
+ */
+ tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */
+ tcr |= TCR_DIE; /* Enable decrementer */
+ mtspr(SPRN_TCR, tcr);
+#endif
}
void __init generic_calibrate_decr(void)
@@ -823,38 +833,76 @@ void read_persistent_clock(struct timespec *ts)
}
/* clocksource code */
-static u64 rtc_read(struct clocksource *cs)
+static notrace u64 rtc_read(struct clocksource *cs)
{
return (u64)get_rtc();
}
-static u64 timebase_read(struct clocksource *cs)
+static notrace u64 timebase_read(struct clocksource *cs)
{
return (u64)get_tb();
}
-void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
- struct clocksource *clock, u32 mult, u64 cycle_last)
+
+void update_vsyscall(struct timekeeper *tk)
{
+ struct timespec xt;
+ struct clocksource *clock = tk->tkr_mono.clock;
+ u32 mult = tk->tkr_mono.mult;
+ u32 shift = tk->tkr_mono.shift;
+ u64 cycle_last = tk->tkr_mono.cycle_last;
u64 new_tb_to_xs, new_stamp_xsec;
- u32 frac_sec;
+ u64 frac_sec;
if (clock != &clocksource_timebase)
return;
+ xt.tv_sec = tk->xtime_sec;
+ xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+
/* Make userspace gettimeofday spin until we're done. */
++vdso_data->tb_update_count;
smp_mb();
- /* 19342813113834067 ~= 2^(20+64) / 1e9 */
- new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
- new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC;
- do_div(new_stamp_xsec, 1000000000);
- new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC;
+ /*
+ * This computes ((2^20 / 1e9) * mult) >> shift as a
+ * 0.64 fixed-point fraction.
+ * The computation in the else clause below won't overflow
+ * (as long as the timebase frequency is >= 1.049 MHz)
+ * but loses precision because we lose the low bits of the constant
+ * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9.
+ * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
+ * over a second. (Shift values are usually 22, 23 or 24.)
+ * For high frequency clocks such as the 512MHz timebase clock
+ * on POWER[6789], the mult value is small (e.g. 32768000)
+ * and so we can shift the constant by 16 initially
+ * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
+ * remaining shifts after the multiplication, which gives a
+ * more accurate result (e.g. with mult = 32768000, shift = 24,
+ * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
+ */
+ if (mult <= 62500000 && clock->shift >= 16)
+ new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
+ else
+ new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
+
+ /*
+ * Compute the fractional second in units of 2^-32 seconds.
+ * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
+ * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
+ * it in units of 2^-32 seconds.
+ * We assume shift <= 32 because clocks_calc_mult_shift()
+ * generates shift values in the range 0 - 32.
+ */
+ frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
+ do_div(frac_sec, NSEC_PER_SEC);
- BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC);
- /* this is tv_nsec / 1e9 as a 0.32 fraction */
- frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32;
+ /*
+ * Work out new stamp_xsec value for any legacy users of systemcfg.
+ * stamp_xsec is in units of 2^-20 seconds.
+ */
+ new_stamp_xsec = frac_sec >> 12;
+ new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
/*
* tb_update_count is used to allow the userspace gettimeofday code
@@ -864,15 +912,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
* the two values of tb_update_count match and are even then the
* tb_to_xs and stamp_xsec values are consistent. If not, then it
* loops back and reads them again until this criteria is met.
- * We expect the caller to have done the first increment of
- * vdso_data->tb_update_count already.
*/
vdso_data->tb_orig_stamp = cycle_last;
vdso_data->stamp_xsec = new_stamp_xsec;
vdso_data->tb_to_xs = new_tb_to_xs;
- vdso_data->wtom_clock_sec = wtm->tv_sec;
- vdso_data->wtom_clock_nsec = wtm->tv_nsec;
- vdso_data->stamp_xtime = *wall_time;
+ vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
+ vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
+ vdso_data->stamp_xtime = xt;
vdso_data->stamp_sec_fraction = frac_sec;
smp_wmb();
++(vdso_data->tb_update_count);
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3a2d04134da9..c4ba37822ba0 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -313,8 +313,8 @@ dont_backup_fp:
blr
- /* void tm_recheckpoint(struct thread_struct *thread,
- * unsigned long orig_msr)
+ /* void __tm_recheckpoint(struct thread_struct *thread,
+ * unsigned long orig_msr)
* - Restore the checkpointed register state saved by tm_reclaim
* when we switch_to a process.
*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index d4e545d27ef9..bfcfd9ef09f2 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -237,6 +237,7 @@ void die(const char *str, struct pt_regs *regs, long err)
err = 0;
oops_end(flags, regs, err);
}
+NOKPROBE_SYMBOL(die);
void user_single_step_siginfo(struct task_struct *tsk,
struct pt_regs *regs, siginfo_t *info)
@@ -1968,6 +1969,7 @@ void unrecoverable_exception(struct pt_regs *regs)
regs->trap, regs->nip);
die("Unrecoverable exception", regs, SIGABRT);
}
+NOKPROBE_SYMBOL(unrecoverable_exception);
#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x)
/*
@@ -1998,6 +2000,7 @@ void kernel_bad_stack(struct pt_regs *regs)
regs->gpr[1], regs->nip);
die("Bad kernel stack pointer", regs, SIGABRT);
}
+NOKPROBE_SYMBOL(kernel_bad_stack);
void __init trap_init(void)
{
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 2f793be3d2b1..b1a250560198 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -8,6 +8,12 @@
#include <asm/cache.h>
#include <asm/thread_info.h>
+#ifdef CONFIG_STRICT_KERNEL_RWX
+#define STRICT_ALIGN_SIZE (1 << 24)
+#else
+#define STRICT_ALIGN_SIZE PAGE_SIZE
+#endif
+
ENTRY(_stext)
PHDRS {
@@ -58,7 +64,6 @@ SECTIONS
#ifdef CONFIG_PPC64
KEEP(*(.head.text.first_256B));
#ifdef CONFIG_PPC_BOOK3E
-# define END_FIXED 0x100
#else
KEEP(*(.head.text.real_vectors));
*(.head.text.real_trampolines);
@@ -66,12 +71,8 @@ SECTIONS
*(.head.text.virt_trampolines);
# if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
KEEP(*(.head.data.fwnmi_page));
-# define END_FIXED 0x8000
-# else
-# define END_FIXED 0x7000
# endif
#endif
- ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error");
#else /* !CONFIG_PPC64 */
HEAD_TEXT
#endif
@@ -79,23 +80,6 @@ SECTIONS
__head_end = .;
- /*
- * If the build dies here, it's likely code in head_64.S is referencing
- * labels it can't reach, and the linker inserting stubs without the
- * assembler's knowledge. To debug, remove the above assert and
- * rebuild. Look for branch stubs in the fixed section region.
- *
- * Linker stub generation could be allowed in "trampoline"
- * sections if absolutely necessary, but this would require
- * some rework of the fixed sections. Before resorting to this,
- * consider references that have sufficient addressing range,
- * (e.g., hand coded trampolines) so the linker does not have
- * to add stubs.
- *
- * Linker stubs at the top of the main text section are currently not
- * detected, and will result in a crash at boot due to offsets being
- * wrong.
- */
#ifdef CONFIG_PPC64
/*
* BLOCK(0) overrides the default output section alignment because
@@ -103,18 +87,31 @@ SECTIONS
* section placement to work.
*/
.text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_LD_HEAD_STUB_CATCH
+ *(.linker_stub_catch);
+ . = . ;
+#endif
+
#else
.text : AT(ADDR(.text) - LOAD_OFFSET) {
ALIGN_FUNCTION();
#endif
/* careful! __ftr_alt_* sections need to be close to .text */
- *(.text .fixup __ftr_alt_* .ref.text)
+ *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text);
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
+ /*
+ * -Os builds call FP save/restore functions. The powerpc64
+ * linker generates those on demand in the .sfpr section.
+ * .sfpr gets placed at the beginning of a group of input
+ * sections, which can break start-of-text offset if it is
+ * included with the main text sections, so put it by itself.
+ */
+ *(.sfpr);
MEM_KEEP(init.text)
MEM_KEEP(exit.text)
@@ -132,7 +129,7 @@ SECTIONS
PROVIDE32 (etext = .);
/* Read-only data */
- RODATA
+ RO_DATA(PAGE_SIZE)
EXCEPTION_TABLE(0)
@@ -149,7 +146,7 @@ SECTIONS
/*
* Init sections discarded at runtime
*/
- . = ALIGN(PAGE_SIZE);
+ . = ALIGN(STRICT_ALIGN_SIZE);
__init_begin = .;
INIT_TEXT_SECTION(PAGE_SIZE) :kernel
@@ -267,7 +264,9 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) {
DATA_DATA
*(.sdata)
+ *(.sdata2)
*(.got.plt) *(.got)
+ *(.plt)
}
#else
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -330,6 +329,16 @@ SECTIONS
_end = . ;
PROVIDE32 (end = .);
- /* Sections to be discarded. */
+ STABS_DEBUG
+
+ DWARF_DEBUG
+
DISCARDS
+ /DISCARD/ : {
+ *(*.EMB.apuinfo)
+ *(.glink .iplt .plt .rela* .comment)
+ *(.gnu.version*)
+ *(.gnu.attributes)
+ *(.eh_frame)
+ }
}