summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig94
-rw-r--r--arch/x86/Kconfig.cpu21
-rw-r--r--arch/x86/Kconfig.debug23
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/Makefile_32.cpu9
-rw-r--r--arch/x86/boot/compressed/Makefile6
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/compressed/misc.c19
-rw-r--r--arch/x86/boot/compressed/relocs.c87
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S3
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/boot/mkcpustr.c2
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/version.c4
-rw-r--r--arch/x86/boot/video-vga.c9
-rw-r--r--arch/x86/boot/video.c13
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S517
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S157
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c333
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S10
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S20
-rw-r--r--arch/x86/ia32/ia32_aout.c14
-rw-r--r--arch/x86/ia32/ia32entry.S52
-rw-r--r--arch/x86/ia32/sys_ia32.c175
-rw-r--r--arch/x86/include/asm/Kbuild2
-rw-r--r--arch/x86/include/asm/a.out-core.h10
-rw-r--r--arch/x86/include/asm/acpi.h29
-rw-r--r--arch/x86/include/asm/alternative-asm.h10
-rw-r--r--arch/x86/include/asm/alternative.h17
-rw-r--r--arch/x86/include/asm/amd_iommu.h15
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h41
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h54
-rw-r--r--arch/x86/include/asm/apb_timer.h70
-rw-r--r--arch/x86/include/asm/apic.h21
-rw-r--r--arch/x86/include/asm/apicdef.h6
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/asm-offsets.h1
-rw-r--r--arch/x86/include/asm/atomic.h299
-rw-r--r--arch/x86/include/asm/atomic64_32.h160
-rw-r--r--arch/x86/include/asm/atomic64_64.h224
-rw-r--r--arch/x86/include/asm/atomic_32.h415
-rw-r--r--arch/x86/include/asm/atomic_64.h485
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/cache.h7
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/checksum_32.h3
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h248
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h234
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h127
-rw-r--r--arch/x86/include/asm/cpufeature.h6
-rw-r--r--arch/x86/include/asm/debugreg.h36
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/desc_defs.h4
-rw-r--r--arch/x86/include/asm/device.h2
-rw-r--r--arch/x86/include/asm/dma-mapping.h17
-rw-r--r--arch/x86/include/asm/e820.h28
-rw-r--r--arch/x86/include/asm/elf.h36
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/fb.h4
-rw-r--r--arch/x86/include/asm/fixmap.h16
-rw-r--r--arch/x86/include/asm/gart.h9
-rw-r--r--arch/x86/include/asm/geode.h219
-rw-r--r--arch/x86/include/asm/hardirq.h8
-rw-r--r--arch/x86/include/asm/highmem.h4
-rw-r--r--arch/x86/include/asm/hpet.h8
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h72
-rw-r--r--arch/x86/include/asm/hw_irq.h42
-rw-r--r--arch/x86/include/asm/hyperv.h186
-rw-r--r--arch/x86/include/asm/i387.h19
-rw-r--r--arch/x86/include/asm/i8259.h21
-rw-r--r--arch/x86/include/asm/inat.h220
-rw-r--r--arch/x86/include/asm/inat_types.h29
-rw-r--r--arch/x86/include/asm/insn.h184
-rw-r--r--arch/x86/include/asm/inst.h150
-rw-r--r--arch/x86/include/asm/io.h155
-rw-r--r--arch/x86/include/asm/io_32.h196
-rw-r--r--arch/x86/include/asm/io_64.h181
-rw-r--r--arch/x86/include/asm/io_apic.h8
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_vectors.h52
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kprobes.h31
-rw-r--r--arch/x86/include/asm/kvm.h34
-rw-r--r--arch/x86/include/asm/kvm_emulate.h19
-rw-r--r--arch/x86/include/asm/kvm_host.h95
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/local.h37
-rw-r--r--arch/x86/include/asm/mce.h17
-rw-r--r--arch/x86/include/asm/mmzone_32.h2
-rw-r--r--arch/x86/include/asm/mmzone_64.h6
-rw-r--r--arch/x86/include/asm/mpspec.h27
-rw-r--r--arch/x86/include/asm/mrst.h19
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/msr.h27
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/numa_64.h5
-rw-r--r--arch/x86/include/asm/numaq.h5
-rw-r--r--arch/x86/include/asm/olpc.h22
-rw-r--r--arch/x86/include/asm/page_types.h4
-rw-r--r--arch/x86/include/asm/paravirt.h51
-rw-r--r--arch/x86/include/asm/paravirt_types.h28
-rw-r--r--arch/x86/include/asm/pci.h39
-rw-r--r--arch/x86/include/asm/pci_64.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h43
-rw-r--r--arch/x86/include/asm/percpu.h199
-rw-r--r--arch/x86/include/asm/perf_event.h42
-rw-r--r--arch/x86/include/asm/pgalloc.h5
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--arch/x86/include/asm/pgtable_32.h6
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/processor.h20
-rw-r--r--arch/x86/include/asm/proto.h27
-rw-r--r--arch/x86/include/asm/ptrace.h65
-rw-r--r--arch/x86/include/asm/rwsem.h81
-rw-r--r--arch/x86/include/asm/sections.h6
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/sigcontext.h4
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/spinlock.h62
-rw-r--r--arch/x86/include/asm/spinlock_types.h10
-rw-r--r--arch/x86/include/asm/stacktrace.h24
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/svm.h5
-rw-r--r--arch/x86/include/asm/swiotlb.h11
-rw-r--r--arch/x86/include/asm/sys_ia32.h20
-rw-r--r--arch/x86/include/asm/syscall.h2
-rw-r--r--arch/x86/include/asm/syscalls.h49
-rw-r--r--arch/x86/include/asm/system.h44
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/topology.h10
-rw-r--r--arch/x86/include/asm/trampoline.h1
-rw-r--r--arch/x86/include/asm/uaccess.h1
-rw-r--r--arch/x86/include/asm/uaccess_32.h26
-rw-r--r--arch/x86/include/asm/uaccess_64.h56
-rw-r--r--arch/x86/include/asm/unistd_32.h7
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/user.h58
-rw-r--r--arch/x86/include/asm/uv/bios.h22
-rw-r--r--arch/x86/include/asm/uv/uv.h1
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h164
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h14
-rw-r--r--arch/x86/include/asm/visws/cobalt.h2
-rw-r--r--arch/x86/include/asm/vmx.h9
-rw-r--r--arch/x86/include/asm/x86_init.h31
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h27
-rw-r--r--arch/x86/include/asm/xsave.h2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c172
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/processor.c100
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S3
-rw-r--r--arch/x86/kernel/acpi/sleep.c26
-rw-r--r--arch/x86/kernel/alternative.c82
-rw-r--r--arch/x86/kernel/amd_iommu.c1312
-rw-r--r--arch/x86/kernel/amd_iommu_init.c143
-rw-r--r--arch/x86/kernel/apb_timer.c784
-rw-r--r--arch/x86/kernel/aperture_64.c13
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c68
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c12
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c18
-rw-r--r--arch/x86/kernel/apic/es7000_32.c28
-rw-r--r--arch/x86/kernel/apic/io_apic.c790
-rw-r--r--arch/x86/kernel/apic/nmi.c27
-rw-r--r--arch/x86/kernel/apic/numaq_32.c21
-rw-r--r--arch/x86/kernel/apic/probe_32.c31
-rw-r--r--arch/x86/kernel/apic/probe_64.c13
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c171
-rw-r--r--arch/x86/kernel/apm_32.c18
-rw-r--r--arch/x86/kernel/bios_uv.c47
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c19
-rw-r--r--arch/x86/kernel/cpu/amd.c57
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c52
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c688
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig14
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c72
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c620
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c12
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c416
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c174
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c45
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c191
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c28
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h6
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event.c1820
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c406
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c979
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c159
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c15
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpuid.c24
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/crash_dump_32.c19
-rw-r--r--arch/x86/kernel/ds.c4
-rw-r--r--arch/x86/kernel/dumpstack.c50
-rw-r--r--arch/x86/kernel/dumpstack.h21
-rw-r--r--arch/x86/kernel/dumpstack_32.c14
-rw-r--r--arch/x86/kernel/dumpstack_64.c99
-rw-r--r--arch/x86/kernel/e820.c352
-rw-r--r--arch/x86/kernel/early_printk.c5
-rw-r--r--arch/x86/kernel/efi.c4
-rw-r--r--arch/x86/kernel/entry_32.S100
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c133
-rw-r--r--arch/x86/kernel/geode_32.c196
-rw-r--r--arch/x86/kernel/head32.c10
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S24
-rw-r--r--arch/x86/kernel/head_64.S9
-rw-r--r--arch/x86/kernel/hpet.c87
-rw-r--r--arch/x86/kernel/hw_breakpoint.c530
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c10
-rw-r--r--arch/x86/kernel/i387.c71
-rw-r--r--arch/x86/kernel/i8259.c94
-rw-r--r--arch/x86/kernel/ioport.c28
-rw-r--r--arch/x86/kernel/irq.c130
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c40
-rw-r--r--arch/x86/kernel/k8.c14
-rw-r--r--arch/x86/kernel/kgdb.c241
-rw-r--r--arch/x86/kernel/kprobes.c861
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c410
-rw-r--r--arch/x86/kernel/microcode_amd.c61
-rw-r--r--arch/x86/kernel/microcode_core.c28
-rw-r--r--arch/x86/kernel/microcode_intel.c47
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c7
-rw-r--r--arch/x86/kernel/mpparse.c54
-rw-r--r--arch/x86/kernel/mrst.c216
-rw-r--r--arch/x86/kernel/msr.c25
-rw-r--r--arch/x86/kernel/olpc.c12
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c4
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c102
-rw-r--r--arch/x86/kernel/pci-dma.c64
-rw-r--r--arch/x86/kernel/pci-gart_64.c166
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c21
-rw-r--r--arch/x86/kernel/process.c137
-rw-r--r--arch/x86/kernel/process_32.c115
-rw-r--r--arch/x86/kernel/process_64.c132
-rw-r--r--arch/x86/kernel/ptrace.c493
-rw-r--r--arch/x86/kernel/quirks.c22
-rw-r--r--arch/x86/kernel/reboot.c38
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c3
-rw-r--r--arch/x86/kernel/setup.c166
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/signal.c24
-rw-r--r--arch/x86/kernel/smpboot.c83
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/sys_i386_32.c210
-rw-r--r--arch/x86/kernel/sys_x86_64.c29
-rw-r--r--arch/x86/kernel/syscall_table_32.S7
-rw-r--r--arch/x86/kernel/time.c7
-rw-r--r--arch/x86/kernel/tlb_uv.c13
-rw-r--r--arch/x86/kernel/trampoline.c30
-rw-r--r--arch/x86/kernel/trampoline_64.S4
-rw-r--r--arch/x86/kernel/traps.c76
-rw-r--r--arch/x86/kernel/tsc.c7
-rw-r--r--arch/x86/kernel/tsc_sync.c23
-rw-r--r--arch/x86/kernel/uv_irq.c238
-rw-r--r--arch/x86/kernel/uv_sysfs.c6
-rw-r--r--arch/x86/kernel/uv_time.c93
-rw-r--r--arch/x86/kernel/visws_quirks.c37
-rw-r--r--arch/x86/kernel/vm86_32.c11
-rw-r--r--arch/x86/kernel/vmi_32.c37
-rw-r--r--arch/x86/kernel/vmiclock_32.c10
-rw-r--r--arch/x86/kernel/vmlinux.lds.S49
-rw-r--r--arch/x86/kernel/vsyscall_64.c10
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c14
-rw-r--r--arch/x86/kernel/x86_init.c21
-rw-r--r--arch/x86/kernel/xsave.c1
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c589
-rw-r--r--arch/x86/kvm/i8254.c42
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c86
-rw-r--r--arch/x86/kvm/irq.h10
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c55
-rw-r--r--arch/x86/kvm/lapic.h8
-rw-r--r--arch/x86/kvm/mmu.c220
-rw-r--r--arch/x86/kvm/mmu.h35
-rw-r--r--arch/x86/kvm/paging_tmpl.h54
-rw-r--r--arch/x86/kvm/svm.c651
-rw-r--r--arch/x86/kvm/trace.h224
-rw-r--r--arch/x86/kvm/vmx.c836
-rw-r--r--arch/x86/kvm/x86.c1637
-rw-r--r--arch/x86/kvm/x86.h30
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile24
-rw-r--r--arch/x86/lib/cache-smp.c19
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S57
-rw-r--r--arch/x86/lib/copy_user_64.S20
-rw-r--r--arch/x86/lib/inat.c90
-rw-r--r--arch/x86/lib/insn.c516
-rw-r--r--arch/x86/lib/io_64.c25
-rw-r--r--arch/x86/lib/memcpy_64.S23
-rw-r--r--arch/x86/lib/memset_64.S18
-rw-r--r--arch/x86/lib/msr-smp.c204
-rw-r--r--arch/x86/lib/msr.c227
-rw-r--r--arch/x86/lib/rwsem_64.S81
-rw-r--r--arch/x86/lib/usercopy_32.c10
-rw-r--r--arch/x86/lib/x86-opcode-map.txt893
-rw-r--r--arch/x86/mm/extable.c31
-rw-r--r--arch/x86/mm/fault.c13
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/init.c11
-rw-r--r--arch/x86/mm/init_32.c28
-rw-r--r--arch/x86/mm/init_64.c63
-rw-r--r--arch/x86/mm/ioremap.c91
-rw-r--r--arch/x86/mm/k8topology_64.c101
-rw-r--r--arch/x86/mm/kmemcheck/error.c19
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c16
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h2
-rw-r--r--arch/x86/mm/kmmio.c57
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/mmio-mod.c71
-rw-r--r--arch/x86/mm/numa_32.c7
-rw-r--r--arch/x86/mm/numa_64.c506
-rw-r--r--arch/x86/mm/pageattr.c43
-rw-r--r--arch/x86/mm/pat.c23
-rw-r--r--arch/x86/mm/pgtable.c31
-rw-r--r--arch/x86/mm/setup_nx.c59
-rw-r--r--arch/x86/mm/srat_32.c2
-rw-r--r--arch/x86/mm/srat_64.c41
-rw-r--r--arch/x86/mm/testmmiotrace.c29
-rw-r--r--arch/x86/mm/tlb.c11
-rw-r--r--arch/x86/oprofile/backtrace.c9
-rw-r--r--arch/x86/oprofile/nmi_int.c20
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/oprofile/op_model_p4.c6
-rw-r--r--arch/x86/oprofile/op_model_ppro.c21
-rw-r--r--arch/x86/oprofile/op_x86_model.h20
-rw-r--r--arch/x86/pci/Makefile8
-rw-r--r--arch/x86/pci/acpi.c131
-rw-r--r--arch/x86/pci/amd_bus.c239
-rw-r--r--arch/x86/pci/bus_numa.c101
-rw-r--r--arch/x86/pci/bus_numa.h25
-rw-r--r--arch/x86/pci/common.c29
-rw-r--r--arch/x86/pci/early.c7
-rw-r--r--arch/x86/pci/i386.c60
-rw-r--r--arch/x86/pci/init.c8
-rw-r--r--arch/x86/pci/irq.c18
-rw-r--r--arch/x86/pci/legacy.c24
-rw-r--r--arch/x86/pci/mmconfig-shared.c355
-rw-r--r--arch/x86/pci/mmconfig_32.c16
-rw-r--r--arch/x86/pci/mmconfig_64.c88
-rw-r--r--arch/x86/pci/mrst.c262
-rw-r--r--arch/x86/pci/numaq_32.c12
-rw-r--r--arch/x86/pci/olpc.c3
-rw-r--r--arch/x86/pci/visws.c6
-rw-r--r--arch/x86/power/cpu.c26
-rw-r--r--arch/x86/tools/Makefile31
-rw-r--r--arch/x86/tools/chkobjdump.awk33
-rw-r--r--arch/x86/tools/distill.awk47
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk378
-rw-r--r--arch/x86/tools/test_get_len.c173
-rw-r--r--arch/x86/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/xen/debugfs.c2
-rw-r--r--arch/x86/xen/enlighten.c62
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/smp.c46
-rw-r--r--arch/x86/xen/spinlock.c16
-rw-r--r--arch/x86/xen/suspend.c17
-rw-r--r--arch/x86/xen/time.c31
-rw-r--r--arch/x86/xen/xen-asm_32.S4
-rw-r--r--arch/x86/xen/xen-asm_64.S4
-rw-r--r--arch/x86/xen/xen-ops.h2
407 files changed, 21923 insertions, 13337 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da93745c087..0eacb1ffb421 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
+ select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
@@ -45,11 +46,17 @@ config X86
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select USER_STACKTRACE_SUPPORT
+ select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_DMA_API_DEBUG
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_LZO
+ select HAVE_HW_BREAKPOINT
+ select PERF_EVENTS
+ select ANON_INODES
select HAVE_ARCH_KMEMCHECK
+ select HAVE_USER_RETURN_NOTIFIER
config OUTPUT_FORMAT
string
@@ -86,10 +93,6 @@ config STACKTRACE_SUPPORT
config HAVE_LATENCYTOP_SUPPORT
def_bool y
-config FAST_CMPXCHG_LOCAL
- bool
- default y
-
config MMU
def_bool y
@@ -99,6 +102,9 @@ config ZONE_DMA
config SBUS
bool
+config NEED_DMA_MAP_STATE
+ def_bool (X86_64 || DMAR || DMA_API_DEBUG)
+
config GENERIC_ISA_DMA
def_bool y
@@ -182,6 +188,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config HAVE_EARLY_RES
+ def_bool y
+
config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
@@ -387,8 +396,12 @@ config X86_ELAN
config X86_MRST
bool "Moorestown MID platform"
+ depends on PCI
+ depends on PCI_GOANY
depends on X86_32
depends on X86_EXTENDED_PLATFORM
+ depends on X86_IO_APIC
+ select APB_TIMER
---help---
Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
Internet Device(MID) platform. Moorestown consists of two chips:
@@ -423,6 +436,7 @@ config X86_32_NON_STANDARD
config X86_NUMAQ
bool "NUMAQ (IBM/Sequent)"
depends on X86_32_NON_STANDARD
+ depends on PCI
select NUMA
select X86_MPPARSE
---help---
@@ -495,7 +509,7 @@ if PARAVIRT_GUEST
source "arch/x86/xen/Kconfig"
config VMI
- bool "VMI Guest support"
+ bool "VMI Guest support (DEPRECATED)"
select PARAVIRT
depends on X86_32
---help---
@@ -504,6 +518,15 @@ config VMI
at the moment), by linking the kernel to a GPL-ed ROM module
provided by the hypervisor.
+ As of September 2009, VMware has started a phased retirement
+ of this feature from VMware's products. Please see
+ feature-removal-schedule.txt for details. If you are
+ planning to enable this option, please note that you cannot
+ live migrate a VMI enabled VM to a future VMware product,
+ which doesn't support VMI. So if you expect your kernel to
+ seamlessly migrate to newer VMware products, keep this
+ disabled.
+
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
@@ -558,6 +581,18 @@ config PARAVIRT_DEBUG
Enable to debug paravirt_ops internals. Specifically, BUG if
a paravirt_op is missing when it is called.
+config NO_BOOTMEM
+ default y
+ bool "Disable Bootmem code"
+ ---help---
+ Use early_res directly instead of bootmem before slab is ready.
+ - allocator (buddy) [generic]
+ - early allocator (bootmem) [generic]
+ - very early allocator (reserve_early*()) [x86]
+ - very very early allocator (early brk model) [x86]
+ So reduce one layer between early allocator to final allocator
+
+
config MEMTEST
bool "Memtest"
---help---
@@ -602,6 +637,16 @@ config HPET_EMULATE_RTC
def_bool y
depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+config APB_TIMER
+ def_bool y if MRST
+ prompt "Langwell APB Timer Support" if X86_MRST
+ help
+ APB timer is the replacement for 8254, HPET on X86 MID platforms.
+ The APBT provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. APB timers are always running regardless of CPU
+ C states, they are used as per CPU clockevent device when possible.
+
# Mark as embedded because too many people got it wrong.
# The code disables itself when not needed.
config DMI
@@ -617,7 +662,7 @@ config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- depends on X86_64 && PCI
+ depends on X86_64 && PCI && K8_NB
---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
@@ -979,12 +1024,6 @@ config X86_CPUID
with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
/dev/cpu/31/cpuid.
-config X86_CPU_DEBUG
- tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
- ---help---
- If you select this option, this will provide various x86 CPUs
- information through debugfs.
-
choice
prompt "High Memory Support"
default HIGHMEM4G if !X86_NUMAQ
@@ -1237,6 +1276,11 @@ config ARCH_MEMORY_PROBE
def_bool X86_64
depends on MEMORY_HOTPLUG
+config ILLEGAL_POINTER_VALUE
+ hex
+ default 0 if X86_32
+ default 0xdead000000000000 if X86_64
+
source "mm/Kconfig"
config HIGHPTE
@@ -1325,7 +1369,9 @@ config MATH_EMULATION
kernel, it won't hurt.
config MTRR
- bool "MTRR (Memory Type Range Register) support"
+ bool
+ default y
+ prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
@@ -1391,7 +1437,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
config X86_PAT
bool
- prompt "x86 PAT support"
+ default y
+ prompt "x86 PAT support" if EMBEDDED
depends on MTRR
---help---
Use PAT attributes to setup page level cache control.
@@ -1438,12 +1485,8 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
-config CC_STACKPROTECTOR_ALL
- bool
-
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- select CC_STACKPROTECTOR_ALL
---help---
This option turns on the -fstack-protector GCC feature. This
feature puts, at the beginning of functions, a canary value on
@@ -1601,7 +1644,7 @@ config COMPAT_VDSO
depends on X86_32 || IA32_EMULATION
---help---
Map the 32-bit VDSO to the predictable old-style address too.
- ---help---
+
Say N here if you are running a sufficiently recent glibc
version (2.3.3 or later), to remove the high-mapped
VDSO mapping and to exclusively use the randomized VDSO.
@@ -2006,18 +2049,9 @@ config SCx200HR_TIMER
processor goes idle (as is done by the scheduler). The
other workaround is idle=poll boot option.
-config GEODE_MFGPT_TIMER
- def_bool y
- prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
- depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
- ---help---
- This driver provides a clock event source based on the MFGPT
- timer(s) in the CS5535 and CS5536 companion chip for the geode.
- MFGPTs have a better resolution and max interval than the
- generic PIT, and are suitable for use as high-res timers.
-
config OLPC
bool "One Laptop Per Child support"
+ select GPIOLIB
default n
---help---
Add support for detecting the unique features of the OLPC
@@ -2027,7 +2061,7 @@ endif # X86_32
config K8_NB
def_bool y
- depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA)))
+ depends on CPU_SUP_AMD && PCI
source "drivers/pcmcia/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 527519b8a9f9..a19829374e6a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
#
# Define implied options from the CPU selection here
-config X86_L1_CACHE_BYTES
+config X86_INTERNODE_CACHE_SHIFT
int
- default "128" if MPSC
- default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
-
-config X86_INTERNODE_CACHE_BYTES
- int
- default "4096" if X86_VSMP
- default X86_L1_CACHE_BYTES if !X86_VSMP
+ default "12" if X86_VSMP
+ default "7" if NUMA
+ default X86_L1_CACHE_SHIFT
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
@@ -317,13 +313,13 @@ config X86_CMPXCHG
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
config X86_XADD
def_bool y
- depends on X86_32 && !M386
+ depends on X86_64 || !M386
config X86_PPRO_FENCE
bool "PentiumPro memory ordering errata workaround"
@@ -400,18 +396,19 @@ config X86_TSC
config X86_CMPXCHG64
def_bool y
- depends on X86_PAE || X86_64
+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
# this should be set for all -march=.. options where the compiler
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM)
+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
default "6" if X86_32 && X86_P6_NOP
+ default "5" if X86_32 && X86_CMPXCHG64
default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
default "3"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..bc01e3ebfeb2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
config HAVE_MMIOTRACE_SUPPORT
def_bool y
+config X86_DECODER_SELFTEST
+ bool "x86 instruction decoder selftest"
+ depends on DEBUG_KERNEL && KPROBES
+ ---help---
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
+
#
# IO delay types:
#
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
If unsure, say N.
+config DEBUG_STRICT_USER_COPY_CHECKS
+ bool "Strict copy size checks"
+ depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
+ ---help---
+ Enabling this option turns a certain set of sanity checks for user
+ copy operations into compile time failures.
+
+ The copy_from_user() etc checks are there to help test if there
+ are sufficient security checks on the length argument of
+ the copy operation, by having gcc prove that the argument is
+ within bounds.
+
+ If unsure, or if you run an older (pre 4.4) gcc, say N.
+
endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a012ee8ef803..0a43dc515e4c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
stackp-y := -fstack-protector
- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
KBUILD_CFLAGS += $(stackp-y)
else
$(warning stack protector enabled but no compiler support)
@@ -136,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
-ifeq ($(CONFIG_X86_32),y)
drivers-$(CONFIG_FB) += arch/x86/video/
-endif
####
# boot loader support. Several targets are kept for legacy purposes
@@ -156,6 +153,9 @@ all: bzImage
KBUILD_IMAGE := $(boot)/bzImage
bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+ $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 30e9a264f69d..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -41,11 +41,18 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486
# Geode GX1 support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
-
+cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
# add at the end to overwrite eventual tuning options from earlier
# cpu entries
cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
+# Work around the pentium-mmx code generator madness of gcc4.4.x which
+# does stack alignment by generating horrible code _before_ the mcount
+# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
+# tracer assumptions. For i686, generic, core2 this is set by the
+# compiler anyway
+cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
+
# Bug fix for binutils: this option is required in order to keep
# binutils from generating NOPL instructions against our will.
ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f8ed0658404c..fbb47daf2459 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,11 +4,12 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lzo)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_LZO) := lzo
quiet_cmd_mkpiggy = MKPIGGY $@
cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
lgdt gdt(%ebp)
/* Enable PAE mode */
- xorl %eax, %eax
- orl $(X86_CR4_PAE), %eax
+ movl $(X86_CR4_PAE), %eax
movl %eax, %cr4
/*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 842b2a36174a..51e240779a44 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -19,11 +19,6 @@
#define _ASM_X86_DESC_H 1
#endif
-#ifdef CONFIG_X86_64
-#define _LINUX_STRING_H_ 1
-#define __LINUX_BITMAP_H 1
-#endif
-
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
@@ -131,8 +126,8 @@ static void error(char *m);
static struct boot_params *real_mode; /* Pointer to real-mode data */
static int quiet;
-static void *memset(void *s, int c, unsigned n);
-void *memcpy(void *dest, const void *src, unsigned n);
+void *memset(void *s, int c, size_t n);
+void *memcpy(void *dest, const void *src, size_t n);
static void __putstr(int, const char *);
#define putstr(__x) __putstr(0, __x)
@@ -162,6 +157,10 @@ static int lines, cols;
#include "../../../../lib/decompress_unlzma.c"
#endif
+#ifdef CONFIG_KERNEL_LZO
+#include "../../../../lib/decompress_unlzo.c"
+#endif
+
static void scroll(void)
{
int i;
@@ -181,11 +180,9 @@ static void __putstr(int error, const char *s)
return;
#endif
-#ifdef CONFIG_X86_32
if (real_mode->screen_info.orig_video_mode == 0 &&
lines == 0 && cols == 0)
return;
-#endif
x = real_mode->screen_info.orig_x;
y = real_mode->screen_info.orig_y;
@@ -219,7 +216,7 @@ static void __putstr(int error, const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-static void *memset(void *s, int c, unsigned n)
+void *memset(void *s, int c, size_t n)
{
int i;
char *ss = s;
@@ -229,7 +226,7 @@ static void *memset(void *s, int c, unsigned n)
return s;
}
-void *memcpy(void *dest, const void *src, unsigned n)
+void *memcpy(void *dest, const void *src, size_t n)
{
int i;
const char *s = src;
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index bbeb0c3fbd90..89bbf4e4d05d 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -9,6 +9,9 @@
#include <byteswap.h>
#define USE_BSD
#include <endian.h>
+#include <regex.h>
+
+static void die(char *fmt, ...);
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
static Elf32_Ehdr ehdr;
@@ -30,25 +33,47 @@ static struct section *secs;
* the address for which it has been compiled. Don't warn user about
* absolute relocations present w.r.t these symbols.
*/
-static const char* safe_abs_relocs[] = {
- "xen_irq_disable_direct_reloc",
- "xen_save_fl_direct_reloc",
-};
+static const char abs_sym_regex[] =
+ "^(xen_irq_disable_direct_reloc$|"
+ "xen_save_fl_direct_reloc$|"
+ "VDSO|"
+ "__crc_)";
+static regex_t abs_sym_regex_c;
+static int is_abs_reloc(const char *sym_name)
+{
+ return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0);
+}
-static int is_safe_abs_reloc(const char* sym_name)
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+static const char rel_sym_regex[] =
+ "^_end$";
+static regex_t rel_sym_regex_c;
+static int is_rel_reloc(const char *sym_name)
{
- int i;
+ return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0);
+}
- for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) {
- if (!strcmp(sym_name, safe_abs_relocs[i]))
- /* Match found */
- return 1;
- }
- if (strncmp(sym_name, "VDSO", 4) == 0)
- return 1;
- if (strncmp(sym_name, "__crc_", 6) == 0)
- return 1;
- return 0;
+static void regex_init(void)
+{
+ char errbuf[128];
+ int err;
+
+ err = regcomp(&abs_sym_regex_c, abs_sym_regex,
+ REG_EXTENDED|REG_NOSUB);
+ if (err) {
+ regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf);
+ die("%s", errbuf);
+ }
+
+ err = regcomp(&rel_sym_regex_c, rel_sym_regex,
+ REG_EXTENDED|REG_NOSUB);
+ if (err) {
+ regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf);
+ die("%s", errbuf);
+ }
}
static void die(char *fmt, ...)
@@ -131,7 +156,7 @@ static const char *rel_type(unsigned type)
#undef REL_TYPE
};
const char *name = "unknown type rel type name";
- if (type < ARRAY_SIZE(type_name)) {
+ if (type < ARRAY_SIZE(type_name) && type_name[type]) {
name = type_name[type];
}
return name;
@@ -448,7 +473,7 @@ static void print_absolute_relocs(void)
* Before warning check if this absolute symbol
* relocation is harmless.
*/
- if (is_safe_abs_reloc(name))
+ if (is_abs_reloc(name) || is_rel_reloc(name))
continue;
if (!printed) {
@@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
r_type = ELF32_R_TYPE(rel->r_info);
/* Don't visit relocations to absolute symbols */
- if (sym->st_shndx == SHN_ABS) {
+ if (sym->st_shndx == SHN_ABS &&
+ !is_rel_reloc(sym_name(sym_strtab, sym))) {
continue;
}
- if (r_type == R_386_NONE || r_type == R_386_PC32) {
+ switch (r_type) {
+ case R_386_NONE:
+ case R_386_PC32:
/*
* NONE can be ignored and and PC relative
* relocations don't need to be adjusted.
*/
- }
- else if (r_type == R_386_32) {
+ break;
+ case R_386_32:
/* Visit relocations that need to be adjusted */
visit(rel, sym);
- }
- else {
- die("Unsupported relocation type: %d\n", r_type);
+ break;
+ default:
+ die("Unsupported relocation type: %s (%d)\n",
+ rel_type(r_type), r_type);
+ break;
}
}
}
@@ -571,16 +601,15 @@ static void emit_relocs(int as_text)
}
else {
unsigned char buf[4];
- buf[0] = buf[1] = buf[2] = buf[3] = 0;
/* Print a stop */
- printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+ fwrite("\0\0\0\0", 4, 1, stdout);
/* Now print each relocation */
for (i = 0; i < reloc_count; i++) {
buf[0] = (relocs[i] >> 0) & 0xff;
buf[1] = (relocs[i] >> 8) & 0xff;
buf[2] = (relocs[i] >> 16) & 0xff;
buf[3] = (relocs[i] >> 24) & 0xff;
- printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
+ fwrite(buf, 4, 1, stdout);
}
}
}
@@ -598,6 +627,8 @@ int main(int argc, char **argv)
FILE *fp;
int i;
+ regex_init();
+
show_absolute_syms = 0;
show_absolute_relocs = 0;
as_text = 0;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
#undef i386
+#include <asm/cache.h>
#include <asm/page_types.h>
#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
*(.data.*)
_edata = . ;
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b31cc54b4641..93e689f4bd86 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -16,7 +16,7 @@
*/
#include <asm/segment.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
#include <asm/boot.h>
#include <asm/e820.h>
#include <asm/page_types.h>
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..919257f526f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -22,7 +22,7 @@ int main(void)
int i, j;
const char *str;
- printf("static const char x86_cap_strs[] = \n");
+ printf("static const char x86_cap_strs[] =\n");
for (i = 0; i < NCAPINTS; i++) {
for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0f6ec455a2b1..03c0683636b6 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -53,6 +53,9 @@ SECTIONS
/DISCARD/ : { *(.note*) }
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT(_end <= 0x8000, "Setup too big!");
. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
/* Necessary for the very-old-loader check to work... */
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index 2723d9b5ce43..2b15aa488ffb 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -13,8 +13,8 @@
*/
#include "boot.h"
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
+#include <generated/utsrelease.h>
+#include <generated/compile.h>
const char kernel_version[] =
UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 819caa1f2008..ed7aeff786b2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void)
{
struct biosregs ireg, oreg;
u16 ax;
- u8 rows;
u8 mode;
initregs(&ireg);
+ /* Query current mode */
ax = 0x0f00;
intcall(0x10, &ireg, &oreg);
mode = oreg.al;
- set_fs(0);
- rows = rdfs8(0x484); /* rows minus one */
-
- if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
- (rows == 0 || rows == 24))
- return mode;
-
if (mode != 3 && mode != 7)
mode = 3;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..43eda284d27f 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
boot_params.screen_info.orig_x = oreg.dl;
boot_params.screen_info.orig_y = oreg.dh;
+
+ if (oreg.ch & 0x20)
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+
+ if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
}
static void store_video_mode(void)
@@ -292,11 +298,18 @@ static void restore_screen(void)
}
/* Restore cursor position */
+ if (saved.curx >= xs)
+ saved.curx = xs-1;
+ if (saved.cury >= ys)
+ saved.cury = ys-1;
+
initregs(&ireg);
ireg.ah = 0x02; /* Set cursor position */
ireg.dh = saved.cury;
ireg.dl = saved.curx;
intcall(0x10, &ireg, NULL);
+
+ store_cursor_position();
}
void set_video(void)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
*/
#include <linux/linkage.h>
+#include <asm/inst.h>
.text
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
movups 0x10(%rsi), %xmm2 # other user key
movaps %xmm2, (%rcx)
add $0x10, %rcx
- # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
- # aeskeygenassist $0x1, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_256a
- # aeskeygenassist $0x2, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_256a
- # aeskeygenassist $0x4, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_256a
- # aeskeygenassist $0x8, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_256a
- # aeskeygenassist $0x10, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_256a
- # aeskeygenassist $0x20, %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1
call _key_expansion_256b
- # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
movq 0x10(%rsi), %xmm2 # other user key
- # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
- # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_192b
- # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_192a
- # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_192b
- # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_192a
- # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_192b
- # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_192a
- # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+ AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
call _key_expansion_192b
jmp .Ldec_key
.Lenc_key128:
- # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
call _key_expansion_128
- # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
call _key_expansion_128
- # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
call _key_expansion_128
- # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
call _key_expansion_128
- # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
call _key_expansion_128
- # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
call _key_expansion_128
- # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+ AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
call _key_expansion_128
- # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+ AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
call _key_expansion_128
- # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+ AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
call _key_expansion_128
- # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
- .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+ AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
sub $0x10, %rcx
@@ -231,8 +201,7 @@ ENTRY(aesni_set_key)
.align 4
.Ldec_key_loop:
movaps (%rdi), %xmm0
- # aesimc %xmm0, %xmm1
- .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
+ AESIMC %xmm0 %xmm1
movaps %xmm1, (%rsi)
add $0x10, %rdi
sub $0x10, %rsi
@@ -274,51 +243,37 @@ _aesni_enc1:
je .Lenc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps -0x50(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
.align 4
.Lenc192:
movaps -0x40(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps -0x30(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
.align 4
.Lenc128:
movaps -0x20(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps -0x10(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps (TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x10(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x20(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x30(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x40(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x50(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x60(TKEYP), KEY
- # aesenc KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
+ AESENC KEY STATE
movaps 0x70(TKEYP), KEY
- # aesenclast KEY, STATE # last round
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
+ AESENCLAST KEY STATE
ret
/*
@@ -353,135 +308,79 @@ _aesni_enc4:
je .L4enc192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps -0x50(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
#.align 4
.L4enc192:
movaps -0x40(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps -0x30(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
#.align 4
.L4enc128:
movaps -0x20(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps -0x10(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps (TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x10(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x20(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x30(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x40(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x50(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x60(TKEYP), KEY
- # aesenc KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
- # aesenc KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
- # aesenc KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
- # aesenc KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
movaps 0x70(TKEYP), KEY
- # aesenclast KEY, STATE1 # last round
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
- # aesenclast KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
- # aesenclast KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
- # aesenclast KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
+ AESENCLAST KEY STATE1 # last round
+ AESENCLAST KEY STATE2
+ AESENCLAST KEY STATE3
+ AESENCLAST KEY STATE4
ret
/*
@@ -518,51 +417,37 @@ _aesni_dec1:
je .Ldec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps -0x50(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
.align 4
.Ldec192:
movaps -0x40(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps -0x30(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
.align 4
.Ldec128:
movaps -0x20(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps -0x10(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps (TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x10(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x20(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x30(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x40(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x50(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x60(TKEYP), KEY
- # aesdec KEY, STATE
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
+ AESDEC KEY STATE
movaps 0x70(TKEYP), KEY
- # aesdeclast KEY, STATE # last round
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
+ AESDECLAST KEY STATE
ret
/*
@@ -597,135 +482,79 @@ _aesni_dec4:
je .L4dec192
add $0x20, TKEYP
movaps -0x60(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps -0x50(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
.align 4
.L4dec192:
movaps -0x40(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps -0x30(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
.align 4
.L4dec128:
movaps -0x20(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps -0x10(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps (TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x10(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x20(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x30(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x40(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x50(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x60(TKEYP), KEY
- # aesdec KEY, STATE1
- .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
- # aesdec KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
- # aesdec KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xde, 0xea
- # aesdec KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
movaps 0x70(TKEYP), KEY
- # aesdeclast KEY, STATE1 # last round
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
- # aesdeclast KEY, STATE2
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2
- # aesdeclast KEY, STATE3
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
- # aesdeclast KEY, STATE4
- .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
+ AESDECLAST KEY STATE1 # last round
+ AESDECLAST KEY STATE2
+ AESDECLAST KEY STATE3
+ AESDECLAST KEY STATE4
ret
/*
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 585edebe12cf..49c552c060e9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
return -EINVAL;
}
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
err = crypto_aes_expand_key(ctx, in_key, key_len);
else {
kernel_fpu_begin();
@@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
crypto_aes_encrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
@@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (irq_fpu_usable())
+ if (!irq_fpu_usable())
crypto_aes_decrypt_x86(ctx, dst, src);
else {
kernel_fpu_begin();
@@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req)
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- if (irq_fpu_usable()) {
+ if (!irq_fpu_usable()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
memcpy(cryptd_req, req, sizeof(*req));
@@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req)
struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
- if (irq_fpu_usable()) {
+ if (!irq_fpu_usable()) {
struct ablkcipher_request *cryptd_req =
ablkcipher_request_ctx(req);
memcpy(cryptd_req, req, sizeof(*req));
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal
+ * Erdinc Ozturk
+ * Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.data
+
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+ .octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+ .octa 0x00000001000000000000000000000001
+
+#define DATA %xmm0
+#define SHASH %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP %xmm5
+#define IN1 %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble: internal ABI
+ * input:
+ * DATA: operand1
+ * SHASH: operand2, hash_key << 1 mod poly
+ * output:
+ * DATA: operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+ movaps DATA, T1
+ pshufd $0b01001110, DATA, T2
+ pshufd $0b01001110, SHASH, T3
+ pxor DATA, T2
+ pxor SHASH, T3
+
+ PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
+ PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
+ PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
+ pxor DATA, T2
+ pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+ movaps T2, T3
+ pslldq $8, T3
+ psrldq $8, T2
+ pxor T3, DATA
+ pxor T2, T1 # <T1:DATA> is result of
+ # carry-less multiplication
+
+ # first phase of the reduction
+ movaps DATA, T3
+ psllq $1, T3
+ pxor DATA, T3
+ psllq $5, T3
+ pxor DATA, T3
+ psllq $57, T3
+ movaps T3, T2
+ pslldq $8, T2
+ psrldq $8, T3
+ pxor T2, DATA
+ pxor T3, T1
+
+ # second phase of the reduction
+ movaps DATA, T2
+ psrlq $5, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor T2, T1
+ pxor T1, DATA
+ ret
+
+/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+ENTRY(clmul_ghash_mul)
+ movups (%rdi), DATA
+ movups (%rsi), SHASH
+ movaps .Lbswap_mask, BSWAP
+ PSHUFB_XMM BSWAP DATA
+ call __clmul_gf128mul_ble
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+ ret
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const be128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+ cmp $16, %rdx
+ jb .Lupdate_just_ret # check length
+ movaps .Lbswap_mask, BSWAP
+ movups (%rdi), DATA
+ movups (%rcx), SHASH
+ PSHUFB_XMM BSWAP DATA
+.align 4
+.Lupdate_loop:
+ movups (%rsi), IN1
+ PSHUFB_XMM BSWAP IN1
+ pxor IN1, DATA
+ call __clmul_gf128mul_ble
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lupdate_loop
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+.Lupdate_just_ret:
+ ret
+
+/*
+ * void clmul_ghash_setkey(be128 *shash, const u8 *key);
+ *
+ * Calculate hash_key << 1 mod poly
+ */
+ENTRY(clmul_ghash_setkey)
+ movaps .Lbswap_mask, BSWAP
+ movups (%rsi), %xmm0
+ PSHUFB_XMM BSWAP %xmm0
+ movaps %xmm0, %xmm1
+ psllq $1, %xmm0
+ psrlq $63, %xmm1
+ movaps %xmm1, %xmm2
+ pslldq $8, %xmm1
+ psrldq $8, %xmm2
+ por %xmm1, %xmm0
+ # reduction
+ pshufd $0b00100100, %xmm2, %xmm1
+ pcmpeqd .Ltwo_one, %xmm1
+ pand .Lpoly, %xmm1
+ pxor %xmm1, %xmm0
+ movups %xmm0, (%rdi)
+ ret
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <asm/i387.h>
+
+#define GHASH_BLOCK_SIZE 16
+#define GHASH_DIGEST_SIZE 16
+
+void clmul_ghash_mul(char *dst, const be128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ const be128 *shash);
+
+void clmul_ghash_setkey(be128 *shash, const u8 *key);
+
+struct ghash_async_ctx {
+ struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+ be128 shash;
+};
+
+struct ghash_desc_ctx {
+ u8 buffer[GHASH_BLOCK_SIZE];
+ u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+
+ if (keylen != GHASH_BLOCK_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ clmul_ghash_setkey(&ctx->shash, key);
+
+ return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *dst = dctx->buffer;
+
+ kernel_fpu_begin();
+ if (dctx->bytes) {
+ int n = min(srclen, dctx->bytes);
+ u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ clmul_ghash_mul(dst, &ctx->shash);
+ }
+
+ clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ kernel_fpu_end();
+
+ if (srclen & 0xf) {
+ src += srclen - (srclen & 0xf);
+ srclen &= 0xf;
+ dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+ while (srclen--)
+ *dst++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+ u8 *dst = dctx->buffer;
+
+ if (dctx->bytes) {
+ u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ while (dctx->bytes--)
+ *tmp++ ^= 0;
+
+ kernel_fpu_begin();
+ clmul_ghash_mul(dst, &ctx->shash);
+ kernel_fpu_end();
+ }
+
+ dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *buf = dctx->buffer;
+
+ ghash_flush(ctx, dctx);
+ memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .final = ghash_final,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "__ghash",
+ .cra_driver_name = "__ghash-pclmulqdqni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ghash_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
+ },
+};
+
+static int ghash_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable()) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_init(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return crypto_shash_init(desc);
+ }
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+ if (!irq_fpu_usable()) {
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_update(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return shash_ahash_update(req, desc);
+ }
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+ if (!irq_fpu_usable()) {
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_final(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return crypto_shash_final(desc, req->result);
+ }
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable()) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_digest(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return shash_ahash_digest(req, desc);
+ }
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ahash_setkey(child, key, keylen);
+ crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+
+ return 0;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct cryptd_ahash *cryptd_tfm;
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+ .init = ghash_async_init,
+ .update = ghash_async_update,
+ .final = ghash_async_final,
+ .setkey = ghash_async_setkey,
+ .digest = ghash_async_digest,
+ .halg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-clmulni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_type = &crypto_ahash_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
+ .cra_init = ghash_async_init_tfm,
+ .cra_exit = ghash_async_exit_tfm,
+ },
+ },
+};
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+ int err;
+
+ if (!cpu_has_pclmulqdq) {
+ printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
+ " detected.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_shash(&ghash_alg);
+ if (err)
+ goto err_out;
+ err = crypto_register_ahash(&ghash_async_alg);
+ if (err)
+ goto err_shash;
+
+ return 0;
+
+err_shash:
+ crypto_unregister_shash(&ghash_alg);
+err_out:
+ return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+ crypto_unregister_ahash(&ghash_async_alg);
+ crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
+ "acclerated by PCLMULQDQ-NI");
+MODULE_ALIAS("ghash");
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 39b98ed2c1b9..575331cb2a8a 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -22,7 +22,7 @@
#include <asm/asm-offsets.h>
-/* return adress at 0 */
+/* return address at 0 */
#define in_blk 12 /* input byte array address parameter*/
#define out_blk 8 /* output byte array address parameter*/
@@ -230,8 +230,8 @@ twofish_enc_blk:
push %edi
mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
- add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
- mov in_blk+16(%esp),%edi /* input adress in edi */
+ add $crypto_tfm_ctx_offset, %ebp /* ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
mov (%edi), %eax
mov b_offset(%edi), %ebx
@@ -286,8 +286,8 @@ twofish_dec_blk:
mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
- add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
- mov in_blk+16(%esp),%edi /* input adress in edi */
+ add $crypto_tfm_ctx_offset, %ebp /* ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
mov (%edi), %eax
mov b_offset(%edi), %ebx
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 35974a586615..573aa102542e 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -221,11 +221,11 @@
twofish_enc_blk:
pushq R1
- /* %rdi contains the crypto tfm adress */
- /* %rsi contains the output adress */
- /* %rdx contains the input adress */
- add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */
- /* ctx adress is moved to free one non-rex register
+ /* %rdi contains the crypto tfm address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
+ /* ctx address is moved to free one non-rex register
as target for the 8bit high operations */
mov %rdi, %r11
@@ -274,11 +274,11 @@ twofish_enc_blk:
twofish_dec_blk:
pushq R1
- /* %rdi contains the crypto tfm adress */
- /* %rsi contains the output adress */
- /* %rdx contains the input adress */
- add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */
- /* ctx adress is moved to free one non-rex register
+ /* %rdi contains the crypto tfm address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
+ /* ctx address is moved to free one non-rex register
as target for the 8bit high operations */
mov %rdi, %r11
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2a4d073d2cf1..280c019cfad8 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -297,7 +297,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* size limits imposed on them by creating programs with large
* arrays in the data or bss.
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ rlim = rlimit(RLIMIT_DATA);
if (rlim >= RLIM_INFINITY)
rlim = ~0;
if (ex.a_data + ex.a_bss > rlim)
@@ -308,14 +308,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval)
return retval;
- regs->cs = __USER32_CS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
- regs->r13 = regs->r14 = regs->r15 = 0;
-
/* OK, This is the point of no return */
set_personality(PER_LINUX);
set_thread_flag(TIF_IA32);
- clear_thread_flag(TIF_ABI_PENDING);
+
+ setup_new_exec(bprm);
+
+ regs->cs = __USER32_CS;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+ regs->r13 = regs->r14 = regs->r15 = 0;
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
@@ -326,7 +327,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->free_area_cache = TASK_UNMAPPED_BASE;
current->mm->cached_hole_size = 0;
- current->mm->mmap = NULL;
install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 74619c4f9fda..59b4556a5b92 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -21,8 +21,8 @@
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit int_ret_from_sys_call
-#define sysretl_audit int_ret_from_sys_call
+#define sysexit_audit ia32_ret_from_sys_call
+#define sysretl_audit ia32_ret_from_sys_call
#endif
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
@@ -39,12 +39,12 @@
.endm
/* clobbers %eax */
- .macro CLEAR_RREGS _r9=rax
+ .macro CLEAR_RREGS offset=0, _r9=rax
xorl %eax,%eax
- movq %rax,R11(%rsp)
- movq %rax,R10(%rsp)
- movq %\_r9,R9(%rsp)
- movq %rax,R8(%rsp)
+ movq %rax,\offset+R11(%rsp)
+ movq %rax,\offset+R10(%rsp)
+ movq %\_r9,\offset+R9(%rsp)
+ movq %rax,\offset+R8(%rsp)
.endm
/*
@@ -172,6 +172,10 @@ sysexit_from_sys_call:
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_ARGS 1,24,1,1,1,1
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
popfq
CFI_ADJUST_CFA_OFFSET -8
/*CFI_RESTORE rflags*/
@@ -200,9 +204,9 @@ sysexit_from_sys_call:
movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
.endm
- .macro auditsys_exit exit,ebpsave=RBP
+ .macro auditsys_exit exit
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
- jnz int_ret_from_sys_call
+ jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
sti
movl %eax,%esi /* second arg, syscall return value */
@@ -213,13 +217,13 @@ sysexit_from_sys_call:
call audit_syscall_exit
GET_THREAD_INFO(%r10)
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
- movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
cli
TRACE_IRQS_OFF
testl %edi,TI_flags(%r10)
- jnz int_with_check
- jmp \exit
+ jz \exit
+ CLEAR_RREGS -ARGOFFSET
+ jmp int_with_check
.endm
sysenter_auditsys:
@@ -329,6 +333,9 @@ sysretl_from_sys_call:
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
+ xorq %r10,%r10
+ xorq %r9,%r9
+ xorq %r8,%r8
TRACE_IRQS_ON
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
@@ -343,7 +350,7 @@ cstar_auditsys:
jmp cstar_dispatch
sysretl_audit:
- auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
+ auditsys_exit sysretl_from_sys_call
#endif
cstar_tracesys:
@@ -353,7 +360,7 @@ cstar_tracesys:
#endif
xchgl %r9d,%ebp
SAVE_REST
- CLEAR_RREGS r9
+ CLEAR_RREGS 0, r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
@@ -425,6 +432,8 @@ ia32_do_call:
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
movq %rax,RAX-ARGOFFSET(%rsp)
+ia32_ret_from_sys_call:
+ CLEAR_RREGS -ARGOFFSET
jmp int_ret_from_sys_call
ia32_tracesys:
@@ -442,8 +451,8 @@ END(ia32_syscall)
ia32_badsys:
movq $0,ORIG_RAX-ARGOFFSET(%rsp)
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp int_ret_from_sys_call
+ movq $-ENOSYS,%rax
+ jmp ia32_sysret
quiet_ni_syscall:
movq $-ENOSYS,%rax
@@ -554,7 +563,7 @@ ia32_sys_call_table:
.quad quiet_ni_syscall /* old mpx syscall holder */
.quad sys_setpgid
.quad quiet_ni_syscall /* old ulimit syscall holder */
- .quad sys32_olduname
+ .quad sys_olduname
.quad sys_umask /* 60 */
.quad sys_chroot
.quad compat_sys_ustat
@@ -577,7 +586,7 @@ ia32_sys_call_table:
.quad compat_sys_settimeofday
.quad sys_getgroups16 /* 80 */
.quad sys_setgroups16
- .quad sys32_old_select
+ .quad compat_sys_old_select
.quad sys_symlink
.quad sys_lstat
.quad sys_readlink /* 85 */
@@ -604,7 +613,7 @@ ia32_sys_call_table:
.quad compat_sys_newstat
.quad compat_sys_newlstat
.quad compat_sys_newfstat
- .quad sys32_uname
+ .quad sys_uname
.quad stub32_iopl /* 110 */
.quad sys_vhangup
.quad quiet_ni_syscall /* old "idle" system call */
@@ -644,7 +653,7 @@ ia32_sys_call_table:
.quad compat_sys_writev
.quad sys_getsid
.quad sys_fdatasync
- .quad sys32_sysctl /* sysctl */
+ .quad compat_sys_sysctl /* sysctl */
.quad sys_mlock /* 150 */
.quad sys_munlock
.quad sys_mlockall
@@ -687,7 +696,7 @@ ia32_sys_call_table:
.quad quiet_ni_syscall /* streams2 */
.quad stub32_vfork /* 190 */
.quad compat_sys_getrlimit
- .quad sys32_mmap2
+ .quad sys_mmap_pgoff
.quad sys32_truncate64
.quad sys32_ftruncate64
.quad sys32_stat64 /* 195 */
@@ -832,4 +841,5 @@ ia32_sys_call_table:
.quad compat_sys_pwritev
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
+ .quad compat_sys_recvmmsg
ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..74c35431b7d8 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -143,7 +143,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
* block for parameter passing..
*/
-struct mmap_arg_struct {
+struct mmap_arg_struct32 {
unsigned int addr;
unsigned int len;
unsigned int prot;
@@ -152,12 +152,9 @@ struct mmap_arg_struct {
unsigned int offset;
};
-asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
+asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
{
- struct mmap_arg_struct a;
- struct file *file = NULL;
- unsigned long retval;
- struct mm_struct *mm ;
+ struct mmap_arg_struct32 a;
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
@@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
if (a.offset & ~PAGE_MASK)
return -EINVAL;
- if (!(a.flags & MAP_ANONYMOUS)) {
- file = fget(a.fd);
- if (!file)
- return -EBADF;
- }
-
- mm = current->mm;
- down_write(&mm->mmap_sem);
- retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
+ return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
a.offset>>PAGE_SHIFT);
- if (file)
- fput(file);
-
- up_write(&mm->mmap_sem);
-
- return retval;
}
asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -349,24 +332,6 @@ asmlinkage long sys32_alarm(unsigned int seconds)
return alarm_setitimer(seconds);
}
-struct sel_arg_struct {
- unsigned int n;
- unsigned int inp;
- unsigned int outp;
- unsigned int exp;
- unsigned int tvp;
-};
-
-asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
-{
- struct sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
- compat_ptr(a.exp), compat_ptr(a.tvp));
-}
-
asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
int options)
{
@@ -434,62 +399,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
return ret;
}
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct sysctl_ia32 {
- unsigned int name;
- int nlen;
- unsigned int oldval;
- unsigned int oldlenp;
- unsigned int newval;
- unsigned int newlen;
- unsigned int __unused[4];
-};
-
-
-asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
-{
- struct sysctl_ia32 a32;
- mm_segment_t old_fs = get_fs();
- void __user *oldvalp, *newvalp;
- size_t oldlen;
- int __user *namep;
- long ret;
-
- if (copy_from_user(&a32, args32, sizeof(a32)))
- return -EFAULT;
-
- /*
- * We need to pre-validate these because we have to disable
- * address checking before calling do_sysctl() because of
- * OLDLEN but we can't run the risk of the user specifying bad
- * addresses here. Well, since we're dealing with 32 bit
- * addresses, we KNOW that access_ok() will always succeed, so
- * this is an expensive NOP, but so what...
- */
- namep = compat_ptr(a32.name);
- oldvalp = compat_ptr(a32.oldval);
- newvalp = compat_ptr(a32.newval);
-
- if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
- || !access_ok(VERIFY_WRITE, namep, 0)
- || !access_ok(VERIFY_WRITE, oldvalp, 0)
- || !access_ok(VERIFY_WRITE, newvalp, 0))
- return -EFAULT;
-
- set_fs(KERNEL_DS);
- lock_kernel();
- ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
- newvalp, (size_t) a32.newlen);
- unlock_kernel();
- set_fs(old_fs);
-
- if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
- return -EFAULT;
-
- return ret;
-}
-#endif
-
/* warning: next two assume little endian */
asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
u32 poslo, u32 poshi)
@@ -539,82 +448,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
return ret;
}
-asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
-{
- struct mm_struct *mm = current->mm;
- unsigned long error;
- struct file *file = NULL;
-
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- return -EBADF;
- }
-
- down_write(&mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&mm->mmap_sem);
-
- if (file)
- fput(file);
- return error;
-}
-
-asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
-{
- char *arch = "x86_64";
- int err;
-
- if (!name)
- return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
-
- down_read(&uts_sem);
-
- err = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->release+__OLD_UTS_LEN);
- err |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- err |= __put_user(0, name->version+__OLD_UTS_LEN);
-
- if (personality(current->personality) == PER_LINUX32)
- arch = "i686";
-
- err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
-
- up_read(&uts_sem);
-
- err = err ? -EFAULT : 0;
-
- return err;
-}
-
-long sys32_uname(struct old_utsname __user *name)
-{
- int err;
-
- if (!name)
- return -EFAULT;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
-
- return err ? -EFAULT : 0;
-}
-
asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
compat_uptr_t __user *envp, struct pt_regs *regs)
{
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..493092efaa3b 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,8 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += hw_breakpoint.h
+header-y += hyperv.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
#include <linux/user.h>
#include <linux/elfcore.h>
+#include <asm/debugreg.h>
/*
* fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
>> PAGE_SHIFT;
dump->u_dsize -= dump->u_tsize;
dump->u_ssize = 0;
- dump->u_debugreg[0] = current->thread.debugreg0;
- dump->u_debugreg[1] = current->thread.debugreg1;
- dump->u_debugreg[2] = current->thread.debugreg2;
- dump->u_debugreg[3] = current->thread.debugreg3;
- dump->u_debugreg[4] = 0;
- dump->u_debugreg[5] = 0;
- dump->u_debugreg[6] = current->thread.debugreg6;
- dump->u_debugreg[7] = current->thread.debugreg7;
+ aout_dump_debugregs(dump);
if (dump->start_stack < TASK_SIZE)
dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..56f462cf22d2 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
extern unsigned long acpi_wakeup_address;
/* early initialization routine */
-extern void acpi_reserve_bootmem(void);
+extern void acpi_reserve_wakeup_memory(void);
/*
* Check if the CPU can handle C2 and deeper
@@ -142,6 +142,32 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
return max_cstate;
}
+static inline bool arch_has_acpi_pdc(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ return (c->x86_vendor == X86_VENDOR_INTEL ||
+ c->x86_vendor == X86_VENDOR_CENTAUR);
+}
+
+static inline void arch_acpi_set_pdc_bits(u32 *buf)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
+
+ if (cpu_has(c, X86_FEATURE_EST))
+ buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
+
+ if (cpu_has(c, X86_FEATURE_ACPI))
+ buf[2] |= ACPI_PDC_T_FFH;
+
+ /*
+ * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+}
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
@@ -158,6 +184,7 @@ struct bootnode;
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
+extern int acpi_get_nodes(struct bootnode *physnodes);
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c33..b97f786a48d5 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_32
-# define X86_ALIGN .long
-#else
-# define X86_ALIGN .quad
-#endif
+#include <asm/asm.h>
#ifdef CONFIG_SMP
.macro LOCK_PREFIX
1: lock
.section .smp_locks,"a"
- .align 4
- X86_ALIGN 1b
+ _ASM_ALIGN
+ _ASM_PTR 1b
.previous
.endm
#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e00..b09ec55650b3 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
void *text, void *text_end);
extern void alternatives_smp_module_del(struct module *mod);
extern void alternatives_smp_switch(int smp);
+extern int alternatives_text_reserved(void *start, void *end);
#else
static inline void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end) {}
static inline void alternatives_smp_module_del(struct module *mod) {}
static inline void alternatives_smp_switch(int smp) {}
+static inline int alternatives_text_reserved(void *start, void *end)
+{
+ return 0;
+}
#endif /* CONFIG_SMP */
/* alternative assembly primitive: */
@@ -84,6 +89,7 @@ static inline void alternatives_smp_switch(int smp) {}
" .byte " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
+ " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
".previous\n" \
".section .altinstr_replacement, \"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
@@ -124,11 +130,16 @@ static inline void alternatives_smp_switch(int smp) {}
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: output : "i" (0), ## input)
+/* Like alternative_io, but for replacing a direct call with another one. */
+#define alternative_call(oldfunc, newfunc, feature, output, input...) \
+ asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
+
/*
* use this macro(s) if you need more than one output parameter
* in alternative_io
*/
-#define ASM_OUTPUT2(a, b) a, b
+#define ASM_OUTPUT2(a...) a
struct paravirt_patch_site;
#ifdef CONFIG_PARAVIRT
@@ -154,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
* invalid instruction possible) or if the instructions are changed from a
* consistent state to another consistent state atomically.
* More care must be taken when modifying code in the SMP case because of
- * Intel's errata.
+ * Intel's errata. text_poke_smp() takes care that errata, but still
+ * doesn't support NMI/MCE handler code modifying.
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995b7bad..5af2982133b5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -23,18 +23,13 @@
#include <linux/irqreturn.h>
#ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
+
extern void amd_iommu_detect(void);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_shutdown(void);
+
#else
-static inline int amd_iommu_init(void) { return -ENODEV; }
+
static inline void amd_iommu_detect(void) { }
-static inline void amd_iommu_shutdown(void) { }
+
#endif
#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 000000000000..d2544f1d705d
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
+#define _ASM_X86_AMD_IOMMU_PROTO_H
+
+struct amd_iommu;
+
+extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
+extern void amd_iommu_apply_erratum_63(u16 devid);
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+extern int amd_iommu_init_devices(void);
+extern void amd_iommu_uninit_devices(void);
+extern void amd_iommu_init_notifier(void);
+extern void amd_iommu_init_api(void);
+#ifndef CONFIG_AMD_IOMMU_STATS
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* !CONFIG_AMD_IOMMU_STATS */
+
+#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a81..ba19ad4c47d0 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -25,6 +25,11 @@
#include <linux/spinlock.h>
/*
+ * Maximum number of IOMMUs supported
+ */
+#define MAX_IOMMUS 32
+
+/*
* some size calculation constants
*/
#define DEV_TABLE_ENTRY_SIZE 32
@@ -206,6 +211,9 @@ extern bool amd_iommu_dump;
printk(KERN_INFO "AMD-Vi: " format, ## arg); \
} while(0);
+/* global flag if IOMMUs cache non-present entries */
+extern bool amd_iommu_np_cache;
+
/*
* Make iterating over all IOMMUs easier
*/
@@ -226,6 +234,8 @@ extern bool amd_iommu_dump;
* independent of their use.
*/
struct protection_domain {
+ struct list_head list; /* for list of all protection domains */
+ struct list_head dev_list; /* List of all devices in this domain */
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
@@ -233,7 +243,20 @@ struct protection_domain {
unsigned long flags; /* flags to find out type of domain */
bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
+ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
void *priv; /* private data */
+
+};
+
+/*
+ * This struct contains device specific data for the IOMMU
+ */
+struct iommu_dev_data {
+ struct list_head list; /* For domain->dev_list */
+ struct device *dev; /* Device this data belong to */
+ struct device *alias; /* The Alias Device */
+ struct protection_domain *domain; /* Domain the device is bound to */
+ atomic_t bind; /* Domain attach reverent count */
};
/*
@@ -291,6 +314,9 @@ struct dma_ops_domain {
struct amd_iommu {
struct list_head list;
+ /* Index within the IOMMU array */
+ int index;
+
/* locks the accesses to the hardware */
spinlock_t lock;
@@ -357,6 +383,21 @@ struct amd_iommu {
extern struct list_head amd_iommu_list;
/*
+ * Array with pointers to each IOMMU struct
+ * The indices are referenced in the protection domains
+ */
+extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
+
+/* Number of IOMMUs present in the system */
+extern int amd_iommus_present;
+
+/*
+ * Declarations for the global list of all protection domains
+ */
+extern spinlock_t amd_iommu_pd_lock;
+extern struct list_head amd_iommu_pd_list;
+
+/*
* Structure defining one entry in the device table
*/
struct dev_table_entry {
@@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order;
/* largest PCI device id we expect translation requests for */
extern u16 amd_iommu_last_bdf;
-/* data structures for protection domain handling */
-extern struct protection_domain **amd_iommu_pd_table;
-
/* allocation bitmap for domain ids */
extern unsigned long *amd_iommu_pd_alloc_bitmap;
-/* will be 1 if device isolation is enabled */
-extern bool amd_iommu_isolate;
-
/*
* If true, the addresses will be flushed on unmap time, not when
* they are reused
@@ -462,11 +497,6 @@ struct __iommu_counter {
#define ADD_STATS_COUNTER(name, x)
#define SUB_STATS_COUNTER(name, x)
-static inline void amd_iommu_stats_init(void) { }
-
#endif /* CONFIG_AMD_IOMMU_STATS */
-/* some function prototypes */
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
new file mode 100644
index 000000000000..c74a2eebe570
--- /dev/null
+++ b/arch/x86/include/asm/apb_timer.h
@@ -0,0 +1,70 @@
+/*
+ * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ */
+
+#ifndef ASM_X86_APBT_H
+#define ASM_X86_APBT_H
+#include <linux/sfi.h>
+
+#ifdef CONFIG_APB_TIMER
+
+/* Langwell DW APB timer registers */
+#define APBTMR_N_LOAD_COUNT 0x00
+#define APBTMR_N_CURRENT_VALUE 0x04
+#define APBTMR_N_CONTROL 0x08
+#define APBTMR_N_EOI 0x0c
+#define APBTMR_N_INT_STATUS 0x10
+
+#define APBTMRS_INT_STATUS 0xa0
+#define APBTMRS_EOI 0xa4
+#define APBTMRS_RAW_INT_STATUS 0xa8
+#define APBTMRS_COMP_VERSION 0xac
+#define APBTMRS_REG_SIZE 0x14
+
+/* register bits */
+#define APBTMR_CONTROL_ENABLE (1<<0)
+#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
+#define APBTMR_CONTROL_INT (1<<2)
+
+/* default memory mapped register base */
+#define LNW_SCU_ADDR 0xFF100000
+#define LNW_EXT_TIMER_OFFSET 0x1B800
+#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
+#define LNW_EXT_TIMER_PGOFFSET 0x800
+
+/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
+#define APBT_MAX_FREQ 50
+#define APBT_MIN_FREQ 1
+#define APBT_MMAP_SIZE 1024
+
+#define APBT_DEV_USED 1
+
+extern void apbt_time_init(void);
+extern struct clock_event_device *global_clock_event;
+extern unsigned long apbt_quick_calibrate(void);
+extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
+extern void apbt_setup_secondary_clock(void);
+extern unsigned int boot_cpu_id;
+extern int disable_apbt_percpu;
+
+extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
+extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
+extern int sfi_mtimer_num;
+
+#else /* CONFIG_APB_TIMER */
+
+static inline unsigned long apbt_quick_calibrate(void) {return 0; }
+static inline void apbt_time_init(void) {return 0; }
+
+#endif
+#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 474d80d3e6cc..b4ac2cdcb64f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -297,20 +297,20 @@ struct apic {
int disable_esr;
int dest_logical;
- unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+ unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
unsigned long (*check_apicid_present)(int apicid);
void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
void (*init_apic_ldr)(void);
- physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+ void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
void (*setup_apic_routing)(void);
int (*multi_timer_check)(int apic, int irq);
int (*apicid_to_node)(int logical_apicid);
int (*cpu_to_logical_apicid)(int cpu);
int (*cpu_present_to_apicid)(int mps_cpu);
- physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+ void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
void (*setup_portio_remap)(void);
int (*check_phys_apicid_present)(int phys_apicid);
void (*enable_apic_mode)(void);
@@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void)
extern void default_setup_apic_routing(void);
+extern struct apic apic_noop;
+
#ifdef CONFIG_X86_32
extern struct apic apic_default;
@@ -532,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return (unsigned int)(mask1 & mask2 & mask3);
}
-static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
+static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
{
- return physid_isset(apicid, bitmap);
+ return physid_isset(apicid, *map);
}
static inline unsigned long default_check_apicid_present(int bit)
@@ -542,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}
-static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map)
+static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
- return phys_map;
+ *retmap = *phys_map;
}
/* Mapping from cpu number to logical apicid */
@@ -583,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
extern int default_check_phys_apicid_present(int phys_apicid);
#endif
-static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3b62da926de9..7fe3b3060f08 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,6 +11,12 @@
#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+/*
+ * This is the IO-APIC register space as specified
+ * by Intel docs:
+ */
+#define IO_APIC_SLOT_SIZE 1024
+
#define APIC_ID 0x20
#define APIC_LVR 0x30
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_X86_APICNUM_H
-#define _ASM_X86_APICNUM_H
-
-/* define MAX_IO_APICS */
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
-
-#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000000..d370ee36a182
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
+#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 4e1b8873c474..8f8217b9bdac 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -1,5 +1,300 @@
+#ifndef _ASM_X86_ATOMIC_H
+#define _ASM_X86_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read(const atomic_t *v)
+{
+ return v->counter;
+}
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
+
+/**
+ * atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_sub_and_test(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_inc_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_add_return - add integer and return
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ int __i;
+#ifdef CONFIG_M386
+ unsigned long flags;
+ if (unlikely(boot_cpu_data.x86 <= 3))
+ goto no_xadd;
+#endif
+ /* Modern 486+ processor */
+ __i = i;
+ asm volatile(LOCK_PREFIX "xaddl %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+ raw_local_irq_save(flags);
+ __i = atomic_read(v);
+ atomic_set(v, i + __i);
+ raw_local_irq_restore(flags);
+ return i + __i;
+#endif
+}
+
+/**
+ * atomic_sub_return - subtract integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to subtract
+ *
+ * Atomically subtracts @i from @v and returns @v - @i
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline int atomic_xchg(atomic_t *v, int new)
+{
+ return xchg(&v->counter, new);
+}
+
+/**
+ * atomic_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
+/**
+ * atomic_inc_short - increment of a short integer
+ * @v: pointer to type int
+ *
+ * Atomically adds 1 to @v
+ * Returns the new value of @u
+ */
+static inline short int atomic_inc_short(short int *v)
+{
+ asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
+ return *v;
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * atomic_or_long - OR of two long integers
+ * @v1: pointer to type unsigned long
+ * @v2: pointer to type unsigned long
+ *
+ * Atomically ORs @v1 and @v2
+ * Returns the result of the OR
+ */
+static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
+{
+ asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
+}
+#endif
+
+/* These are x86-specific, used by some header files */
+#define atomic_clear_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "andl %0,%1" \
+ : : "r" (~(mask)), "m" (*(addr)) : "memory")
+
+#define atomic_set_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "orl %0,%1" \
+ : : "r" ((unsigned)(mask)), "m" (*(addr)) \
+ : "memory")
+
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() barrier()
+
#ifdef CONFIG_X86_32
-# include "atomic_32.h"
+# include "atomic64_32.h"
#else
-# include "atomic_64.h"
+# include "atomic64_64.h"
#endif
+
+#include <asm-generic/atomic-long.h>
+#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000000..03027bf28de5
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,160 @@
+#ifndef _ASM_X86_ATOMIC64_32_H
+#define _ASM_X86_ATOMIC64_32_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/processor.h>
+//#include <asm/cmpxchg.h>
+
+/* An 64bit atomic type */
+
+typedef struct {
+ u64 __aligned(8) counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline u64 atomic64_read(atomic64_t *ptr)
+{
+ u64 res;
+
+ /*
+ * Note, we inline this atomic64_t primitive because
+ * it only clobbers EAX/EDX and leaves the others
+ * untouched. We also (somewhat subtly) rely on the
+ * fact that cmpxchg8b returns the current 64-bit value
+ * of the memory location we are touching:
+ */
+ asm volatile(
+ "mov %%ebx, %%eax\n\t"
+ "mov %%ecx, %%edx\n\t"
+ LOCK_PREFIX "cmpxchg8b %1\n"
+ : "=&A" (res)
+ : "m" (*ptr)
+ );
+
+ return res;
+}
+
+extern u64 atomic64_read(atomic64_t *ptr);
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+
+/*
+ * Other variants with different arithmetic operators:
+ */
+extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
+extern u64 atomic64_inc_return(atomic64_t *ptr);
+extern u64 atomic64_dec_return(atomic64_t *ptr);
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+extern void atomic64_add(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+extern void atomic64_inc(atomic64_t *ptr);
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+extern void atomic64_dec(atomic64_t *ptr);
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+extern int atomic64_dec_and_test(atomic64_t *ptr);
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+extern int atomic64_inc_and_test(atomic64_t *ptr);
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+
+#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000000..51c5b4056929
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,224 @@
+#ifndef _ASM_X86_ATOMIC64_64_H
+#define _ASM_X86_ATOMIC64_64_H
+
+#include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/* The 64-bit atomic type */
+
+#define ATOMIC64_INIT(i) { (i) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+static inline long atomic64_read(const atomic64_t *v)
+{
+ return v->counter;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic64_set(atomic64_t *v, long i)
+{
+ v->counter = i;
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic64_add(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "addq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic64_sub(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "subq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_sub_and_test(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic64_inc(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "incq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic64_dec(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "decq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic64_add_negative(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline long atomic64_add_return(long i, atomic64_t *v)
+{
+ long __i = i;
+ asm volatile(LOCK_PREFIX "xaddq %0, %1;"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline long atomic64_sub_return(long i, atomic64_t *v)
+{
+ return atomic64_add_return(-i, v);
+}
+
+#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
+#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
+
+static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+static inline long atomic64_xchg(atomic64_t *v, long new)
+{
+ return xchg(&v->counter, new);
+}
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
+{
+ long c, old;
+ c = atomic64_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic64_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
+
+#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
deleted file mode 100644
index dc5a667ff791..000000000000
--- a/arch/x86/include/asm/atomic_32.h
+++ /dev/null
@@ -1,415 +0,0 @@
-#ifndef _ASM_X86_ATOMIC_32_H
-#define _ASM_X86_ATOMIC_32_H
-
-#include <linux/compiler.h>
-#include <linux/types.h>
-#include <asm/processor.h>
-#include <asm/cmpxchg.h>
-
-/*
- * Atomic operations that C can't guarantee us. Useful for
- * resource counting etc..
- */
-
-#define ATOMIC_INIT(i) { (i) }
-
-/**
- * atomic_read - read atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically reads the value of @v.
- */
-static inline int atomic_read(const atomic_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic_set - set atomic variable
- * @v: pointer of type atomic_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-/**
- * atomic_sub - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "subl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
-}
-
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic_inc(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "incl %0"
- : "+m" (v->counter));
-}
-
-/**
- * atomic_dec - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic_dec(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "decl %0"
- : "+m" (v->counter));
-}
-
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic_dec_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decl %0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : : "memory");
- return c != 0;
-}
-
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_inc_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incl %0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : : "memory");
- return c != 0;
-}
-
-/**
- * atomic_add_negative - add and test if negative
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic_add_negative(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
- : "+m" (v->counter), "=qm" (c)
- : "ir" (i) : "memory");
- return c;
-}
-
-/**
- * atomic_add_return - add integer and return
- * @v: pointer of type atomic_t
- * @i: integer value to add
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int __i;
-#ifdef CONFIG_M386
- unsigned long flags;
- if (unlikely(boot_cpu_data.x86 <= 3))
- goto no_xadd;
-#endif
- /* Modern 486+ processor */
- __i = i;
- asm volatile(LOCK_PREFIX "xaddl %0, %1"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-
-#ifdef CONFIG_M386
-no_xadd: /* Legacy 386 processor */
- local_irq_save(flags);
- __i = atomic_read(v);
- atomic_set(v, i + __i);
- local_irq_restore(flags);
- return i + __i;
-#endif
-}
-
-/**
- * atomic_sub_return - subtract integer and return
- * @v: pointer of type atomic_t
- * @i: integer value to subtract
- *
- * Atomically subtracts @i from @v and returns @v - @i
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- return atomic_add_return(-i, v);
-}
-
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline int atomic_xchg(atomic_t *v, int new)
-{
- return xchg(&v->counter, new);
-}
-
-/**
- * atomic_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
-{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" (mask), "m" (*(addr)) : "memory")
-
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
-/* An 64bit atomic type */
-
-typedef struct {
- u64 __aligned(8) counter;
-} atomic64_t;
-
-#define ATOMIC64_INIT(val) { (val) }
-
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
-
-/**
- * atomic64_read - read atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically reads the value of @ptr and returns it.
- */
-static inline u64 atomic64_read(atomic64_t *ptr)
-{
- u64 res;
-
- /*
- * Note, we inline this atomic64_t primitive because
- * it only clobbers EAX/EDX and leaves the others
- * untouched. We also (somewhat subtly) rely on the
- * fact that cmpxchg8b returns the current 64-bit value
- * of the memory location we are touching:
- */
- asm volatile(
- "mov %%ebx, %%eax\n\t"
- "mov %%ecx, %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "=&A" (res)
- : "m" (*ptr)
- );
-
- return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
-
-/**
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
-
-/*
- * Other variants with different arithmetic operators:
- */
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-extern void atomic64_inc(atomic64_t *ptr);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-extern void atomic64_dec(atomic64_t *ptr);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-extern int atomic64_dec_and_test(atomic64_t *ptr);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_inc_and_test(atomic64_t *ptr);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
-
-#include <asm-generic/atomic-long.h>
-#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
deleted file mode 100644
index d605dc268e79..000000000000
--- a/arch/x86/include/asm/atomic_64.h
+++ /dev/null
@@ -1,485 +0,0 @@
-#ifndef _ASM_X86_ATOMIC_64_H
-#define _ASM_X86_ATOMIC_64_H
-
-#include <linux/types.h>
-#include <asm/alternative.h>
-#include <asm/cmpxchg.h>
-
-/*
- * Atomic operations that C can't guarantee us. Useful for
- * resource counting etc..
- */
-
-#define ATOMIC_INIT(i) { (i) }
-
-/**
- * atomic_read - read atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically reads the value of @v.
- */
-static inline int atomic_read(const atomic_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic_set - set atomic variable
- * @v: pointer of type atomic_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "=m" (v->counter)
- : "ir" (i), "m" (v->counter));
-}
-
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "subl %1,%0"
- : "=m" (v->counter)
- : "ir" (i), "m" (v->counter));
-}
-
-/**
- * atomic_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "ir" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic_inc - increment atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic_inc(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "incl %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic_dec - decrement atomic variable
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic_dec(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "decl %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic_dec_and_test - decrement and test
- * @v: pointer of type atomic_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic_dec_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decl %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic_inc_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incl %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic_add_negative(int i, atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
- : "=m" (v->counter), "=qm" (c)
- : "ir" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic_add_return - add and return
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int __i = i;
- asm volatile(LOCK_PREFIX "xaddl %0, %1"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- return atomic_add_return(-i, v);
-}
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
-/* The 64-bit atomic type */
-
-#define ATOMIC64_INIT(i) { (i) }
-
-/**
- * atomic64_read - read atomic64 variable
- * @v: pointer of type atomic64_t
- *
- * Atomically reads the value of @v.
- * Doesn't imply a read memory barrier.
- */
-static inline long atomic64_read(const atomic64_t *v)
-{
- return v->counter;
-}
-
-/**
- * atomic64_set - set atomic64 variable
- * @v: pointer to type atomic64_t
- * @i: required value
- *
- * Atomically sets the value of @v to @i.
- */
-static inline void atomic64_set(atomic64_t *v, long i)
-{
- v->counter = i;
-}
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic64_add(long i, atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "addq %1,%0"
- : "=m" (v->counter)
- : "er" (i), "m" (v->counter));
-}
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic64_sub(long i, atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "subq %1,%0"
- : "=m" (v->counter)
- : "er" (i), "m" (v->counter));
-}
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic64_sub_and_test(long i, atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "er" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1.
- */
-static inline void atomic64_inc(atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "incq %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1.
- */
-static inline void atomic64_dec(atomic64_t *v)
-{
- asm volatile(LOCK_PREFIX "decq %0"
- : "=m" (v->counter)
- : "m" (v->counter));
-}
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int atomic64_dec_and_test(atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decq %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic64_inc_and_test - increment and test
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int atomic64_inc_and_test(atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "incq %0; sete %1"
- : "=m" (v->counter), "=qm" (c)
- : "m" (v->counter) : "memory");
- return c != 0;
-}
-
-/**
- * atomic64_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int atomic64_add_negative(long i, atomic64_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
- : "=m" (v->counter), "=qm" (c)
- : "er" (i), "m" (v->counter) : "memory");
- return c;
-}
-
-/**
- * atomic64_add_return - add and return
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns @i + @v
- */
-static inline long atomic64_add_return(long i, atomic64_t *v)
-{
- long __i = i;
- asm volatile(LOCK_PREFIX "xaddq %0, %1;"
- : "+r" (i), "+m" (v->counter)
- : : "memory");
- return i + __i;
-}
-
-static inline long atomic64_sub_return(long i, atomic64_t *v)
-{
- return atomic64_add_return(-i, v);
-}
-
-#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
-#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
-
-static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline long atomic64_xchg(atomic64_t *v, long new)
-{
- return xchg(&v->counter, new);
-}
-
-static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-static inline long atomic_xchg(atomic_t *v, int new)
-{
- return xchg(&v->counter, new);
-}
-
-/**
- * atomic_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
-{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
-
-/**
- * atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
- */
-static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
-{
- long c, old;
- c = atomic64_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic64_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
-}
-
-/**
- * atomic_inc_short - increment of a short integer
- * @v: pointer to type int
- *
- * Atomically adds 1 to @v
- * Returns the new value of @u
- */
-static inline short int atomic_inc_short(short int *v)
-{
- asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
- return *v;
-}
-
-/**
- * atomic_or_long - OR of two long integers
- * @v1: pointer to type unsigned long
- * @v2: pointer to type unsigned long
- *
- * Atomically ORs @v1 and @v2
- * Returns the result of the OR
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
- asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
-}
-
-#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" ((unsigned)(mask)), "m" (*(addr)) \
- : "memory")
-
-/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic_dec() barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
-#include <asm-generic/atomic-long.h>
-#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..f654d1bb17fb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
".popsection" \
: : "i" (__FILE__), "i" (__LINE__), \
"i" (sizeof(struct bug_entry))); \
- for (;;) ; \
+ unreachable(); \
} while (0)
#else
#define BUG() \
do { \
asm volatile("ud2"); \
- for (;;) ; \
+ unreachable(); \
} while (0)
#endif
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
+
#ifdef CONFIG_X86_VSMP
-/* vSMP Internode cacheline shift */
-#define INTERNODE_CACHE_SHIFT (12)
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
- __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+ __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
__page_aligned_data
#endif
#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end) { }
static inline void flush_cache_page(struct vm_area_struct *vma,
unsigned long vmaddr, unsigned long pfn) { }
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
static inline void flush_dcache_page(struct page *page) { }
static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
#ifdef CONFIG_DEBUG_RODATA
void mark_rodata_ro(void);
extern const int rodata_test_data;
+extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
#else
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa7..0918654305af 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
extern int use_calgary;
#ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
extern void detect_calgary(void);
#else
-static inline int calgary_iommu_init(void) { return 1; }
static inline void detect_calgary(void) { return; }
#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index 7c5ef8b14d92..46fc474fd819 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -161,7 +161,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
"adcl $0, %0 ;\n"
: "=&r" (sum)
: "r" (saddr), "r" (daddr),
- "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)
+ : "memory");
return csum_fold(sum);
}
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 82ceb788a981..ffb9bb6b6c37 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
* you need to test for the feature in boot_cpu_data.
*/
-#define xchg(ptr, v) \
- ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
+extern void __xchg_wrong_size(void);
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
struct __xchg_dummy {
unsigned long a[100];
};
#define __xg(x) ((struct __xchg_dummy *)(x))
+#define __xchg(x, ptr, size) \
+({ \
+ __typeof(*(ptr)) __x = (x); \
+ switch (size) { \
+ case 1: \
+ asm volatile("xchgb %b0,%1" \
+ : "=q" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile("xchgw %w0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile("xchgl %0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ default: \
+ __xchg_wrong_size(); \
+ } \
+ __x; \
+})
+
+#define xchg(ptr, v) \
+ __xchg((v), (ptr), sizeof(*ptr))
+
/*
* The semantics of XCHGCMP8B are a bit strange, this is why
* there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
(unsigned int)((value) >> 32)) \
: __set_64bit(ptr, ll_low((value)), ll_high((value))))
-/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
- */
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
-{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
-}
+extern void __cmpxchg_wrong_size(void);
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case 1: \
+ asm volatile(lock "cmpxchgb %b1,%2" \
+ : "=a"(__ret) \
+ : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile(lock "cmpxchgw %w1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile(lock "cmpxchgl %1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
#ifdef CONFIG_X86_CMPXCHG
#define __HAVE_ARCH_CMPXCHG 1
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
+
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
#endif
#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
(unsigned long long)(n)))
#endif
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
-
static inline unsigned long long __cmpxchg64(volatile void *ptr,
unsigned long long old,
unsigned long long new)
@@ -312,19 +266,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
-#define cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
+#define cmpxchg64(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io("call cmpxchg8b_emu", \
+ "lock; cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
+
+
+
#define cmpxchg64_local(ptr, o, n) \
({ \
__typeof__(*(ptr)) __ret; \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..485ae415faec 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
- (ptr), sizeof(*(ptr))))
-
#define __xg(x) ((volatile long *)(x))
static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
#define _set_64bit set_64bit
+extern void __xchg_wrong_size(void);
+extern void __cmpxchg_wrong_size(void);
+
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
* Note 2: xchg has side effect, so that attribute volatile is necessary,
* but generally the primitive is invalid, *ptr is output argument. --ANK
*/
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
-{
- switch (size) {
- case 1:
- asm volatile("xchgb %b0,%1"
- : "=q" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 2:
- asm volatile("xchgw %w0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 4:
- asm volatile("xchgl %k0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- case 8:
- asm volatile("xchgq %0,%1"
- : "=r" (x)
- : "m" (*__xg(ptr)), "0" (x)
- : "memory");
- break;
- }
- return x;
-}
+#define __xchg(x, ptr, size) \
+({ \
+ __typeof(*(ptr)) __x = (x); \
+ switch (size) { \
+ case 1: \
+ asm volatile("xchgb %b0,%1" \
+ : "=q" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile("xchgw %w0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile("xchgl %k0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm volatile("xchgq %0,%1" \
+ : "=r" (__x) \
+ : "m" (*__xg(ptr)), "0" (__x) \
+ : "memory"); \
+ break; \
+ default: \
+ __xchg_wrong_size(); \
+ } \
+ __x; \
+})
+
+#define xchg(ptr, v) \
+ __xchg((v), (ptr), sizeof(*ptr))
+
+#define __HAVE_ARCH_CMPXCHG 1
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case 1: \
+ asm volatile(lock "cmpxchgb %b1,%2" \
+ : "=a"(__ret) \
+ : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile(lock "cmpxchgw %w1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile(lock "cmpxchgl %k1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm volatile(lock "cmpxchgq %1,%2" \
+ : "=a"(__ret) \
+ : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ : "memory"); \
+ break; \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
-#define __HAVE_ARCH_CMPXCHG 1
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
-/*
- * Always use locked operations when touching memory shared with a
- * hypervisor, since the system may be SMP even if the guest kernel
- * isn't.
- */
-static inline unsigned long __sync_cmpxchg(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("lock; cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("lock; cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("lock; cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
- unsigned long old,
- unsigned long new, int size)
-{
- unsigned long prev;
- switch (size) {
- case 1:
- asm volatile("cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
- asm volatile("cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
- asm volatile("cmpxchgl %k1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 8:
- asm volatile("cmpxchgq %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- }
- return old;
-}
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
+
+#define cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
-#define cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), sizeof(*(ptr))))
#define cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
cmpxchg((ptr), (o), (n)); \
})
-#define cmpxchg_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
-#define sync_cmpxchg(ptr, o, n) \
- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
- (unsigned long)(n), \
- sizeof(*(ptr))))
+
#define cmpxchg64_local(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9a9c7bdc923d..306160e58b48 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -8,7 +8,8 @@
#include <linux/sched.h>
#include <asm/user32.h>
-#define COMPAT_USER_HZ 100
+#define COMPAT_USER_HZ 100
+#define COMPAT_UTS_MACHINE "i686\0\0"
typedef u32 compat_size_t;
typedef s32 compat_ssize_t;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
deleted file mode 100644
index d96c1ee3a95c..000000000000
--- a/arch/x86/include/asm/cpu_debug.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef _ASM_X86_CPU_DEBUG_H
-#define _ASM_X86_CPU_DEBUG_H
-
-/*
- * CPU x86 architecture debug
- *
- * Copyright(C) 2009 Jaswinder Singh Rajput
- */
-
-/* Register flags */
-enum cpu_debug_bit {
-/* Model Specific Registers (MSRs) */
- CPU_MC_BIT, /* Machine Check */
- CPU_MONITOR_BIT, /* Monitor */
- CPU_TIME_BIT, /* Time */
- CPU_PMC_BIT, /* Performance Monitor */
- CPU_PLATFORM_BIT, /* Platform */
- CPU_APIC_BIT, /* APIC */
- CPU_POWERON_BIT, /* Power-on */
- CPU_CONTROL_BIT, /* Control */
- CPU_FEATURES_BIT, /* Features control */
- CPU_LBRANCH_BIT, /* Last Branch */
- CPU_BIOS_BIT, /* BIOS */
- CPU_FREQ_BIT, /* Frequency */
- CPU_MTTR_BIT, /* MTRR */
- CPU_PERF_BIT, /* Performance */
- CPU_CACHE_BIT, /* Cache */
- CPU_SYSENTER_BIT, /* Sysenter */
- CPU_THERM_BIT, /* Thermal */
- CPU_MISC_BIT, /* Miscellaneous */
- CPU_DEBUG_BIT, /* Debug */
- CPU_PAT_BIT, /* PAT */
- CPU_VMX_BIT, /* VMX */
- CPU_CALL_BIT, /* System Call */
- CPU_BASE_BIT, /* BASE Address */
- CPU_VER_BIT, /* Version ID */
- CPU_CONF_BIT, /* Configuration */
- CPU_SMM_BIT, /* System mgmt mode */
- CPU_SVM_BIT, /*Secure Virtual Machine*/
- CPU_OSVM_BIT, /* OS-Visible Workaround*/
-/* Standard Registers */
- CPU_TSS_BIT, /* Task Stack Segment */
- CPU_CR_BIT, /* Control Registers */
- CPU_DT_BIT, /* Descriptor Table */
-/* End of Registers flags */
- CPU_REG_ALL_BIT, /* Select all Registers */
-};
-
-#define CPU_REG_ALL (~0) /* Select all Registers */
-
-#define CPU_MC (1 << CPU_MC_BIT)
-#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
-#define CPU_TIME (1 << CPU_TIME_BIT)
-#define CPU_PMC (1 << CPU_PMC_BIT)
-#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
-#define CPU_APIC (1 << CPU_APIC_BIT)
-#define CPU_POWERON (1 << CPU_POWERON_BIT)
-#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
-#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
-#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
-#define CPU_BIOS (1 << CPU_BIOS_BIT)
-#define CPU_FREQ (1 << CPU_FREQ_BIT)
-#define CPU_MTRR (1 << CPU_MTTR_BIT)
-#define CPU_PERF (1 << CPU_PERF_BIT)
-#define CPU_CACHE (1 << CPU_CACHE_BIT)
-#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
-#define CPU_THERM (1 << CPU_THERM_BIT)
-#define CPU_MISC (1 << CPU_MISC_BIT)
-#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
-#define CPU_PAT (1 << CPU_PAT_BIT)
-#define CPU_VMX (1 << CPU_VMX_BIT)
-#define CPU_CALL (1 << CPU_CALL_BIT)
-#define CPU_BASE (1 << CPU_BASE_BIT)
-#define CPU_VER (1 << CPU_VER_BIT)
-#define CPU_CONF (1 << CPU_CONF_BIT)
-#define CPU_SMM (1 << CPU_SMM_BIT)
-#define CPU_SVM (1 << CPU_SVM_BIT)
-#define CPU_OSVM (1 << CPU_OSVM_BIT)
-#define CPU_TSS (1 << CPU_TSS_BIT)
-#define CPU_CR (1 << CPU_CR_BIT)
-#define CPU_DT (1 << CPU_DT_BIT)
-
-/* Register file flags */
-enum cpu_file_bit {
- CPU_INDEX_BIT, /* index */
- CPU_VALUE_BIT, /* value */
-};
-
-#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
-
-#define MAX_CPU_FILES 512
-
-struct cpu_private {
- unsigned cpu;
- unsigned type;
- unsigned reg;
- unsigned file;
-};
-
-struct cpu_debug_base {
- char *name; /* Register name */
- unsigned flag; /* Register flag */
- unsigned write; /* Register write flag */
-};
-
-/*
- * Currently it looks similar to cpu_debug_base but once we add more files
- * cpu_file_base will go in different direction
- */
-struct cpu_file_base {
- char *name; /* Register file name */
- unsigned flag; /* Register file flag */
- unsigned write; /* Register write flag */
-};
-
-struct cpu_cpuX_base {
- struct dentry *dentry; /* Register dentry */
- int init; /* Register index file */
-};
-
-struct cpu_debug_range {
- unsigned min; /* Register range min */
- unsigned max; /* Register range max */
- unsigned flag; /* Supported flags */
-};
-
-#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..0cd82d068613 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -153,6 +153,7 @@
#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -167,6 +168,10 @@
#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -248,6 +253,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
+#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..b81002f23614 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -14,10 +14,14 @@
which debugging register was responsible for the trap. The other bits
are either reserved or not of interest to us. */
+/* Define reserved bits in DR6 which are always set to 1 */
+#define DR6_RESERVED (0xFFFF0FF0)
+
#define DR_TRAP0 (0x1) /* db0 */
#define DR_TRAP1 (0x2) /* db1 */
#define DR_TRAP2 (0x4) /* db2 */
#define DR_TRAP3 (0x8) /* db3 */
+#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
#define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */
@@ -49,6 +53,8 @@
#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
@@ -67,4 +73,34 @@
#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+ /* Zero the control register for HW Breakpoint */
+ set_debugreg(0UL, 7);
+
+ /* Zero-out the individual HW breakpoint address registers */
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+ return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e8de2f6f5ca5..617bd56b3070 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc)
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
- return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+ return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
}
static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
#include <linux/types.h>
/*
- * FIXME: Acessing the desc_struct through its fields is more elegant,
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
* and should be the one valid thing to do. However, a lot of open code
- * still touches the a and b acessors, and doing this allow us to do it
+ * still touches the a and b accessors, and doing this allow us to do it
* incrementally. We keep the signature as a struct, rather than an union,
* so we can get rid of it transparently in the future -- glommer
*/
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index cee34e9ca45b..029f230ab637 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
#ifdef CONFIG_X86_64
struct dma_map_ops *dma_ops;
#endif
-#ifdef CONFIG_DMAR
+#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
void *iommu; /* hook for IOMMU specific extension */
#endif
};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0ee770d23d0e..ac91eed21061 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -14,7 +14,14 @@
#include <asm/swiotlb.h>
#include <asm-generic/dma-coherent.h>
-extern dma_addr_t bad_dma_address;
+#ifdef CONFIG_ISA
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
+#else
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+#endif
+
+#define DMA_ERROR_CODE 0
+
extern int iommu_merge;
extern struct device x86_dma_fallback_dev;
extern int panic_on_overflow;
@@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
- return (dma_addr == bad_dma_address);
+ return (dma_addr == DMA_ERROR_CODE);
}
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -60,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
if (!dev->dma_mask)
return 0;
- return addr + size <= *dev->dma_mask;
+ return addr + size - 1 <= *dev->dma_mask;
}
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
@@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
return memory;
- if (!dev) {
+ if (!dev)
dev = &x86_dma_fallback_dev;
- gfp |= GFP_DMA;
- }
if (!is_device_dma_capable(dev))
return NULL;
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..0e22296790d3 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
struct e820entry map[E820_X_MAX];
};
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
#ifdef __KERNEL__
/* see comment in arch/x86/kernel/e820.c */
extern struct e820map e820;
@@ -105,11 +111,8 @@ extern unsigned long end_user_pfn;
extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern void reserve_early(u64 start, u64 end, char *name);
-extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
-extern void free_early(u64 start, u64 end);
-extern void early_res_to_bootmem(u64 start, u64 end);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+#include <linux/early_res.h>
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
@@ -126,15 +129,18 @@ extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
-#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
-#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
+/*
+ * Returns true iff the specified range [s,e) is completely contained inside
+ * the ISA region.
+ */
+static inline bool is_ISA_range(u64 s, u64 e)
+{
+ return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
+}
-#define BIOS_BEGIN 0x000a0000
-#define BIOS_END 0x00100000
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
#ifdef __KERNEL__
#include <linux/ioport.h>
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..f2ad2163109d 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
-static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
-{
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
- load_gs_index(0);
- regs->ip = ip;
- regs->sp = sp;
- regs->flags = X86_EFLAGS_IF;
- regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
-}
-
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
@@ -183,28 +170,16 @@ static inline void elf_common_init(struct thread_struct *t,
}
#define ELF_PLAT_INIT(_r, load_addr) \
-do { \
- elf_common_init(&current->thread, _r, 0); \
- clear_thread_flag(TIF_IA32); \
-} while (0)
+ elf_common_init(&current->thread, _r, 0)
#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
elf_common_init(&current->thread, regs, __USER_DS)
-#define compat_start_thread(regs, ip, sp) \
-do { \
- start_ia32_thread(regs, ip, sp); \
- set_fs(USER_DS); \
-} while (0)
+void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread start_thread_ia32
-#define COMPAT_SET_PERSONALITY(ex) \
-do { \
- if (test_thread_flag(TIF_IA32)) \
- clear_thread_flag(TIF_ABI_PENDING); \
- else \
- set_thread_flag(TIF_ABI_PENDING); \
- current->personality |= force_personality32; \
-} while (0)
+void set_personality_ia32(void);
+#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32()
#define COMPAT_ELF_PLATFORM ("i686")
@@ -255,7 +230,6 @@ extern int force_personality32;
#endif /* !CONFIG_X86_32 */
#define CORE_DUMP_USE_REGSET
-#define USE_ELF_CORE_DUMP
#define ELF_EXEC_PAGESIZE 4096
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
smp_invalidate_interrupt)
#endif
-BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
/*
* every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 53018464aea6..2519d0679d99 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
}
-#ifdef CONFIG_X86_32
extern int fb_is_primary_device(struct fb_info *info);
-#else
-static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
-#endif
#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 14f9890eb495..635f03bb4995 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -118,14 +118,20 @@ enum fixed_addresses {
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * We round it up to the next 256 pages boundary so that we
- * can have a single pgd entry and a single pte table:
+ * If necessary we round it up to the next 256 pages boundary so
+ * that we can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
#define FIX_BTMAPS_SLOTS 4
- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
- (__end_of_permanent_fixed_addresses & 255),
- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
+ FIX_BTMAP_END =
+ (__end_of_permanent_fixed_addresses ^
+ (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
+ -PTRS_PER_PTE
+ ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
+ (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
+ : __end_of_permanent_fixed_addresses,
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
FIX_OHCI1394_BASE,
#endif
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..4ac5b0f33fc1 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
+extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
extern void gart_iommu_hole_init(void);
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
static inline void early_gart_iommu_check(void)
{
}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
static inline void gart_parse_options(char *options)
{
}
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
index ad3c2ed75481..7cd73552a4e8 100644
--- a/arch/x86/include/asm/geode.h
+++ b/arch/x86/include/asm/geode.h
@@ -12,160 +12,7 @@
#include <asm/processor.h>
#include <linux/io.h>
-
-/* Generic southbridge functions */
-
-#define GEODE_DEV_PMS 0
-#define GEODE_DEV_ACPI 1
-#define GEODE_DEV_GPIO 2
-#define GEODE_DEV_MFGPT 3
-
-extern int geode_get_dev_base(unsigned int dev);
-
-/* Useful macros */
-#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
-#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
-#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
-#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
-
-/* MSRS */
-
-#define MSR_GLIU_P2D_RO0 0x10000029
-
-#define MSR_LX_GLD_MSR_CONFIG 0x48002001
-#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
- * sheet has the wrong value */
-#define MSR_GLCP_SYS_RSTPLL 0x4C000014
-#define MSR_GLCP_DOTPLL 0x4C000015
-
-#define MSR_LBAR_SMB 0x5140000B
-#define MSR_LBAR_GPIO 0x5140000C
-#define MSR_LBAR_MFGPT 0x5140000D
-#define MSR_LBAR_ACPI 0x5140000E
-#define MSR_LBAR_PMS 0x5140000F
-
-#define MSR_DIVIL_SOFT_RESET 0x51400017
-
-#define MSR_PIC_YSEL_LOW 0x51400020
-#define MSR_PIC_YSEL_HIGH 0x51400021
-#define MSR_PIC_ZSEL_LOW 0x51400022
-#define MSR_PIC_ZSEL_HIGH 0x51400023
-#define MSR_PIC_IRQM_LPC 0x51400025
-
-#define MSR_MFGPT_IRQ 0x51400028
-#define MSR_MFGPT_NR 0x51400029
-#define MSR_MFGPT_SETUP 0x5140002B
-
-#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
-
-#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
-#define MSR_GX_MSR_PADSEL 0xC0002011
-
-/* Resource Sizes */
-
-#define LBAR_GPIO_SIZE 0xFF
-#define LBAR_MFGPT_SIZE 0x40
-#define LBAR_ACPI_SIZE 0x40
-#define LBAR_PMS_SIZE 0x80
-
-/* ACPI registers (PMS block) */
-
-/*
- * PM1_EN is only valid when VSA is enabled for 16 bit reads.
- * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
- * with a 32 bit read at offset 0x0
- */
-
-#define PM1_STS 0x00
-#define PM1_EN 0x02
-#define PM1_CNT 0x08
-#define PM2_CNT 0x0C
-#define PM_TMR 0x10
-#define PM_GPE0_STS 0x18
-#define PM_GPE0_EN 0x1C
-
-/* PMC registers (PMS block) */
-
-#define PM_SSD 0x00
-#define PM_SCXA 0x04
-#define PM_SCYA 0x08
-#define PM_OUT_SLPCTL 0x0C
-#define PM_SCLK 0x10
-#define PM_SED 0x1
-#define PM_SCXD 0x18
-#define PM_SCYD 0x1C
-#define PM_IN_SLPCTL 0x20
-#define PM_WKD 0x30
-#define PM_WKXD 0x34
-#define PM_RD 0x38
-#define PM_WKXA 0x3C
-#define PM_FSD 0x40
-#define PM_TSD 0x44
-#define PM_PSD 0x48
-#define PM_NWKD 0x4C
-#define PM_AWKD 0x50
-#define PM_SSC 0x54
-
-/* VSA2 magic values */
-
-#define VSA_VRC_INDEX 0xAC1C
-#define VSA_VRC_DATA 0xAC1E
-#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
-#define VSA_VR_SIGNATURE 0x0003
-#define VSA_VR_MEM_SIZE 0x0200
-#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
-#define GSW_VSA_SIG 0x534d /* General Software signature */
-/* GPIO */
-
-#define GPIO_OUTPUT_VAL 0x00
-#define GPIO_OUTPUT_ENABLE 0x04
-#define GPIO_OUTPUT_OPEN_DRAIN 0x08
-#define GPIO_OUTPUT_INVERT 0x0C
-#define GPIO_OUTPUT_AUX1 0x10
-#define GPIO_OUTPUT_AUX2 0x14
-#define GPIO_PULL_UP 0x18
-#define GPIO_PULL_DOWN 0x1C
-#define GPIO_INPUT_ENABLE 0x20
-#define GPIO_INPUT_INVERT 0x24
-#define GPIO_INPUT_FILTER 0x28
-#define GPIO_INPUT_EVENT_COUNT 0x2C
-#define GPIO_READ_BACK 0x30
-#define GPIO_INPUT_AUX1 0x34
-#define GPIO_EVENTS_ENABLE 0x38
-#define GPIO_LOCK_ENABLE 0x3C
-#define GPIO_POSITIVE_EDGE_EN 0x40
-#define GPIO_NEGATIVE_EDGE_EN 0x44
-#define GPIO_POSITIVE_EDGE_STS 0x48
-#define GPIO_NEGATIVE_EDGE_STS 0x4C
-
-#define GPIO_MAP_X 0xE0
-#define GPIO_MAP_Y 0xE4
-#define GPIO_MAP_Z 0xE8
-#define GPIO_MAP_W 0xEC
-
-static inline u32 geode_gpio(unsigned int nr)
-{
- BUG_ON(nr > 28);
- return 1 << nr;
-}
-
-extern void geode_gpio_set(u32, unsigned int);
-extern void geode_gpio_clear(u32, unsigned int);
-extern int geode_gpio_isset(u32, unsigned int);
-extern void geode_gpio_setup_event(unsigned int, int, int);
-extern void geode_gpio_set_irq(unsigned int, unsigned int);
-
-static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
-{
- geode_gpio_setup_event(gpio, pair, 0);
-}
-
-static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
-{
- geode_gpio_setup_event(gpio, pair, 1);
-}
-
-/* Specific geode tests */
+#include <linux/cs5535.h>
static inline int is_geode_gx(void)
{
@@ -186,68 +33,4 @@ static inline int is_geode(void)
return (is_geode_gx() || is_geode_lx());
}
-#ifdef CONFIG_MGEODE_LX
-extern int geode_has_vsa2(void);
-#else
-static inline int geode_has_vsa2(void)
-{
- return 0;
-}
-#endif
-
-/* MFGPTs */
-
-#define MFGPT_MAX_TIMERS 8
-#define MFGPT_TIMER_ANY (-1)
-
-#define MFGPT_DOMAIN_WORKING 1
-#define MFGPT_DOMAIN_STANDBY 2
-#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
-
-#define MFGPT_CMP1 0
-#define MFGPT_CMP2 1
-
-#define MFGPT_EVENT_IRQ 0
-#define MFGPT_EVENT_NMI 1
-#define MFGPT_EVENT_RESET 3
-
-#define MFGPT_REG_CMP1 0
-#define MFGPT_REG_CMP2 2
-#define MFGPT_REG_COUNTER 4
-#define MFGPT_REG_SETUP 6
-
-#define MFGPT_SETUP_CNTEN (1 << 15)
-#define MFGPT_SETUP_CMP2 (1 << 14)
-#define MFGPT_SETUP_CMP1 (1 << 13)
-#define MFGPT_SETUP_SETUP (1 << 12)
-#define MFGPT_SETUP_STOPEN (1 << 11)
-#define MFGPT_SETUP_EXTEN (1 << 10)
-#define MFGPT_SETUP_REVEN (1 << 5)
-#define MFGPT_SETUP_CLKSEL (1 << 4)
-
-static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
- outw(value, base + reg + (timer * 8));
-}
-
-static inline u16 geode_mfgpt_read(int timer, u16 reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
- return inw(base + reg + (timer * 8));
-}
-
-extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
-extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
-extern int geode_mfgpt_alloc_timer(int timer, int domain);
-
-#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
-#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
-
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-extern int __init mfgpt_timer_setup(void);
-#else
-static inline int mfgpt_timer_setup(void) { return 0; }
-#endif
-
#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
#endif
- unsigned int generic_irqs; /* arch dependent */
+ unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
unsigned int apic_pending_irqs;
#ifdef CONFIG_SMP
@@ -20,11 +20,11 @@ typedef struct {
unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
-# endif
#endif
} ____cacheline_aligned irq_cpustat_t;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..a726650fc80f 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
-#ifndef CONFIG_PARAVIRT
-#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
-#endif
-
#define flush_cache_kmaps() do { } while (0)
extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..1d5c08a1bdfd 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,13 @@
/* hpet memory map physical address */
extern unsigned long hpet_address;
extern unsigned long force_hpet_address;
+extern u8 hpet_blockid;
extern int hpet_force_user;
+extern u8 hpet_msi_disable;
extern int is_hpet_enabled(void);
extern int hpet_enable(void);
extern void hpet_disable(void);
-extern unsigned long hpet_readl(unsigned long a);
+extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +80,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
#ifdef CONFIG_PCI_MSI
-extern int arch_setup_hpet_msi(unsigned int irq);
+extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
#else
-static inline int arch_setup_hpet_msi(unsigned int irq)
+static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
{
return -EINVAL;
}
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..2a1bd8f4f23a
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,72 @@
+#ifndef _I386_HW_BREAKPOINT_H
+#define _I386_HW_BREAKPOINT_H
+
+#ifdef __KERNEL__
+#define __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+ unsigned long address;
+ u8 len;
+ u8 type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1 0x40
+#define X86_BREAKPOINT_LEN_2 0x44
+#define X86_BREAKPOINT_LEN_4 0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE 0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8 0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE 0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW 0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+ struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif /* __KERNEL__ */
+#endif /* _I386_HW_BREAKPOINT_H */
+
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08c..a929c9ede33d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
/* Interrupt handlers registered during init_IRQ */
extern void apic_timer_interrupt(void);
-extern void generic_interrupt(void);
+extern void x86_platform_ipi(void);
extern void error_interrupt(void);
extern void perf_pending_interrupt(void);
@@ -53,13 +53,6 @@ extern void threshold_interrupt(void);
extern void call_function_interrupt(void);
extern void call_function_single_interrupt(void);
-/* PIC specific functions */
-extern void disable_8259A_irq(unsigned int irq);
-extern void enable_8259A_irq(unsigned int irq);
-extern int i8259A_irq_pending(unsigned int irq);
-extern void make_8259A_irq(unsigned int irq);
-extern void init_8259A(int aeoi);
-
/* IOAPIC */
#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
extern unsigned long io_apic_irqs;
@@ -79,14 +72,33 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
int ioapic, int ioapic_pin,
int trigger, int polarity)
{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
+ irq_attr->ioapic = ioapic;
+ irq_attr->ioapic_pin = ioapic_pin;
+ irq_attr->trigger = trigger;
+ irq_attr->polarity = polarity;
}
-extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
- struct io_apic_irq_attr *irq_attr);
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * Most irqs are mapped 1:1 with pins.
+ */
+struct irq_cfg {
+ struct irq_pin_list *irq_2_pin;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
+ u8 vector;
+ u8 move_in_progress : 1;
+};
+
+extern struct irq_cfg *irq_cfg(unsigned int);
+extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
+extern void send_cleanup_vector(struct irq_cfg *);
+
+struct irq_desc;
+extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
+ unsigned int *dest_id);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
extern void enable_IO_APIC(void);
@@ -101,7 +113,7 @@ extern void eisa_set_level_irq(unsigned int irq);
/* SMP */
extern void smp_apic_timer_interrupt(struct pt_regs *);
extern void smp_spurious_interrupt(struct pt_regs *);
-extern void smp_generic_interrupt(struct pt_regs *);
+extern void smp_x86_platform_ipi(struct pt_regs *);
extern void smp_error_interrupt(struct pt_regs *);
#ifdef CONFIG_X86_IO_APIC
extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 000000000000..e153a2b3889a
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
+#ifndef _ASM_X86_KVM_HYPERV_H
+#define _ASM_X86_KVM_HYPERV_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Declare the various hypercall operations. */
+#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..da2930924501 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
#ifndef _ASM_X86_I387_H
#define _ASM_X86_I387_H
+#ifndef __ASSEMBLY__
+
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/regset.h>
@@ -31,8 +33,16 @@ extern void init_thread_xstate(void);
extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+ xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+ xstateregs_set;
+
+/*
+ * xstateregs_active == fpregs_active. Please refer to the comment
+ * at the definition of fpregs_active.
+ */
+#define xstateregs_active fpregs_active
extern struct _fpx_sw_bytes fx_sw_reserved;
#ifdef CONFIG_IA32_EMULATION
@@ -411,4 +421,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
}
}
+#endif /* __ASSEMBLY__ */
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..1655147646aa 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -24,12 +24,7 @@ extern unsigned int cached_irq_mask;
#define SLAVE_ICW4_DEFAULT 0x01
#define PIC_ICW4_AEOI 2
-extern spinlock_t i8259A_lock;
-
-extern void init_8259A(int auto_eoi);
-extern void enable_8259A_irq(unsigned int irq);
-extern void disable_8259A_irq(unsigned int irq);
-extern unsigned int startup_8259A_irq(unsigned int irq);
+extern raw_spinlock_t i8259A_lock;
/* the PIC may need a careful delay on some platforms, hence specific calls */
static inline unsigned char inb_pic(unsigned int port)
@@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port)
extern struct irq_chip i8259A_chip;
-extern void mask_8259A(void);
-extern void unmask_8259A(void);
+struct legacy_pic {
+ int nr_legacy_irqs;
+ struct irq_chip *chip;
+ void (*mask_all)(void);
+ void (*restore_mask)(void);
+ void (*init)(int auto_eoi);
+ int (*irq_pending)(unsigned int irq);
+ void (*make_irq)(unsigned int irq);
+};
+
+extern struct legacy_pic *legacy_pic;
+extern struct legacy_pic null_legacy_pic;
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..205b063e3e32
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK 4 /* 0xF0 */
+#define INAT_PFX_CS 5 /* 0x2E */
+#define INAT_PFX_DS 6 /* 0x3E */
+#define INAT_PFX_ES 7 /* 0x26 */
+#define INAT_PFX_FS 8 /* 0x64 */
+#define INAT_PFX_GS 9 /* 0x65 */
+#define INAT_PFX_SS 10 /* 0x36 */
+#define INAT_PFX_ADDRSZ 11 /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX 12 /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
+
+#define INAT_LSTPFX_MAX 3
+#define INAT_LGCPFX_MAX 11
+
+/* Immediate size */
+#define INAT_IMM_BYTE 1
+#define INAT_IMM_WORD 2
+#define INAT_IMM_DWORD 3
+#define INAT_IMM_QWORD 4
+#define INAT_IMM_PTR 5
+#define INAT_IMM_VWORD32 6
+#define INAT_IMM_VWORD 7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS 0
+#define INAT_PFX_BITS 4
+#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS 2
+#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS 5
+#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS 3
+#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+ insn_byte_t last_pfx,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+ insn_byte_t last_pfx,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+ insn_byte_t vex_m,
+ insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+ if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+ return 0;
+ else
+ return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+ return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+ return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+ return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+ return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+ return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+ return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+ return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+ return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+ return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+ return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+ return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+ return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXONLY;
+}
+#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..cb3c20ce39cf
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..96c2e0ad04ca
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+ union {
+ insn_value_t value;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+struct insn {
+ struct insn_field prefixes; /*
+ * Prefixes
+ * prefixes.bytes[3]: last prefix
+ */
+ struct insn_field rex_prefix; /* REX prefix */
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field opcode; /*
+ * opcode.bytes[0]: opcode1
+ * opcode.bytes[1]: opcode2
+ * opcode.bytes[2]: opcode3
+ */
+ struct insn_field modrm;
+ struct insn_field sib;
+ struct insn_field displacement;
+ union {
+ struct insn_field immediate;
+ struct insn_field moffset1; /* for 64bit MOV */
+ struct insn_field immediate1; /* for 64bit imm or off16/32 */
+ };
+ union {
+ struct insn_field moffset2; /* for 64bit MOV */
+ struct insn_field immediate2; /* for 64bit imm or seg16 */
+ };
+
+ insn_attr_t attr;
+ unsigned char opnd_bytes;
+ unsigned char addr_bytes;
+ unsigned char length;
+ unsigned char x86_64;
+
+ const insn_byte_t *kaddr; /* kernel address of insn to analyze */
+ const insn_byte_t *next_byte;
+};
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags */
+#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
+#define X86_VEX2_M 1 /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+ return insn->prefixes.bytes[3];
+}
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+ insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+ insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+ insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.value != 0);
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX2_M;
+ else
+ return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX_P(insn->vex_prefix.bytes[1]);
+ else
+ return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+ return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+ return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+ return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+ return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+ return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+ return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+ return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
+/*
+ * Generate .byte code for some instructions not supported by old
+ * binutils.
+ */
+#ifndef X86_ASM_INST_H
+#define X86_ASM_INST_H
+
+#ifdef __ASSEMBLY__
+
+ .macro XMM_NUM opd xmm
+ .ifc \xmm,%xmm0
+ \opd = 0
+ .endif
+ .ifc \xmm,%xmm1
+ \opd = 1
+ .endif
+ .ifc \xmm,%xmm2
+ \opd = 2
+ .endif
+ .ifc \xmm,%xmm3
+ \opd = 3
+ .endif
+ .ifc \xmm,%xmm4
+ \opd = 4
+ .endif
+ .ifc \xmm,%xmm5
+ \opd = 5
+ .endif
+ .ifc \xmm,%xmm6
+ \opd = 6
+ .endif
+ .ifc \xmm,%xmm7
+ \opd = 7
+ .endif
+ .ifc \xmm,%xmm8
+ \opd = 8
+ .endif
+ .ifc \xmm,%xmm9
+ \opd = 9
+ .endif
+ .ifc \xmm,%xmm10
+ \opd = 10
+ .endif
+ .ifc \xmm,%xmm11
+ \opd = 11
+ .endif
+ .ifc \xmm,%xmm12
+ \opd = 12
+ .endif
+ .ifc \xmm,%xmm13
+ \opd = 13
+ .endif
+ .ifc \xmm,%xmm14
+ \opd = 14
+ .endif
+ .ifc \xmm,%xmm15
+ \opd = 15
+ .endif
+ .endm
+
+ .macro PFX_OPD_SIZE
+ .byte 0x66
+ .endm
+
+ .macro PFX_REX opd1 opd2
+ .if (\opd1 | \opd2) & 8
+ .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
+ .endif
+ .endm
+
+ .macro MODRM mod opd1 opd2
+ .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+ .endm
+
+ .macro PSHUFB_XMM xmm1 xmm2
+ XMM_NUM pshufb_opd1 \xmm1
+ XMM_NUM pshufb_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX pshufb_opd1 pshufb_opd2
+ .byte 0x0f, 0x38, 0x00
+ MODRM 0xc0 pshufb_opd1 pshufb_opd2
+ .endm
+
+ .macro PCLMULQDQ imm8 xmm1 xmm2
+ XMM_NUM clmul_opd1 \xmm1
+ XMM_NUM clmul_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX clmul_opd1 clmul_opd2
+ .byte 0x0f, 0x3a, 0x44
+ MODRM 0xc0 clmul_opd1 clmul_opd2
+ .byte \imm8
+ .endm
+
+ .macro AESKEYGENASSIST rcon xmm1 xmm2
+ XMM_NUM aeskeygen_opd1 \xmm1
+ XMM_NUM aeskeygen_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aeskeygen_opd1 aeskeygen_opd2
+ .byte 0x0f, 0x3a, 0xdf
+ MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
+ .byte \rcon
+ .endm
+
+ .macro AESIMC xmm1 xmm2
+ XMM_NUM aesimc_opd1 \xmm1
+ XMM_NUM aesimc_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesimc_opd1 aesimc_opd2
+ .byte 0x0f, 0x38, 0xdb
+ MODRM 0xc0 aesimc_opd1 aesimc_opd2
+ .endm
+
+ .macro AESENC xmm1 xmm2
+ XMM_NUM aesenc_opd1 \xmm1
+ XMM_NUM aesenc_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesenc_opd1 aesenc_opd2
+ .byte 0x0f, 0x38, 0xdc
+ MODRM 0xc0 aesenc_opd1 aesenc_opd2
+ .endm
+
+ .macro AESENCLAST xmm1 xmm2
+ XMM_NUM aesenclast_opd1 \xmm1
+ XMM_NUM aesenclast_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesenclast_opd1 aesenclast_opd2
+ .byte 0x0f, 0x38, 0xdd
+ MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
+ .endm
+
+ .macro AESDEC xmm1 xmm2
+ XMM_NUM aesdec_opd1 \xmm1
+ XMM_NUM aesdec_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesdec_opd1 aesdec_opd2
+ .byte 0x0f, 0x38, 0xde
+ MODRM 0xc0 aesdec_opd1 aesdec_opd2
+ .endm
+
+ .macro AESDECLAST xmm1 xmm2
+ XMM_NUM aesdeclast_opd1 \xmm1
+ XMM_NUM aesdeclast_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesdeclast_opd1 aesdeclast_opd2
+ .byte 0x0f, 0x38, 0xdf
+ MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
+ .endm
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..a1dcfa3ab17d 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,8 +1,42 @@
#ifndef _ASM_X86_IO_H
#define _ASM_X86_IO_H
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
#define ARCH_HAS_IOREMAP_WC
+#include <linux/string.h>
#include <linux/compiler.h>
#include <asm-generic/int-ll64.h>
#include <asm/page.h>
@@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
extern void iounmap(volatile void __iomem *addr);
-#ifdef CONFIG_X86_32
-# include "io_32.h"
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+{
+ memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+{
+ memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+{
+ memcpy((void __force *)dst, src, count);
+}
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+/*
+ * Cache management
+ *
+ * This needed for two cases
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+static inline void flush_write_buffers(void)
+{
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+#endif
+}
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
#else
-# include "io_64.h"
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
#endif
+}
+
+#endif
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
extern void *xlate_dev_mem_ptr(unsigned long phys);
extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef _ASM_X86_IO_32_H
-#define _ASM_X86_IO_32_H
-
-#include <linux/string.h>
-#include <linux/compiler.h>
-
-/*
- * This file contains the definitions for the x86 IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated
- * to (a) handle it all in a way that makes gcc able to optimize it
- * as well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- */
-
-/*
- * Thanks to James van Artsdalen for a better timing-fix than
- * the two short jumps: using outb's to a nonexistent port seems
- * to guarantee better timings even on fast machines.
- *
- * On the other hand, I'd like to be sure of a non-existent port:
- * I feel a bit unsafe about using 0x80 (should be safe, though)
- *
- * Linus
- */
-
- /*
- * Bit simplified and optimized by Jan Hubicka
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
- *
- * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
- * isa_read[wl] and isa_write[wl] fixed
- * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-#define XQUAD_PORTIO_BASE 0xfe400000
-#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
-
-#ifdef __KERNEL__
-
-#include <asm-generic/iomap.h>
-
-#include <linux/vmalloc.h>
-
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, int count)
-{
- memset((void __force *)addr, val, count);
-}
-
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
-{
- __memcpy(dst, (const void __force *)src, count);
-}
-
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, int count)
-{
- __memcpy((void __force *)dst, src, count);
-}
-
-/*
- * ISA space is 'always mapped' on a typical x86 system, no need to
- * explicitly ioremap() it. The fact that the ISA IO space is mapped
- * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
- * are physical addresses. The following constant pointer can be
- * used as the IO-area pointer (it can be iounmapped as well, so the
- * analogy with PCI is quite large):
- */
-#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
-
-/*
- * Cache management
- *
- * This needed for two cases
- * 1. Out of order aware processors
- * 2. Accidentally out of order processors (PPro errata #51)
- */
-
-#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
-
-static inline void flush_write_buffers(void)
-{
- asm volatile("lock; addl $0,0(%%esp)": : :"memory");
-}
-
-#else
-
-#define flush_write_buffers() do { } while (0)
-
-#endif
-
-#endif /* __KERNEL__ */
-
-extern void native_io_delay(void);
-
-extern int io_delay_type;
-extern void io_delay_init(void);
-
-#if defined(CONFIG_PARAVIRT)
-#include <asm/paravirt.h>
-#else
-
-static inline void slow_down_io(void)
-{
- native_io_delay();
-#ifdef REALLY_SLOW_IO
- native_io_delay();
- native_io_delay();
- native_io_delay();
-#endif
-}
-
-#endif
-
-#define __BUILDIO(bwl, bw, type) \
-static inline void out##bwl(unsigned type value, int port) \
-{ \
- out##bwl##_local(value, port); \
-} \
- \
-static inline unsigned type in##bwl(int port) \
-{ \
- return in##bwl##_local(port); \
-}
-
-#define BUILDIO(bwl, bw, type) \
-static inline void out##bwl##_local(unsigned type value, int port) \
-{ \
- asm volatile("out" #bwl " %" #bw "0, %w1" \
- : : "a"(value), "Nd"(port)); \
-} \
- \
-static inline unsigned type in##bwl##_local(int port) \
-{ \
- unsigned type value; \
- asm volatile("in" #bwl " %w1, %" #bw "0" \
- : "=a"(value) : "Nd"(port)); \
- return value; \
-} \
- \
-static inline void out##bwl##_local_p(unsigned type value, int port) \
-{ \
- out##bwl##_local(value, port); \
- slow_down_io(); \
-} \
- \
-static inline unsigned type in##bwl##_local_p(int port) \
-{ \
- unsigned type value = in##bwl##_local(port); \
- slow_down_io(); \
- return value; \
-} \
- \
-__BUILDIO(bwl, bw, type) \
- \
-static inline void out##bwl##_p(unsigned type value, int port) \
-{ \
- out##bwl(value, port); \
- slow_down_io(); \
-} \
- \
-static inline unsigned type in##bwl##_p(int port) \
-{ \
- unsigned type value = in##bwl(port); \
- slow_down_io(); \
- return value; \
-} \
- \
-static inline void outs##bwl(int port, const void *addr, unsigned long count) \
-{ \
- asm volatile("rep; outs" #bwl \
- : "+S"(addr), "+c"(count) : "d"(port)); \
-} \
- \
-static inline void ins##bwl(int port, void *addr, unsigned long count) \
-{ \
- asm volatile("rep; ins" #bwl \
- : "+D"(addr), "+c"(count) : "d"(port)); \
-}
-
-BUILDIO(b, b, char)
-BUILDIO(w, w, short)
-BUILDIO(l, , int)
-
-#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
-#ifndef _ASM_X86_IO_64_H
-#define _ASM_X86_IO_64_H
-
-
-/*
- * This file contains the definitions for the x86 IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated
- * to (a) handle it all in a way that makes gcc able to optimize it
- * as well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- */
-
-/*
- * Thanks to James van Artsdalen for a better timing-fix than
- * the two short jumps: using outb's to a nonexistent port seems
- * to guarantee better timings even on fast machines.
- *
- * On the other hand, I'd like to be sure of a non-existent port:
- * I feel a bit unsafe about using 0x80 (should be safe, though)
- *
- * Linus
- */
-
- /*
- * Bit simplified and optimized by Jan Hubicka
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
- *
- * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
- * isa_read[wl] and isa_write[wl] fixed
- * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-extern void native_io_delay(void);
-
-extern int io_delay_type;
-extern void io_delay_init(void);
-
-#if defined(CONFIG_PARAVIRT)
-#include <asm/paravirt.h>
-#else
-
-static inline void slow_down_io(void)
-{
- native_io_delay();
-#ifdef REALLY_SLOW_IO
- native_io_delay();
- native_io_delay();
- native_io_delay();
-#endif
-}
-#endif
-
-/*
- * Talk about misusing macros..
- */
-#define __OUT1(s, x) \
-static inline void out##s(unsigned x value, unsigned short port) {
-
-#define __OUT2(s, s1, s2) \
-asm volatile ("out" #s " %" s1 "0,%" s2 "1"
-
-#ifndef REALLY_SLOW_IO
-#define REALLY_SLOW_IO
-#define UNSET_REALLY_SLOW_IO
-#endif
-
-#define __OUT(s, s1, x) \
- __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
- } \
- __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
- slow_down_io(); \
-}
-
-#define __IN1(s) \
-static inline RETURN_TYPE in##s(unsigned short port) \
-{ \
- RETURN_TYPE _v;
-
-#define __IN2(s, s1, s2) \
- asm volatile ("in" #s " %" s2 "1,%" s1 "0"
-
-#define __IN(s, s1, i...) \
- __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
- return _v; \
- } \
- __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
- slow_down_io(); \
- return _v; }
-
-#ifdef UNSET_REALLY_SLOW_IO
-#undef REALLY_SLOW_IO
-#endif
-
-#define __INS(s) \
-static inline void ins##s(unsigned short port, void *addr, \
- unsigned long count) \
-{ \
- asm volatile ("rep ; ins" #s \
- : "=D" (addr), "=c" (count) \
- : "d" (port), "0" (addr), "1" (count)); \
-}
-
-#define __OUTS(s) \
-static inline void outs##s(unsigned short port, const void *addr, \
- unsigned long count) \
-{ \
- asm volatile ("rep ; outs" #s \
- : "=S" (addr), "=c" (count) \
- : "d" (port), "0" (addr), "1" (count)); \
-}
-
-#define RETURN_TYPE unsigned char
-__IN(b, "")
-#undef RETURN_TYPE
-#define RETURN_TYPE unsigned short
-__IN(w, "")
-#undef RETURN_TYPE
-#define RETURN_TYPE unsigned int
-__IN(l, "")
-#undef RETURN_TYPE
-
-__OUT(b, "b", char)
-__OUT(w, "w", short)
-__OUT(l, , int)
-
-__INS(b)
-__INS(w)
-__INS(l)
-
-__OUTS(b)
-__OUTS(w)
-__OUTS(l)
-
-#if defined(__KERNEL__) && defined(__x86_64__)
-
-#include <linux/vmalloc.h>
-
-#include <asm-generic/iomap.h>
-
-void __memcpy_fromio(void *, unsigned long, unsigned);
-void __memcpy_toio(unsigned long, const void *, unsigned);
-
-static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
- unsigned len)
-{
- __memcpy_fromio(to, (unsigned long)from, len);
-}
-
-static inline void memcpy_toio(volatile void __iomem *to, const void *from,
- unsigned len)
-{
- __memcpy_toio((unsigned long)to, from, len);
-}
-
-void memset_io(volatile void __iomem *a, int b, size_t c);
-
-/*
- * ISA space is 'always mapped' on a typical x86 system, no need to
- * explicitly ioremap() it. The fact that the ISA IO space is mapped
- * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
- * are physical addresses. The following constant pointer can be
- * used as the IO-area pointer (it can be iounmapped as well, so the
- * analogy with PCI is quite large):
- */
-#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
-
-#define flush_write_buffers()
-
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p) p
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7c7c16cde1f8..35832a03a515 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,8 +143,6 @@ extern int noioapicreroute;
/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
extern int timer_through_8259;
-extern void io_apic_disable_legacy(void);
-
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -160,6 +158,7 @@ extern int io_apic_get_redir_entries(int ioapic);
struct io_apic_irq_attr;
extern int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr);
+void setup_IO_APIC_irq_extra(u32 gsi);
extern int (*ioapic_renumber_irq)(int ioapic, int irq);
extern void ioapic_init_mappings(void);
extern void ioapic_insert_resources(void);
@@ -188,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[];
int mp_find_ioapic(int gsi);
int mp_find_ioapic_pin(int ioapic, int gsi);
void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
+extern void __init pre_init_apic_IRQ0(void);
#else /* !CONFIG_X86_IO_APIC */
@@ -197,7 +197,11 @@ static const int timer_through_8259 = 0;
static inline void ioapic_init_mappings(void) { }
static inline void ioapic_insert_resources(void) { }
static inline void probe_nr_irqs_gsi(void) { }
+static inline int mp_find_ioapic(int gsi) { return 0; }
+struct io_apic_irq_attr;
+static inline int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr) { return 0; }
#endif
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
#ifndef _ASM_X86_IOMMU_H
#define _ASM_X86_IOMMU_H
-extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cbed6f4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,9 +34,10 @@ static inline int irq_canonicalize(int irq)
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
extern void fixup_irqs(void);
+extern void irq_force_complete_move(int);
#endif
-extern void (*generic_interrupt_extension)(void);
+extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..8767d99c4f64 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -28,28 +28,33 @@
#define MCE_VECTOR 0x12
/*
- * IDT vectors usable for external interrupt sources start
- * at 0x20:
+ * IDT vectors usable for external interrupt sources start at 0x20.
+ * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
*/
#define FIRST_EXTERNAL_VECTOR 0x20
-
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR 0x80
-# define IA32_SYSCALL_VECTOR 0x80
-#else
-# define IA32_SYSCALL_VECTOR 0x80
-#endif
+/*
+ * We start allocating at 0x21 to spread out vectors evenly between
+ * priority levels. (0x80 is the syscall vector)
+ */
+#define VECTOR_OFFSET_START 1
/*
- * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
- * cleanup after irq migration.
+ * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
+ * triggering cleanup after irq migration. 0x21-0x2f will still be used
+ * for device interrupts.
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+#define IA32_SYSCALL_VECTOR 0x80
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#endif
+
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
+ * round up to the next 16-vector boundary
*/
-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
@@ -106,27 +111,20 @@
/*
* Generic system vector for platform specific use
*/
-#define GENERIC_INTERRUPT_VECTOR 0xed
+#define X86_PLATFORM_IPI_VECTOR 0xed
/*
* Performance monitoring pending work vector:
*/
#define LOCAL_PENDING_VECTOR 0xec
-#define UV_BAU_MESSAGE 0xec
+#define UV_BAU_MESSAGE 0xea
/*
* Self IPI vector for machine checks
*/
#define MCE_SELF_VECTOR 0xeb
-/*
- * First APIC vector available to drivers: (vectors 0x30-0xee) we
- * start at 0x31(0x41) to spread out vectors evenly between priority
- * levels. (0x80 is the syscall vector)
- */
-#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
-
#define NR_VECTORS 256
#define FPU_IRQ 13
@@ -154,21 +152,21 @@ static inline int invalid_vm86_irq(int irq)
#define NR_IRQS_LEGACY 16
-#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
#ifdef CONFIG_X86_IO_APIC
# ifdef CONFIG_SPARSE_IRQ
+# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
# define NR_IRQS \
(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# else
-# if NR_CPUS < MAX_IO_APICS
-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
-# else
-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
-# endif
+# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
+# define NR_IRQS \
+ (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# endif
#else /* !CONFIG_X86_IO_APIC: */
# define NR_IRQS NR_IRQS_LEGACY
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
#include <linux/pci.h>
extern struct pci_device_id k8_nb_ids[];
+struct bootnode;
extern int early_is_k8_nb(u32 value);
extern struct pci_dev **k8_northbridges;
extern int num_k8_northbridges;
extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
+extern int k8_get_nodes(struct bootnode *nodes);
+extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int k8_scan_nodes(void);
#ifdef CONFIG_K8_NB
static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
typedef u8 kprobe_opcode_t;
#define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVEJUMP_SIZE 5
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
#define MAX_INSN_SIZE 16
#define MAX_STACK_SIZE 64
#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
#define flush_insn_slot(p) do { } while (0)
+/* optinsn template addresses */
+extern kprobe_opcode_t optprobe_template_entry;
+extern kprobe_opcode_t optprobe_template_val;
+extern kprobe_opcode_t optprobe_template_call;
+extern kprobe_opcode_t optprobe_template_end;
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE \
+ (((unsigned long)&optprobe_template_end - \
+ (unsigned long)&optprobe_template_entry) + \
+ MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+
extern const int kretprobe_blacklist_size;
void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
int boostable;
};
+struct arch_optimized_insn {
+ /* copy of the original instructions */
+ kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ /* detour code buffer */
+ kprobe_opcode_t *insn;
+ /* the size of instructions copied to detour code buffer */
+ size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+ return optinsn->size;
+}
+
struct prev_kprobe {
struct kprobe *kp;
unsigned long status;
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..f46b79f6c16c 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
#define __KVM_HAVE_MSIX
#define __KVM_HAVE_MCE
#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
#define KVM_IRQCHIP_PIC_MASTER 0
#define KVM_IRQCHIP_PIC_SLAVE 1
#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -250,4 +253,35 @@ struct kvm_reinject_control {
__u8 pit_reinject;
__u8 reserved[31];
};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 has_error_code;
+ __u8 pad;
+ __u32 error_code;
+ } exception;
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 soft;
+ __u8 pad;
+ } interrupt;
+ struct {
+ __u8 injected;
+ __u8 pending;
+ __u8 masked;
+ __u8 pad;
+ } nmi;
+ __u32 sipi_vector;
+ __u32 flags;
+ __u32 reserved[10];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7a6f54fa13ba 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
- * Used for instruction fetch, stack operations, and others.
+ * Used for descriptor reading.
* @addr: [IN ] Linear address from which to read.
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu);
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
struct kvm_vcpu *vcpu);
/*
- * write_emulated: Read bytes from emulated/special memory area.
+ * write_emulated: Write bytes to emulated/special memory area.
* @addr: [IN ] Linear address to which to write.
* @val: [IN ] Value to write to memory (low-order bytes used as
* required).
@@ -129,7 +139,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
unsigned long regs[NR_VCPU_REGS];
- unsigned long eip;
+ unsigned long eip, eip_orig;
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be000435fad..06d9e79ca37d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
-#define KVM_MAX_VCPUS 16
+#define KVM_MAX_VCPUS 64
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
- (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
- (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
-#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_GUEST_CR4_MASK \
- (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
@@ -256,7 +243,8 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
+ u32 *error);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
u32 regs_dirty;
unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
unsigned long cr2;
unsigned long cr3;
unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
u64 pdptrs[4]; /* pae */
- u64 shadow_efer;
+ u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
int32_t apic_arb_prio;
@@ -354,7 +344,6 @@ struct kvm_vcpu_arch {
unsigned int time_offset;
struct page *time_page;
- bool singlestep; /* guest is single stepped by KVM */
bool nmi_pending;
bool nmi_injected;
@@ -371,17 +360,31 @@ struct kvm_vcpu_arch {
u64 mcg_status;
u64 mcg_ctl;
u64 *mce_banks;
+
+ /* used for guest single stepping over the given code position */
+ u16 singlestep_cs;
+ unsigned long singlestep_rip;
+ /* fields used by HYPER-V emulation */
+ u64 hv_vapic;
};
struct kvm_mem_alias {
gfn_t base_gfn;
unsigned long npages;
gfn_t target_gfn;
+#define KVM_ALIAS_INVALID 1UL
+ unsigned long flags;
};
-struct kvm_arch{
- int naliases;
+#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
+
+struct kvm_mem_aliases {
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+ int naliases;
+};
+
+struct kvm_arch {
+ struct kvm_mem_aliases *aliases;
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -397,7 +400,6 @@ struct kvm_arch{
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
- struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
unsigned int tss_addr;
@@ -410,8 +412,14 @@ struct kvm_arch{
gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
- unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
u64 vm_init_tsc;
+ s64 kvmclock_offset;
+
+ struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* fields used by HYPER-V emulation */
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
};
struct kvm_vm_stat {
@@ -461,12 +469,13 @@ struct descriptor_table {
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
- void (*hardware_enable)(void *dummy); /* __init */
+ int (*hardware_enable)(void *dummy);
void (*hardware_disable)(void *dummy);
void (*check_processor_compatibility)(void *rtn);
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -477,8 +486,8 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- int (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg);
+ void (*set_guest_debug)(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -488,6 +497,7 @@ struct kvm_x86_ops {
void (*set_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -497,17 +507,18 @@ struct kvm_x86_ops {
void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
- void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception);
+ int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
+ int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
+ void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
- void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
- int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ void (*run)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,13 +530,16 @@ struct kvm_x86_ops {
bool has_error_code, u32 error_code);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+ void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- bool (*gb_page_enable)(void);
+ int (*get_lpage_level)(void);
+ bool (*rdtscp_supported)(void);
const struct trace_print_flags *exit_reasons_str;
};
@@ -568,7 +582,7 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2, u16 error_code, int emulation_type);
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +599,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
int size, unsigned long count, int down,
gva_t address, int rep, unsigned port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -600,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
@@ -616,6 +629,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -644,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -657,6 +677,7 @@ void kvm_disable_tdp(void);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
+bool kvm_check_iopl(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
@@ -796,9 +817,13 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f4..ffae1420e7d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <linux/types.h>
+#include <asm/hyperv.h>
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..2e9972468a5d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
#define __local_add(i, l) local_add((i), (l))
#define __local_sub(i, l) local_sub((i), (l))
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations. Note they take
- * a variable, not an address.
- *
- * X86_64: This could be done better if we moved the per cpu data directly
- * after GS.
- */
-
-/* Need to disable preemption for the cpu local counters otherwise we could
- still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l) \
-({ \
- local_t res__; \
- preempt_disable(); \
- res__ = (l); \
- preempt_enable(); \
- res__; \
-})
-#define cpu_local_wrap(l) \
-({ \
- preempt_disable(); \
- (l); \
- preempt_enable(); \
-}) \
-
-#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
-#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
-#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
-#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
-#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
-#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
-
-#define __cpu_local_inc(l) cpu_local_inc((l))
-#define __cpu_local_dec(l) cpu_local_dec((l))
-#define __cpu_local_add(i, l) cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
-
#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b608a64c5814..6c3fdd631ed3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,8 +108,11 @@ struct mce_log {
#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
+
#ifdef __KERNEL__
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
#include <linux/percpu.h>
#include <linux/init.h>
#include <asm/atomic.h>
@@ -118,9 +121,11 @@ extern int mce_disabled;
extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
#else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -133,6 +138,8 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif
+extern void (*x86_mce_decode_callback)(struct mce *m);
+
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct sys_device, mce_dev);
@@ -212,5 +219,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
void mce_log_therm_throt_event(__u64 status);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
/*
* generic node memory support, the following assumptions apply:
*
- * 1) memory comes in 64Mb contigious chunks which are either present or not
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
* 2) we will not have more than 64Gb in total
*
* for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_spanned_pages)
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
-#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-#endif
-
#endif
#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 79c94500c0bb..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
static inline void find_smp_config(void)
{
- x86_init.mpparse.find_smp_config(1);
-}
-
-static inline void early_find_smp_config(void)
-{
- x86_init.mpparse.find_smp_config(0);
+ x86_init.mpparse.find_smp_config();
}
#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
# else
# define default_mpc_oem_bus_info NULL
# endif
-extern void default_find_smp_config(unsigned int reserve);
+extern void default_find_smp_config(void);
extern void default_get_smp_config(unsigned int early);
#else
static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#define default_mpc_apic_id NULL
#define default_smp_read_mpc_oem NULL
#define default_mpc_oem_bus_info NULL
-#define default_find_smp_config x86_init_uint_noop
+#define default_find_smp_config x86_init_noop
#define default_get_smp_config x86_init_uint_noop
#endif
@@ -163,14 +158,16 @@ typedef struct physid_mask physid_mask_t;
#define physids_shift_left(d, s, n) \
bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
-#define physids_coerce(map) ((map).mask[0])
+static inline unsigned long physids_coerce(physid_mask_t *map)
+{
+ return map->mask[0];
+}
-#define physids_promote(physids) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- __physid_mask.mask[0] = physids; \
- __physid_mask; \
- })
+static inline void physids_promote(unsigned long physids, physid_mask_t *map)
+{
+ physids_clear(*map);
+ map->mask[0] = physids;
+}
/* Note: will create very large stack frames if physid_mask_t is big */
#define physid_mask_of_physid(physid) \
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
new file mode 100644
index 000000000000..451d30e7f62d
--- /dev/null
+++ b/arch/x86/include/asm/mrst.h
@@ -0,0 +1,19 @@
+/*
+ * mrst.h: Intel Moorestown platform specific setup code
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _ASM_X86_MRST_H
+#define _ASM_X86_MRST_H
+extern int pci_mrst_init(void);
+int __init sfi_parse_mrtc(struct sfi_table_header *table);
+
+#define SFI_MTMR_MAX_NUM 8
+#define SFI_MRTC_MAX 8
+
+#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4ffe09b2ad75..1cd58cdbc03f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -12,6 +12,7 @@
#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
/* EFER bits: */
#define _EFER_SCE 0 /* SYSCALL/SYSRET */
@@ -123,6 +124,7 @@
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+#define MSR_FAM10H_NODE_ID 0xc001100c
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7e2b6ba962ff..c5bc4c2d33f5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -27,6 +27,18 @@ struct msr {
};
};
+struct msr_info {
+ u32 msr_no;
+ struct msr reg;
+ struct msr *msrs;
+ int err;
+};
+
+struct msr_regs_info {
+ u32 *regs;
+ int err;
+};
+
static inline unsigned long long native_read_tscp(unsigned int *aux)
{
unsigned long low, high;
@@ -240,15 +252,18 @@ do { \
#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
(u32)((val) >> 32))
-#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
+#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
+
+#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
-#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+struct msr *msrs_alloc(void);
+void msrs_free(struct msr *msrs);
#ifdef CONFIG_SMP
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
-void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
-void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
@@ -264,12 +279,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
wrmsr(msr_no, l, h);
return 0;
}
-static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
}
-static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 139d4c1a33a7..93da9c3f3341 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
extern int check_nmi_watchdog(void);
extern int nmi_watchdog_enabled;
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
-extern int avail_to_resrv_perfctr_nmi(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+#endif /* CONFIG_NUMA_EMU */
#else
static inline void init_cpu_to_node(void) { }
static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 9f0a5f5d29ec..37c516545ec8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -30,9 +30,14 @@
extern int found_numaq;
extern int get_memcfg_numaq(void);
+extern int pci_numaq_init(void);
extern void *xquad_portio;
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
+
/*
* SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
*/
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 834a30295fab..101229b0d8ed 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,7 +13,6 @@ struct olpc_platform_t {
#define OLPC_F_PRESENT 0x01
#define OLPC_F_DCON 0x02
-#define OLPC_F_VSA 0x04
#ifdef CONFIG_OLPC
@@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)
}
/*
- * The VSA is software from AMD that typical Geode bioses will include.
- * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
- * not include the VSA; instead, PCI is emulated by the kernel.
- *
- * The VSA is described further in arch/x86/pci/olpc.c.
- */
-static inline int olpc_has_vsa(void)
-{
- return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
-}
-
-/*
* The "Mass Production" version of OLPC's XO is identified as being model
* C2. During the prototype phase, the following models (in chronological
* order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
@@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void)
return 0;
}
-static inline int olpc_has_vsa(void)
-{
- return 0;
-}
-
#endif
+extern int pci_olpc_init(void);
+
/* EC related functions */
extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
@@ -120,7 +104,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
/* GPIO assignments */
-#define OLPC_GPIO_MIC_AC geode_gpio(1)
+#define OLPC_GPIO_MIC_AC 1
#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
#define OLPC_GPIO_SMB_CLK geode_gpio(14)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..a667f24c7254 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,7 +40,6 @@
#ifndef __ASSEMBLY__
-extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
@@ -49,7 +48,8 @@ extern unsigned long max_pfn_mapped;
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+ int acpi, int k8);
extern void free_initmem(void);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8aebcc41041d..5653f43d90e5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}
-#ifdef CONFIG_HIGHPTE
-static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
-{
- unsigned long ret;
- ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
- return (void *)ret;
-}
-#endif
-
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@@ -731,34 +722,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+static inline int arch_spin_is_locked(struct arch_spinlock *lock)
{
return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
}
-static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+static inline int arch_spin_is_contended(struct arch_spinlock *lock)
{
return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
}
-#define __raw_spin_is_contended __raw_spin_is_contended
+#define arch_spin_is_contended arch_spin_is_contended
-static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
{
PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
}
-static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
+static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
unsigned long flags)
{
PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
}
-static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
{
return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
}
-static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
{
PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
}
@@ -840,42 +831,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
static inline unsigned long __raw_local_save_flags(void)
{
- unsigned long f;
-
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : paravirt_type(pv_irq_ops.save_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
- return f;
+ return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
static inline void raw_local_irq_restore(unsigned long f)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- : "=a"(f)
- : PV_FLAGS_ARG(f),
- paravirt_type(pv_irq_ops.restore_fl),
- paravirt_clobber(CLBR_EAX)
- : "memory", "cc");
+ PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
static inline void raw_local_irq_disable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_disable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
static inline void raw_local_irq_enable(void)
{
- asm volatile(paravirt_alt(PARAVIRT_CALL)
- :
- : paravirt_type(pv_irq_ops.irq_enable),
- paravirt_clobber(CLBR_EAX)
- : "memory", "eax", "cc");
+ PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
static inline unsigned long __raw_local_irq_save(void)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index dd0f5b32489d..db9ef5532341 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
#endif /* PAGETABLE_LEVELS == 4 */
#endif /* PAGETABLE_LEVELS >= 3 */
-#ifdef CONFIG_HIGHPTE
- void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
-#endif
-
struct pv_lazy_ops lazy_mode;
/* dom0 ops */
@@ -318,14 +314,14 @@ struct pv_mmu_ops {
phys_addr_t phys, pgprot_t flags);
};
-struct raw_spinlock;
+struct arch_spinlock;
struct pv_lock_ops {
- int (*spin_is_locked)(struct raw_spinlock *lock);
- int (*spin_is_contended)(struct raw_spinlock *lock);
- void (*spin_lock)(struct raw_spinlock *lock);
- void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
- int (*spin_trylock)(struct raw_spinlock *lock);
- void (*spin_unlock)(struct raw_spinlock *lock);
+ int (*spin_is_locked)(struct arch_spinlock *lock);
+ int (*spin_is_contended)(struct arch_spinlock *lock);
+ void (*spin_lock)(struct arch_spinlock *lock);
+ void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
+ int (*spin_trylock)(struct arch_spinlock *lock);
+ void (*spin_unlock)(struct arch_spinlock *lock);
};
/* This contains all the paravirt structures: we get a convenient
@@ -494,10 +490,11 @@ int paravirt_disable_iospace(void);
#define EXTRA_CLOBBERS
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
+/* [re]ax isn't an arg, but the return val */
#define PVOP_VCALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
+ __edx = __edx, __ecx = __ecx, __eax = __eax
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
@@ -509,6 +506,7 @@ int paravirt_disable_iospace(void);
"=c" (__ecx)
#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+/* void functions are still allowed [re]ax for scratch */
#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
@@ -583,8 +581,8 @@ int paravirt_disable_iospace(void);
VEXTRA_CLOBBERS, \
pre, post, ##__VA_ARGS__)
-#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
+#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
+ ____PVOP_VCALL(op.func, CLBR_RET_REG, \
PVOP_VCALLEE_CLOBBERS, , \
pre, post, ##__VA_ARGS__)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c201d513..404a880ea325 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus)
#ifdef CONFIG_PCI
extern unsigned int pcibios_assign_all_busses(void);
+extern int pci_legacy_init(void);
+# ifdef CONFIG_ACPI
+# define x86_default_pci_init pci_acpi_init
+# else
+# define x86_default_pci_init pci_legacy_init
+# endif
#else
-#define pcibios_assign_all_busses() 0
+# define pcibios_assign_all_busses() 0
+# define x86_default_pci_init NULL
#endif
extern unsigned long pci_mem_start;
@@ -90,40 +97,14 @@ extern void pci_iommu_alloc(void);
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
-#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
- dma_addr_t ADDR_NAME;
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
- __u32 LEN_NAME;
-#define pci_unmap_addr(PTR, ADDR_NAME) \
- ((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
- (((PTR)->ADDR_NAME) = (VAL))
-#define pci_unmap_len(PTR, LEN_NAME) \
- ((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
- (((PTR)->LEN_NAME) = (VAL))
-
-#else
-
-#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
-#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
-#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
-#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
- do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
-#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
-#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
- do { break; } while (pci_unmap_len(PTR, LEN_NAME))
-
-#endif
-
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_64
#include "pci_64.h"
#endif
+void dma32_reserve_bootmem(void);
+
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67daf..fe15cfb21b9b 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
int reg, int len, u32 value);
-extern void dma32_reserve_bootmem(void);
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..1a0422348d6d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -29,6 +29,7 @@
#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
#define PCI_HAS_IO_ECS 0x40000
#define PCI_NOASSIGN_ROMS 0x80000
+#define PCI_ROOT_NO_CRS 0x100000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
@@ -82,7 +83,6 @@ struct irq_routing_table {
extern unsigned int pcibios_irq_mask;
-extern int pcibios_scanned;
extern spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
@@ -105,24 +105,39 @@ extern bool port_cf9_safe;
extern int pci_direct_probe(void);
extern void pci_direct_init(int type);
extern void pci_pcbios_init(void);
-extern int pci_olpc_init(void);
extern void __init dmi_check_pciprobe(void);
extern void __init dmi_check_skip_isa_align(void);
/* some common used subsys_initcalls */
extern int __init pci_acpi_init(void);
-extern int __init pcibios_irq_init(void);
-extern int __init pci_visws_init(void);
-extern int __init pci_numaq_init(void);
+extern void __init pcibios_irq_init(void);
extern int __init pcibios_init(void);
+extern int pci_legacy_init(void);
+extern void pcibios_fixup_irqs(void);
/* pci-mmconfig.c */
+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
+
+struct pci_mmcfg_region {
+ struct list_head list;
+ struct resource res;
+ u64 address;
+ char __iomem *virt;
+ u16 segment;
+ u8 start_bus;
+ u8 end_bus;
+ char name[PCI_MMCFG_RESOURCE_NAME_LEN];
+};
+
extern int __init pci_mmcfg_arch_init(void);
extern void __init pci_mmcfg_arch_free(void);
+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+
+extern struct list_head pci_mmcfg_list;
-extern struct acpi_mcfg_allocation *pci_mmcfg_config;
-extern int pci_mmcfg_config_num;
+#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
/*
* AMD Fam10h CPUs are buggy, and cannot access MMIO config space
@@ -166,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
{
asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
+
+#ifdef CONFIG_PCI
+# ifdef CONFIG_ACPI
+# define x86_default_pci_init pci_acpi_init
+# else
+# define x86_default_pci_init pci_legacy_init
+# endif
+# define x86_default_pci_init_irq pcibios_irq_init
+# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
+#else
+# define x86_default_pci_init NULL
+# define x86_default_pci_init_irq NULL
+# define x86_default_pci_fixup_irqs NULL
+#endif
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b65a36defeb7..66a272dfd8b8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
*/
#ifdef CONFIG_SMP
#define PER_CPU(var, reg) \
- __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
- lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
+ __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
+ lea var(reg), reg
+#define PER_CPU_VAR(var) %__percpu_seg:var
#else /* ! SMP */
-#define PER_CPU(var, reg) \
- __percpu_mov_op $per_cpu__##var, reg
-#define PER_CPU_VAR(var) per_cpu__##var
+#define PER_CPU(var, reg) __percpu_mov_op $var, reg
+#define PER_CPU_VAR(var) var
#endif /* SMP */
#ifdef CONFIG_X86_64_SMP
#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
#else
-#define INIT_PER_CPU_VAR(var) per_cpu__##var
+#define INIT_PER_CPU_VAR(var) var
#endif
#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
* There also must be an entry in vmlinux_64.lds.S
*/
#define DECLARE_INIT_PER_CPU(var) \
- extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
+ extern typeof(var) init_per_cpu_var(var)
#ifdef CONFIG_X86_64_SMP
#define init_per_cpu_var(var) init_per_cpu__##var
#else
-#define init_per_cpu_var(var) per_cpu_var(var)
+#define init_per_cpu_var(var) var
#endif
/* For arch-specific code, we can use direct single-insn ops (they
@@ -74,63 +73,121 @@ extern void __bad_percpu_size(void);
#define percpu_to_op(op, var, val) \
do { \
- typedef typeof(var) T__; \
+ typedef typeof(var) pto_T__; \
if (0) { \
- T__ tmp__; \
- tmp__ = (val); \
+ pto_T__ pto_tmp__; \
+ pto_tmp__ = (val); \
} \
switch (sizeof(var)) { \
case 1: \
asm(op "b %1,"__percpu_arg(0) \
: "+m" (var) \
- : "qi" ((T__)(val))); \
+ : "qi" ((pto_T__)(val))); \
break; \
case 2: \
asm(op "w %1,"__percpu_arg(0) \
: "+m" (var) \
- : "ri" ((T__)(val))); \
+ : "ri" ((pto_T__)(val))); \
break; \
case 4: \
asm(op "l %1,"__percpu_arg(0) \
: "+m" (var) \
- : "ri" ((T__)(val))); \
+ : "ri" ((pto_T__)(val))); \
break; \
case 8: \
asm(op "q %1,"__percpu_arg(0) \
: "+m" (var) \
- : "re" ((T__)(val))); \
+ : "re" ((pto_T__)(val))); \
break; \
default: __bad_percpu_size(); \
} \
} while (0)
+/*
+ * Generate a percpu add to memory instruction and optimize code
+ * if a one is added or subtracted.
+ */
+#define percpu_add_op(var, val) \
+do { \
+ typedef typeof(var) pao_T__; \
+ const int pao_ID__ = (__builtin_constant_p(val) && \
+ ((val) == 1 || (val) == -1)) ? (val) : 0; \
+ if (0) { \
+ pao_T__ pao_tmp__; \
+ pao_tmp__ = (val); \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ if (pao_ID__ == 1) \
+ asm("incb "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decb "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addb %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "qi" ((pao_T__)(val))); \
+ break; \
+ case 2: \
+ if (pao_ID__ == 1) \
+ asm("incw "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decw "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addw %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 4: \
+ if (pao_ID__ == 1) \
+ asm("incl "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decl "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addl %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 8: \
+ if (pao_ID__ == 1) \
+ asm("incq "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decq "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addq %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "re" ((pao_T__)(val))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
#define percpu_from_op(op, var, constraint) \
({ \
- typeof(var) ret__; \
+ typeof(var) pfo_ret__; \
switch (sizeof(var)) { \
case 1: \
asm(op "b "__percpu_arg(1)",%0" \
- : "=q" (ret__) \
+ : "=q" (pfo_ret__) \
: constraint); \
break; \
case 2: \
asm(op "w "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
+ : "=r" (pfo_ret__) \
: constraint); \
break; \
case 4: \
asm(op "l "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
+ : "=r" (pfo_ret__) \
: constraint); \
break; \
case 8: \
asm(op "q "__percpu_arg(1)",%0" \
- : "=r" (ret__) \
+ : "=r" (pfo_ret__) \
: constraint); \
break; \
default: __bad_percpu_size(); \
} \
- ret__; \
+ pfo_ret__; \
})
/*
@@ -142,23 +199,99 @@ do { \
* per-thread variables implemented as per-cpu variables and thus
* stable for the duration of the respective task.
*/
-#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \
- "m" (per_cpu__##var))
-#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \
- "p" (&per_cpu__##var))
-#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
-#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
-#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
+#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
+#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
+#define percpu_write(var, val) percpu_to_op("mov", var, val)
+#define percpu_add(var, val) percpu_add_op(var, val)
+#define percpu_sub(var, val) percpu_add_op(var, -(val))
+#define percpu_and(var, val) percpu_to_op("and", var, val)
+#define percpu_or(var, val) percpu_to_op("or", var, val)
+#define percpu_xor(var, val) percpu_to_op("xor", var, val)
+
+#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+
+#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
+#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
+#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
+#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
+#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
+#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
+#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
+#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
+#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
+#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
+#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
+#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+
+#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+
+#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+
+/*
+ * Per cpu atomic 64 bit operations are only available under 64 bit.
+ * 32 bit must fall back to generic operations.
+ */
+#ifdef CONFIG_X86_64
+#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
+#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
+#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
+#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
+#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+
+#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
+#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+
+#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
+#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
+#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
+#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+
+#endif
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
#define x86_test_and_clear_bit_percpu(bit, var) \
({ \
int old__; \
asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
- : "=r" (old__), "+m" (per_cpu__##var) \
+ : "=r" (old__), "+m" (var) \
: "dIr" (bit)); \
old__; \
})
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ad7ce3fd5065..db6109a885a7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,7 +18,8 @@
#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
@@ -26,11 +27,34 @@
/*
* Includes eventsel and unit mask as well:
*/
-#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+
+#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
+#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
+#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
+#define INTEL_ARCH_INV_MASK 0x00800000ULL
+#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
+#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
+
+/*
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
+#define INTEL_ARCH_FIXED_MASK \
+ (INTEL_ARCH_CNT_MASK| \
+ INTEL_ARCH_INV_MASK| \
+ INTEL_ARCH_EDGE_MASK|\
+ INTEL_ARCH_UNIT_MASK|\
+ INTEL_ARCH_EVENT_MASK)
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
@@ -93,6 +117,18 @@ union cpuid10_edx {
*/
#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
+/* IbsFetchCtl bits/masks */
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT 0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
+
+/* IbsOpCtl bits */
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_MAX_CNT 0x0000FFFFULL
#ifdef CONFIG_PERF_EVENTS
extern void init_hw_perf_events(void);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
#endif
/*
+ * Flags to use when allocating a user page table page.
+ */
+extern gfp_t __userpte_alloc_gfp;
+
+/*
* Allocate and free page tables.
*/
extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
#ifndef __ASSEMBLY__
+#include <asm/x86_init.h>
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
unsigned long new_flags)
{
/*
- * PAT type is always WB for ISA. So no need to check.
+ * PAT type is always WB for untracked ranges, so no need to check.
*/
- if (is_ISA_range(paddr, paddr + size - 1))
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
return 1;
/*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..47339a1ac7b6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
in_irq() ? KM_IRQ_PTE : \
KM_PTE0)
#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
pte_index((address)))
#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
@@ -80,7 +80,7 @@ do { \
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
*/
-#define update_mmu_cache(vma, address, pte) do { } while (0)
+#define update_mmu_cache(vma, address, ptep) do { } while (0)
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..181be528c612 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define pte_unmap(pte) /* NOP */
#define pte_unmap_nested(pte) /* NOP */
-#define update_mmu_cache(vma, address, pte) do { } while (0)
+#define update_mmu_cache(vma, address, ptep) do { } while (0)
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c3429e8b2424..b753ea59703a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
#include <linux/math64.h>
#include <linux/init.h>
+#define HBP_NUM 4
/*
* Default implementation of macro that returns current
* instruction pointer ("program counter").
@@ -180,7 +181,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
/* ecx is often an input as well as an output. */
- asm("cpuid"
+ asm volatile("cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
+struct perf_event;
+
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,12 @@ struct thread_struct {
unsigned long fs;
#endif
unsigned long gs;
- /* Hardware debugging registers: */
- unsigned long debugreg0;
- unsigned long debugreg1;
- unsigned long debugreg2;
- unsigned long debugreg3;
- unsigned long debugreg6;
- unsigned long debugreg7;
+ /* Save middle states of ptrace breakpoints */
+ struct perf_event *ptrace_bps[HBP_NUM];
+ /* Debug status used for traps, single steps, etc... */
+ unsigned long debugreg6;
+ /* Keep track of the exact dr7 value set by the user */
+ unsigned long ptrace_dr7;
/* Fault info: */
unsigned long cr2;
unsigned long trap_no;
@@ -1000,7 +1002,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..6f414ed88620 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,31 +5,22 @@
/* misc architecture specific prototypes */
-extern void early_idt_handler(void);
+void early_idt_handler(void);
-extern void system_call(void);
-extern void syscall_init(void);
+void system_call(void);
+void syscall_init(void);
-extern void ia32_syscall(void);
-extern void ia32_cstar_target(void);
-extern void ia32_sysenter_target(void);
+void ia32_syscall(void);
+void ia32_cstar_target(void);
+void ia32_sysenter_target(void);
-extern void syscall32_cpu_init(void);
+void syscall32_cpu_init(void);
-extern void check_efer(void);
+void x86_configure_nx(void);
+void x86_report_nx(void);
extern int reboot_force;
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
-/*
- * This looks more complex than it should be. But we need to
- * get the type for the ~ right in round_down (it needs to be
- * as wide as the result!), and we want to evaluate the macro
- * arguments just once each.
- */
-#define __round_mask(x,y) ((__typeof__(x))((y)-1))
-#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
-#define round_down(x,y) ((x) & ~__round_mask(x,y))
-
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..69a686a7dff0 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
#ifdef __KERNEL__
#include <asm/segment.h>
+#include <asm/page_types.h>
#endif
#ifndef __ASSEMBLY__
@@ -216,20 +217,72 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
return regs->sp;
}
-/*
- * These are defined as per linux/ptrace.h, which see.
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register. The @offset is the
+ * offset of the register in struct pt_regs address which specified by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
*/
-#define arch_has_single_step() (1)
-extern void user_enable_single_step(struct task_struct *);
-extern void user_disable_single_step(struct task_struct *);
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned int offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
-extern void user_enable_block_step(struct task_struct *);
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return *addr;
+ else
+ return 0;
+}
+
+#define arch_has_single_step() (1)
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step() (1)
#else
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
#endif
+#define ARCH_HAS_USER_SINGLE_STEP_INFO
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d33776..606ede126972 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -41,6 +41,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>
+#include <asm/asm.h>
struct rwsem_waiter;
@@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore *
/*
* the semaphore definition
+ *
+ * The bias values and the counter type limits the number of
+ * potential readers/writers to 32767 for 32 bits and 2147483647
+ * for 64 bits.
*/
-#define RWSEM_UNLOCKED_VALUE 0x00000000
-#define RWSEM_ACTIVE_BIAS 0x00000001
-#define RWSEM_ACTIVE_MASK 0x0000ffff
-#define RWSEM_WAITING_BIAS (-0x00010000)
+#ifdef CONFIG_X86_64
+# define RWSEM_ACTIVE_MASK 0xffffffffL
+#else
+# define RWSEM_ACTIVE_MASK 0x0000ffffL
+#endif
+
+#define RWSEM_UNLOCKED_VALUE 0x00000000L
+#define RWSEM_ACTIVE_BIAS 0x00000001L
+#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+typedef signed long rwsem_count_t;
+
struct rw_semaphore {
- signed long count;
+ rwsem_count_t count;
spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +117,7 @@ do { \
static inline void __down_read(struct rw_semaphore *sem)
{
asm volatile("# beginning down_read\n\t"
- LOCK_PREFIX " incl (%%eax)\n\t"
+ LOCK_PREFIX _ASM_INC "(%1)\n\t"
/* adds 0x00000001, returns the old value */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
@@ -121,14 +133,14 @@ static inline void __down_read(struct rw_semaphore *sem)
*/
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
- __s32 result, tmp;
+ rwsem_count_t result, tmp;
asm volatile("# beginning __down_read_trylock\n\t"
- " movl %0,%1\n\t"
+ " mov %0,%1\n\t"
"1:\n\t"
- " movl %1,%2\n\t"
- " addl %3,%2\n\t"
+ " mov %1,%2\n\t"
+ " add %3,%2\n\t"
" jle 2f\n\t"
- LOCK_PREFIX " cmpxchgl %2,%0\n\t"
+ LOCK_PREFIX " cmpxchg %2,%0\n\t"
" jnz 1b\n\t"
"2:\n\t"
"# ending __down_read_trylock\n\t"
@@ -143,13 +155,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
*/
static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
{
- int tmp;
+ rwsem_count_t tmp;
tmp = RWSEM_ACTIVE_WRITE_BIAS;
asm volatile("# beginning down_write\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xadd %1,(%2)\n\t"
/* subtract 0x0000ffff, returns the old value */
- " testl %%edx,%%edx\n\t"
+ " test %1,%1\n\t"
/* was the count 0 before? */
" jz 1f\n"
" call call_rwsem_down_write_failed\n"
@@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem)
*/
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- signed long ret = cmpxchg(&sem->count,
- RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
+ rwsem_count_t ret = cmpxchg(&sem->count,
+ RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
if (ret == RWSEM_UNLOCKED_VALUE)
return 1;
return 0;
@@ -183,9 +195,9 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
*/
static inline void __up_read(struct rw_semaphore *sem)
{
- __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+ rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
asm volatile("# beginning __up_read\n\t"
- LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xadd %1,(%2)\n\t"
/* subtracts 1, returns the old value */
" jns 1f\n\t"
" call call_rwsem_wake\n"
@@ -201,18 +213,18 @@ static inline void __up_read(struct rw_semaphore *sem)
*/
static inline void __up_write(struct rw_semaphore *sem)
{
+ rwsem_count_t tmp;
asm volatile("# beginning __up_write\n\t"
- " movl %2,%%edx\n\t"
- LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
+ LOCK_PREFIX " xadd %1,(%2)\n\t"
/* tries to transition
0xffff0001 -> 0x00000000 */
" jz 1f\n"
" call call_rwsem_wake\n"
"1:\n\t"
"# ending __up_write\n"
- : "+m" (sem->count)
- : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc", "edx");
+ : "+m" (sem->count), "=d" (tmp)
+ : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS)
+ : "memory", "cc");
}
/*
@@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem)
static inline void __downgrade_write(struct rw_semaphore *sem)
{
asm volatile("# beginning __downgrade_write\n\t"
- LOCK_PREFIX " addl %2,(%%eax)\n\t"
- /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
+ LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
+ /*
+ * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
+ * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
+ */
" jns 1f\n\t"
" call call_rwsem_downgrade_wake\n"
"1:\n\t"
"# ending __downgrade_write\n"
: "+m" (sem->count)
- : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
+ : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
: "memory", "cc");
}
/*
* implement atomic add functionality
*/
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(rwsem_count_t delta,
+ struct rw_semaphore *sem)
{
- asm volatile(LOCK_PREFIX "addl %1,%0"
+ asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
: "+m" (sem->count)
- : "ir" (delta));
+ : "er" (delta));
}
/*
* implement exchange and add functionality
*/
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
+ struct rw_semaphore *sem)
{
- int tmp = delta;
+ rwsem_count_t tmp = delta;
asm volatile(LOCK_PREFIX "xadd %0,%1"
: "+r" (tmp), "+m" (sem->count)
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
#define _ASM_X86_SECTIONS_H
#include <asm-generic/sections.h>
+#include <asm/uaccess.h>
extern char __brk_base[], __brk_limit[];
+extern struct exception_table_entry __stop___ex_table[];
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+extern char __end_rodata_hpage_align[];
+#endif
#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c98ff0..86b1506f4179 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -37,10 +37,8 @@ void setup_bios_corruption_check(void);
#ifdef CONFIG_X86_VISWS
extern void visws_early_detect(void);
-extern int is_visws_box(void);
#else
static inline void visws_early_detect(void) { }
-static inline int is_visws_box(void) { return 0; }
#endif
extern unsigned long saved_video_mode;
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..04459d25e66e 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -124,7 +124,7 @@ struct sigcontext {
* fpstate is really (struct _fpstate *) or (struct _xstate *)
* depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
* bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
+ * of extended memory layout. See comments at the definition of
* (struct _fpx_sw_bytes)
*/
void __user *fpstate; /* zero when no FPU/extended context */
@@ -219,7 +219,7 @@ struct sigcontext {
* fpstate is really (struct _fpstate *) or (struct _xstate *)
* depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
* bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the defintion of
+ * of extended memory layout. See comments at the definition of
* (struct _fpx_sw_bytes)
*/
void __user *fpstate; /* zero when no FPU/extended context */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e796782cd7b..4cfc90824068 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+int wbinvd_on_all_cpus(void);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
{
return cpumask_weight(cpu_callout_mask);
}
+#else /* !CONFIG_SMP */
+#define wbinvd_on_cpu(cpu) wbinvd()
+static inline int wbinvd_on_all_cpus(void)
+{
+ wbinvd();
+ return 0;
+}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..3089f70c0c52 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -58,7 +58,7 @@
#if (NR_CPUS < 256)
#define TICKET_SHIFT 8
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
short inc = 0x0100;
@@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
: "memory", "cc");
}
-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
{
int tmp, new;
@@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
return tmp;
}
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
: "+m" (lock->slock)
@@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
#else
#define TICKET_SHIFT 16
-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
@@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
: "memory", "cc");
}
-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
{
int tmp;
int new;
@@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
return tmp;
}
-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
: "+m" (lock->slock)
@@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
}
#endif
-static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
}
-static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
@@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
#ifndef CONFIG_PARAVIRT_SPINLOCKS
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
return __ticket_spin_is_locked(lock);
}
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
return __ticket_spin_is_contended(lock);
}
-#define __raw_spin_is_contended __raw_spin_is_contended
+#define arch_spin_is_contended arch_spin_is_contended
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
__ticket_spin_lock(lock);
}
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
{
return __ticket_spin_trylock(lock);
}
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
{
__ticket_spin_unlock(lock);
}
-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
unsigned long flags)
{
- __raw_spin_lock(lock);
+ arch_spin_lock(lock);
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
- while (__raw_spin_is_locked(lock))
+ while (arch_spin_is_locked(lock))
cpu_relax();
}
@@ -232,7 +232,7 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
* read_can_lock - would read_trylock() succeed?
* @lock: the rwlock in question.
*/
-static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+static inline int arch_read_can_lock(arch_rwlock_t *lock)
{
return (int)(lock)->lock > 0;
}
@@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock)
* write_can_lock - would write_trylock() succeed?
* @lock: the rwlock in question.
*/
-static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+static inline int arch_write_can_lock(arch_rwlock_t *lock)
{
return (lock)->lock == RW_LOCK_BIAS;
}
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void arch_read_lock(arch_rwlock_t *rw)
{
asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
"jns 1f\n"
@@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
::LOCK_PTR_REG (rw) : "memory");
}
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void arch_write_lock(arch_rwlock_t *rw)
{
asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
"jz 1f\n"
@@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
}
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
+static inline int arch_read_trylock(arch_rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
@@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
return 0;
}
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
+static inline int arch_write_trylock(arch_rwlock_t *lock)
{
atomic_t *count = (atomic_t *)lock;
@@ -284,23 +284,23 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
return 0;
}
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void arch_read_unlock(arch_rwlock_t *rw)
{
asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
}
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void arch_write_unlock(arch_rwlock_t *rw)
{
asm volatile(LOCK_PREFIX "addl %1, %0"
: "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
}
-#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
-#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
+#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
-#define _raw_spin_relax(lock) cpu_relax()
-#define _raw_read_relax(lock) cpu_relax()
-#define _raw_write_relax(lock) cpu_relax()
+#define arch_spin_relax(lock) cpu_relax()
+#define arch_read_relax(lock) cpu_relax()
+#define arch_write_relax(lock) cpu_relax()
/* The {read|write|spin}_lock() on x86 are full memory barriers. */
static inline void smp_mb__after_lock(void) { }
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c87091..dcb48b2edc11 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -5,16 +5,16 @@
# error "please don't include this file directly"
#endif
-typedef struct raw_spinlock {
+typedef struct arch_spinlock {
unsigned int slock;
-} raw_spinlock_t;
+} arch_spinlock_t;
-#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
+#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
typedef struct {
unsigned int lock;
-} raw_rwlock_t;
+} arch_rwlock_t;
-#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e73815..4dab78edbad9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,7 +3,28 @@
extern int kstack_depth_to_print;
-int x86_is_stack_id(int id, char *name);
+struct thread_info;
+struct stacktrace_ops;
+
+typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo,
+ unsigned long *stack,
+ unsigned long bp,
+ const struct stacktrace_ops *ops,
+ void *data,
+ unsigned long *end,
+ int *graph);
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph);
+
+extern unsigned long
+print_context_stack_bp(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph);
/* Generic stack tracer with callbacks */
@@ -14,6 +35,7 @@ struct stacktrace_ops {
void (*address)(void *data, unsigned long address, int reliable);
/* On negative return stop dumping */
int (*stack)(void *data, char *name);
+ walk_stack_t walk_stack;
};
void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index ae907e617181..3d3e8353ee5c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
*/
#ifndef CONFIG_KMEMCHECK
+
+#if (__GNUC__ >= 4)
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
#define memcpy(t, f, n) \
(__builtin_constant_p((n)) \
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
+#endif
#else
/*
* kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
: __memset_generic((s), (c), (count)))
#define __HAVE_ARCH_MEMSET
+#if (__GNUC__ >= 4)
+#define memset(s, c, count) __builtin_memset(s, c, count)
+#else
#define memset(s, c, count) \
(__builtin_constant_p(c) \
? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
(count)) \
: __memset((s), (c), (count)))
+#endif
/*
* find the first occurrence of byte 'c', or 1 past the area if none
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..38638cd2fa4c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[44];
+ u8 reserved_1[42];
+ u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
@@ -312,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_ERR -1
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174fb..8085277e1b8b 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
#include <linux/swiotlb.h>
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
-extern void pci_swiotlb_init(void);
+extern int __init pci_swiotlb_detect(void);
+extern void __init pci_swiotlb_init(void);
#else
#define swiotlb 0
+static inline int pci_swiotlb_detect(void)
+{
+ return 0;
+}
static inline void pci_swiotlb_init(void)
{
}
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..3ad421784ae7 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -26,11 +26,10 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
asmlinkage long sys32_fstatat(unsigned int, char __user *,
struct stat64 __user *, int);
-struct mmap_arg_struct;
-asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
+struct mmap_arg_struct32;
+asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
-asmlinkage long sys32_pipe(int __user *);
struct sigaction32;
struct old_sigaction32;
asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
@@ -41,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
compat_sigset_t __user *, unsigned int);
asmlinkage long sys32_alarm(unsigned int);
-struct sel_arg_struct;
-asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
asmlinkage long sys32_sysfs(int, u32, u32);
@@ -51,25 +48,12 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
-#ifdef CONFIG_SYSCTL_SYSCALL
-struct sysctl_ia32;
-asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
-#endif
-
asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
asmlinkage long sys32_personality(unsigned long);
asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
-asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long, unsigned long);
-
-struct oldold_utsname;
-struct old_utsname;
-asmlinkage long sys32_olduname(struct oldold_utsname __user *);
-long sys32_uname(struct old_utsname __user *);
-
asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
compat_uptr_t __user *, struct pt_regs *);
asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 8d33bc5462d1..c4a348f7bd43 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,8 @@
#include <linux/sched.h>
#include <linux/err.h>
+extern const unsigned long sys_call_table[];
+
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
* This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..5c044b43e9a7 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -18,16 +18,24 @@
/* Common in X86_32 and X86_64 */
/* kernel/ioport.c */
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+long sys_iopl(unsigned int, struct pt_regs *);
/* kernel/process.c */
int sys_fork(struct pt_regs *);
int sys_vfork(struct pt_regs *);
+long sys_execve(char __user *, char __user * __user *,
+ char __user * __user *, struct pt_regs *);
+long sys_clone(unsigned long, unsigned long, void __user *,
+ void __user *, struct pt_regs *);
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
/* kernel/signal.c */
long sys_rt_sigreturn(struct pt_regs *);
+long sys_sigaltstack(const stack_t __user *, stack_t __user *,
+ struct pt_regs *);
+
/* kernel/tls.c */
asmlinkage int sys_set_thread_area(struct user_desc __user *);
@@ -35,63 +43,26 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
/* X86_32 only */
#ifdef CONFIG_X86_32
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
-
-/* kernel/process_32.c */
-int sys_clone(struct pt_regs *);
-int sys_execve(struct pt_regs *);
/* kernel/signal.c */
asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
struct old_sigaction __user *);
-int sys_sigaltstack(struct pt_regs *);
unsigned long sys_sigreturn(struct pt_regs *);
-/* kernel/sys_i386_32.c */
-struct mmap_arg_struct;
-struct sel_arg_struct;
-struct oldold_utsname;
-struct old_utsname;
-
-asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long, unsigned long);
-asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-asmlinkage int old_select(struct sel_arg_struct __user *);
-asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-asmlinkage int sys_uname(struct old_utsname __user *);
-asmlinkage int sys_olduname(struct oldold_utsname __user *);
-
/* kernel/vm86_32.c */
-int sys_vm86old(struct pt_regs *);
-int sys_vm86(struct pt_regs *);
+int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
+int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
#else /* CONFIG_X86_32 */
/* X86_64 only */
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
/* kernel/process_64.c */
-asmlinkage long sys_clone(unsigned long, unsigned long,
- void __user *, void __user *,
- struct pt_regs *);
-asmlinkage long sys_execve(char __user *, char __user * __user *,
- char __user * __user *,
- struct pt_regs *);
long sys_arch_prctl(int, unsigned long);
-/* kernel/signal.c */
-asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
- struct pt_regs *);
-
/* kernel/sys_x86_64.c */
-struct new_utsname;
-
asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-asmlinkage long sys_uname(struct new_utsname __user *);
#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index f08f97374892..b8fe48ee2ed9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -11,9 +11,9 @@
#include <linux/irqflags.h>
/* entries in ARCH_DLINFO: */
-#ifdef CONFIG_IA32_EMULATION
+#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
# define AT_VECTOR_SIZE_ARCH 2
-#else
+#else /* else it's non-compat x86-64 */
# define AT_VECTOR_SIZE_ARCH 1
#endif
@@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
+extern void show_regs_common(void);
#ifdef CONFIG_X86_32
@@ -31,7 +32,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
"movl %P[task_canary](%[next]), %%ebx\n\t" \
"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
#define __switch_canary_oparam \
- , [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
+ , [stack_canary] "=m" (stack_canary.canary)
#define __switch_canary_iparam \
, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else /* CC_STACKPROTECTOR */
@@ -113,7 +114,7 @@ do { \
"movq %P[task_canary](%%rsi),%%r8\n\t" \
"movq %%r8,"__percpu_arg([gs_canary])"\n\t"
#define __switch_canary_oparam \
- , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
+ , [gs_canary] "=m" (irq_stack_union.stack_canary)
#define __switch_canary_iparam \
, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
#else /* CC_STACKPROTECTOR */
@@ -128,13 +129,11 @@ do { \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
"movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
"call __switch_to\n\t" \
- ".globl thread_return\n" \
- "thread_return:\n\t" \
"movq "__percpu_arg([current_task])",%%rsi\n\t" \
__switch_canary \
"movq %P[thread_info](%%rsi),%%r8\n\t" \
"movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
"jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
@@ -144,7 +143,7 @@ do { \
[ti_flags] "i" (offsetof(struct thread_info, flags)), \
[_tif_fork] "i" (_TIF_FORK), \
[thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (per_cpu_var(current_task)) \
+ [current_task] "m" (current_task) \
__switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
#endif
@@ -157,19 +156,22 @@ extern void native_load_gs_index(unsigned);
* Load a segment. Fall back on loading the zero
* segment if something goes wrong..
*/
-#define loadsegment(seg, value) \
- asm volatile("\n" \
- "1:\t" \
- "movl %k0,%%" #seg "\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3:\t" \
- "movl %k1, %%" #seg "\n\t" \
- "jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b,3b) \
- : :"r" (value), "r" (0) : "memory")
-
+#define loadsegment(seg, value) \
+do { \
+ unsigned short __val = (value); \
+ \
+ asm volatile(" \n" \
+ "1: movl %k0,%%" #seg " \n" \
+ \
+ ".section .fixup,\"ax\" \n" \
+ "2: xorl %k0,%k0 \n" \
+ " jmp 1b \n" \
+ ".previous \n" \
+ \
+ _ASM_EXTABLE(1b, 2b) \
+ \
+ : "+r" (__val) : : "memory"); \
+} while (0)
/*
* Save a segment register away
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..e0d28901e969 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,10 +83,10 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
-#define TIF_ABI_PENDING 19
#define TIF_MEMDIE 20
#define TIF_DEBUG 21 /* uses debug registers */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -107,10 +107,10 @@ struct thread_info {
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
-#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
#define _TIF_DEBUG (1 << TIF_DEBUG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_FREEZE (1 << TIF_FREEZE)
@@ -142,13 +142,14 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
+ _TIF_USER_RETURN_NOTIFY)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 25a92842dd99..c5087d796587 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -35,11 +35,16 @@
# endif
#endif
-/* Node not present */
-#define NUMA_NO_NODE (-1)
+/*
+ * to preserve the visibility of NUMA_NO_NODE definition,
+ * moved to there from here. May be used independent of
+ * CONFIG_NUMA.
+ */
+#include <linux/numa.h>
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
+
#include <asm/mpspec.h>
#ifdef CONFIG_X86_32
@@ -143,6 +148,7 @@ extern unsigned long node_remap_size[];
| 1*SD_BALANCE_FORK \
| 0*SD_BALANCE_WAKE \
| 1*SD_WAKE_AFFINE \
+ | 0*SD_PREFER_LOCAL \
| 0*SD_SHARE_CPUPOWER \
| 0*SD_POWERSAVINGS_BALANCE \
| 0*SD_SHARE_PKG_RESOURCES \
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 90f06c25221d..cb507bb05d79 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -16,7 +16,6 @@ extern unsigned long initial_code;
extern unsigned long initial_gs;
#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
-#define TRAMPOLINE_BASE 0x6000
extern unsigned long setup_trampoline(void);
extern void __init reserve_trampoline_memory(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..abd3e0ea762a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
#ifdef CONFIG_X86_32
# include "uaccess_32.h"
#else
-# define ARCH_HAS_SEARCH_EXTABLE
# include "uaccess_64.h"
#endif
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..088d09fb1615 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,33 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
unsigned long __must_check copy_to_user(void __user *to,
const void *from, unsigned long n);
-unsigned long __must_check copy_from_user(void *to,
+unsigned long __must_check _copy_from_user(void *to,
const void __user *from,
unsigned long n);
+
+
+extern void copy_from_user_overflow(void)
+#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
+ __compiletime_error("copy_from_user() buffer size is not provably correct")
+#else
+ __compiletime_warning("copy_from_user() buffer size is not provably correct")
+#endif
+;
+
+static inline unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n)
+{
+ int sz = __compiletime_object_size(to);
+
+ if (likely(sz == -1 || sz >= n))
+ n = _copy_from_user(to, from, n);
+ else
+ copy_from_user_overflow();
+
+ return n;
+}
+
long __must_check strncpy_from_user(char *dst, const char __user *src,
long count);
long __must_check __strncpy_from_user(char *dst,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..316708d5af92 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,6 +8,8 @@
#include <linux/errno.h>
#include <linux/prefetch.h>
#include <linux/lockdep.h>
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
#include <asm/page.h>
/*
@@ -16,15 +18,56 @@
/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len);
+copy_user_generic_string(void *to, const void *from, unsigned len);
+__must_check unsigned long
+copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+
+static __always_inline __must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned len)
+{
+ unsigned ret;
+
+ alternative_call(copy_user_generic_unrolled,
+ copy_user_generic_string,
+ X86_FEATURE_REP_GOOD,
+ ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
+ "=d" (len)),
+ "1" (to), "2" (from), "3" (len)
+ : "memory", "rcx", "r8", "r9", "r10", "r11");
+ return ret;
+}
__must_check unsigned long
-copy_to_user(void __user *to, const void *from, unsigned len);
+_copy_to_user(void __user *to, const void *from, unsigned len);
__must_check unsigned long
-copy_from_user(void *to, const void __user *from, unsigned len);
+_copy_from_user(void *to, const void __user *from, unsigned len);
__must_check unsigned long
copy_in_user(void __user *to, const void __user *from, unsigned len);
+static inline unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n)
+{
+ int sz = __compiletime_object_size(to);
+
+ might_fault();
+ if (likely(sz == -1 || sz >= n))
+ n = _copy_from_user(to, from, n);
+#ifdef CONFIG_DEBUG_VM
+ else
+ WARN(1, "Buffer overflow detected!\n");
+#endif
+ return n;
+}
+
+static __always_inline __must_check
+int copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+ might_fault();
+
+ return _copy_to_user(dst, src, size);
+}
+
static __always_inline __must_check
int __copy_from_user(void *dst, const void __user *src, unsigned size)
{
@@ -176,8 +219,11 @@ __must_check long strlen_user(const char __user *str);
__must_check unsigned long clear_user(void __user *mem, unsigned long len);
__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
-__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
- unsigned size);
+static __must_check __always_inline int
+__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
+{
+ return copy_user_generic(dst, (__force const void *)src, size);
+}
static __must_check __always_inline int
__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..beb9b5f8f8a4 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
#define __NR_pwritev 334
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
+#define __NR_recvmmsg 337
#ifdef __KERNEL__
-#define NR_syscalls 337
+#define NR_syscalls 338
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
@@ -353,6 +354,7 @@
#define __ARCH_WANT_STAT64
#define __ARCH_WANT_SYS_ALARM
#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_IPC
#define __ARCH_WANT_SYS_PAUSE
#define __ARCH_WANT_SYS_SGETMASK
#define __ARCH_WANT_SYS_SIGNAL
@@ -365,6 +367,9 @@
#define __ARCH_WANT_SYS_LLSEEK
#define __ARCH_WANT_SYS_NICE
#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLD_UNAME
+#define __ARCH_WANT_SYS_OLD_MMAP
+#define __ARCH_WANT_SYS_OLD_SELECT
#define __ARCH_WANT_SYS_OLDUMOUNT
#define __ARCH_WANT_SYS_SIGPENDING
#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..ff4307b0e81e 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4)
#define __NR_kill 62
__SYSCALL(__NR_kill, sys_kill)
#define __NR_uname 63
-__SYSCALL(__NR_uname, sys_uname)
+__SYSCALL(__NR_uname, sys_newuname)
#define __NR_semget 64
__SYSCALL(__NR_semget, sys_semget)
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
#define __NR_perf_event_open 298
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_recvmmsg 299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
@@ -678,6 +680,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __ARCH_WANT_SYS_LLSEEK
#define __ARCH_WANT_SYS_NICE
#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLD_UNAME
#define __ARCH_WANT_SYS_OLDUMOUNT
#define __ARCH_WANT_SYS_SIGPENDING
#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..24532c7da3d6 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,63 @@
+#ifndef _ASM_X86_USER_H
+#define _ASM_X86_USER_H
+
#ifdef CONFIG_X86_32
# include "user_32.h"
#else
# include "user_64.h"
#endif
+
+#include <asm/types.h>
+
+struct user_ymmh_regs {
+ /* 16 * 16 bytes for each YMMH-reg */
+ __u32 ymmh_space[64];
+};
+
+struct user_xsave_hdr {
+ __u64 xstate_bv;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+/*
+ * The structure layout of user_xstateregs, used for exporting the
+ * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
+ * interfaces will be same as the memory layout of xsave used by the processor
+ * (except for the bytes 464..511, which can be used by the software) and hence
+ * the size of this structure varies depending on the features supported by the
+ * processor and OS. The size of the structure that users need to use can be
+ * obtained by doing:
+ * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
+ * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.)
+ * need to use.
+ *
+ * For now, only the first 8 bytes of the software usable bytes[464..471] will
+ * be used and will be set to OS enabled xstate mask (which is same as the
+ * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump
+ * remotely, etc.) can use this mask as well as the mask saved in the
+ * xstate_hdr bytes and interpret what states the processor/OS supports
+ * and what states are in modified/initialized conditions for the
+ * particular process/thread.
+ *
+ * Also when the user modifies certain state FP/SSE/etc through the
+ * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
+ * bytes[512..519] of the memory layout are updated correspondingly.
+ * i.e., for example when FP state is modified to a non-init state,
+ * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
+ */
+#define USER_XSTATE_FX_SW_WORDS 6
+#define USER_XSTATE_XCR0_WORD 0
+
+struct user_xstateregs {
+ struct {
+ __u64 fpx_space[58];
+ __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
+ } i387;
+ struct user_xsave_hdr xsave_hdr;
+ struct user_ymmh_regs ymmh;
+ /* further processor state extensions go here */
+};
+
+#endif /* _ASM_X86_USER_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 7ed17ff502b9..71605c7d5c5c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -18,8 +18,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
*/
#include <linux/rtc.h>
@@ -36,7 +36,8 @@ enum uv_bios_cmd {
UV_BIOS_WATCHLIST_ALLOC,
UV_BIOS_WATCHLIST_FREE,
UV_BIOS_MEMPROTECT,
- UV_BIOS_GET_PARTITION_ADDR
+ UV_BIOS_GET_PARTITION_ADDR,
+ UV_BIOS_SET_LEGACY_VGA_TARGET
};
/*
@@ -76,15 +77,6 @@ union partition_info_u {
};
};
-union uv_watchlist_u {
- u64 val;
- struct {
- u64 blade : 16,
- size : 32,
- filler : 16;
- };
-};
-
enum uv_memprotect {
UV_MEMPROT_RESTRICT_ACCESS,
UV_MEMPROT_ALLOW_AMO,
@@ -98,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
extern s64 uv_bios_freq_base(u64, u64 *);
-extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
+extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
unsigned long *);
extern int uv_bios_mq_watchlist_free(int, int);
extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
+extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
extern void uv_bios_init(void);
@@ -113,6 +106,7 @@ extern int uv_type;
extern long sn_partition_id;
extern long sn_coherency_id;
extern long sn_region_size;
+extern long system_serial_number;
#define partition_coherence_id() (sn_coherency_id)
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..3bb9491b7659 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
extern enum uv_system_type get_uv_system_type(void);
extern int is_uv_system(void);
extern void uv_cpu_init(void);
+extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
#define DESC_STATUS_SOURCE_TIMEOUT 3
/*
- * source side threshholds at which message retries print a warning
+ * source side thresholds at which message retries print a warning
*/
#define SOURCE_TIMEOUT_LIMIT 20
#define DESTINATION_TIMEOUT_LIMIT 20
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 04eb6c958b9d..14cc74ba5d23 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -19,6 +19,8 @@
#include <asm/types.h>
#include <asm/percpu.h>
#include <asm/uv/uv_mmrs.h>
+#include <asm/irq_vectors.h>
+#include <asm/io_apic.h>
/*
@@ -29,20 +31,20 @@
* contiguous (although various IO spaces may punch holes in
* it)..
*
- * N - Number of bits in the node portion of a socket physical
- * address.
+ * N - Number of bits in the node portion of a socket physical
+ * address.
*
- * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
- * routers always have low bit of 1, C/MBricks have low bit
- * equal to 0. Most addressing macros that target UV hub chips
- * right shift the NASID by 1 to exclude the always-zero bit.
- * NASIDs contain up to 15 bits.
+ * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * routers always have low bit of 1, C/MBricks have low bit
+ * equal to 0. Most addressing macros that target UV hub chips
+ * right shift the NASID by 1 to exclude the always-zero bit.
+ * NASIDs contain up to 15 bits.
*
* GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
* of nasids.
*
- * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
- * of the nasid for socket usage.
+ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
+ * of the nasid for socket usage.
*
*
* NumaLink Global Physical Address Format:
@@ -69,12 +71,12 @@
*
*
* APICID format
- * NOTE!!!!!! This is the current format of the APICID. However, code
- * should assume that this will change in the future. Use functions
- * in this file for all APICID bit manipulations and conversion.
+ * NOTE!!!!!! This is the current format of the APICID. However, code
+ * should assume that this will change in the future. Use functions
+ * in this file for all APICID bit manipulations and conversion.
*
- * 1111110000000000
- * 5432109876543210
+ * 1111110000000000
+ * 5432109876543210
* pppppppppplc0cch
* sssssssssss
*
@@ -87,9 +89,9 @@
* Note: Processor only supports 12 bits in the APICID register. The ACPI
* tables hold all 16 bits. Software needs to be aware of this.
*
- * Unless otherwise specified, all references to APICID refer to
- * the FULL value contained in ACPI tables, not the subset in the
- * processor APICID register.
+ * Unless otherwise specified, all references to APICID refer to
+ * the FULL value contained in ACPI tables, not the subset in the
+ * processor APICID register.
*/
@@ -114,7 +116,7 @@
/*
* The largest possible NASID of a C or M brick (+ 2)
*/
-#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
struct uv_scir_s {
struct timer_list timer;
@@ -149,16 +151,16 @@ struct uv_hub_info_s {
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
+#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
/*
* Local & Global MMR space macros.
- * Note: macros are intended to be used ONLY by inline functions
- * in this file - not by other kernel code.
- * n - NASID (full 15-bit global nasid)
- * g - GNODE (full 15-bit global nasid, right shifted 1)
- * p - PNODE (local part of nsids, right shifted 1)
+ * Note: macros are intended to be used ONLY by inline functions
+ * in this file - not by other kernel code.
+ * n - NASID (full 15-bit global nasid)
+ * g - GNODE (full 15-bit global nasid, right shifted 1)
+ * p - PNODE (local part of nsids, right shifted 1)
*/
#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
@@ -170,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
+
#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
@@ -211,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
/*
* Macros for converting between kernel virtual addresses, socket local physical
* addresses, and UV global physical addresses.
- * Note: use the standard __pa() & __va() macros for converting
- * between socket virtual and socket physical addresses.
+ * Note: use the standard __pa() & __va() macros for converting
+ * between socket virtual and socket physical addresses.
*/
/* socket phys RAM --> UV global physical address */
@@ -230,6 +234,40 @@ static inline unsigned long uv_gpa(void *v)
return uv_soc_phys_ram_to_gpa(__pa(v));
}
+/* Top two bits indicate the requested address is in MMR space. */
+static inline int
+uv_gpa_in_mmr_space(unsigned long gpa)
+{
+ return (gpa >> 62) == 0x3UL;
+}
+
+/* UV global physical address --> socket phys RAM */
+static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
+{
+ unsigned long paddr = gpa & uv_hub_info->gpa_mask;
+ unsigned long remap_base = uv_hub_info->lowmem_remap_base;
+ unsigned long remap_top = uv_hub_info->lowmem_remap_top;
+
+ if (paddr >= remap_base && paddr < remap_base + remap_top)
+ paddr -= remap_base;
+ return paddr;
+}
+
+
+/* gnode -> pnode */
+static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
+{
+ return gpa >> uv_hub_info->m_val;
+}
+
+/* gpa -> pnode */
+static inline int uv_gpa_to_pnode(unsigned long gpa)
+{
+ unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1;
+
+ return uv_gpa_to_gnode(gpa) & n_mask;
+}
+
/* pnode, offset --> socket virtual */
static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
{
@@ -249,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid)
* Access global MMRs using the low memory MMR32 space. This region supports
* faster MMR access but not all MMRs are accessible in this space.
*/
-static inline unsigned long *uv_global_mmr32_address(int pnode,
- unsigned long offset)
+static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
{
return __va(UV_GLOBAL_MMR32_BASE |
UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
}
-static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
- unsigned long val)
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
{
writeq(val, uv_global_mmr32_address(pnode, offset));
}
-static inline unsigned long uv_read_global_mmr32(int pnode,
- unsigned long offset)
+static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
{
return readq(uv_global_mmr32_address(pnode, offset));
}
@@ -272,26 +307,43 @@ static inline unsigned long uv_read_global_mmr32(int pnode,
* Access Global MMR space using the MMR space located at the top of physical
* memory.
*/
-static inline unsigned long *uv_global_mmr64_address(int pnode,
- unsigned long offset)
+static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
{
return __va(UV_GLOBAL_MMR64_BASE |
UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
}
-static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
- unsigned long val)
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
{
writeq(val, uv_global_mmr64_address(pnode, offset));
}
-static inline unsigned long uv_read_global_mmr64(int pnode,
- unsigned long offset)
+static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
{
return readq(uv_global_mmr64_address(pnode, offset));
}
/*
+ * Global MMR space addresses when referenced by the GRU. (GRU does
+ * NOT use socket addressing).
+ */
+static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
+{
+ return UV_GLOBAL_GRU_MMR_BASE | offset |
+ ((unsigned long)pnode << uv_hub_info->m_val);
+}
+
+static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
+{
+ writeb(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
+{
+ return readb(uv_global_mmr64_address(pnode, offset));
+}
+
+/*
* Access hub local MMRs. Faster than using global space but only local MMRs
* are accessible.
*/
@@ -410,23 +462,51 @@ static inline void uv_set_scir_bits(unsigned char value)
}
}
+static inline unsigned long uv_scir_offset(int apicid)
+{
+ return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
+}
+
static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
{
if (uv_cpu_hub_info(cpu)->scir.state != value) {
+ uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
+ uv_cpu_hub_info(cpu)->scir.offset, value);
uv_cpu_hub_info(cpu)->scir.state = value;
- uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
}
}
+static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
+{
+ return (1UL << UVH_IPI_INT_SEND_SHFT) |
+ ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
+ (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+}
+
static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
{
unsigned long val;
+ unsigned long dmode = dest_Fixed;
- val = (1UL << UVH_IPI_INT_SEND_SHFT) |
- ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
- (vector << UVH_IPI_INT_VECTOR_SHFT);
+ if (vector == NMI_VECTOR)
+ dmode = dest_NMI;
+
+ val = uv_hub_ipi_value(apicid, vector, dmode);
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
+/*
+ * Get the minimum revision number of the hub chips within the partition.
+ * 1 - initial rev 1.0 silicon
+ * 2 - rev 2.0 production silicon
+ */
+static inline int uv_get_min_hub_revision_id(void)
+{
+ extern int uv_min_hub_revision_id;
+
+ return uv_min_hub_revision_id;
+}
+
#endif /* CONFIG_X86_64 */
#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..d6b17c760622 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
dest : 32;
};
-extern struct irq_chip uv_irq_chip;
-
-extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
-extern void arch_disable_uv_irq(int, unsigned long);
+enum {
+ UV_AFFINITY_ALL,
+ UV_AFFINITY_NODE,
+ UV_AFFINITY_CPU
+};
-extern int uv_setup_irq(char *, int, int, unsigned long);
-extern void uv_teardown_irq(unsigned int, int, unsigned long);
+extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
+extern int uv_setup_irq(char *, int, int, unsigned long, int);
+extern void uv_teardown_irq(unsigned int);
#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
index 166adf61e770..2edb37637ead 100644
--- a/arch/x86/include/asm/visws/cobalt.h
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -122,4 +122,6 @@ extern char visws_board_type;
extern char visws_board_rev;
+extern int pci_visws_init(void);
+
#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..fb9a080740ec 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,9 +53,11 @@
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_RDTSCP 0x00000008
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +146,8 @@ enum vmcs_field {
VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
TPR_THRESHOLD = 0x0000401c,
SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
VM_INSTRUCTION_ERROR = 0x00004400,
VM_EXIT_REASON = 0x00004402,
VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +252,8 @@ enum vmcs_field {
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MCE_DURING_VMENTRY 41
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
@@ -358,6 +364,7 @@ enum vmcs_field {
#define VMX_EPTP_UC_BIT (1ull << 8)
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -370,7 +377,7 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
-#define VMX_EPT_IGMT_BIT (1ull << 6)
+#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd4ab0e..519b54327d75 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
void (*smp_read_mpc_oem)(struct mpc_table *mpc);
void (*mpc_oem_pci_bus)(struct mpc_bus *m);
void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
- void (*find_smp_config)(unsigned int reserve);
+ void (*find_smp_config)(void);
void (*get_smp_config)(unsigned int early);
};
@@ -91,6 +91,28 @@ struct x86_init_timers {
};
/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
+ * struct x86_init_pci - platform specific pci init functions
+ * @arch_init: platform specific pci arch init call
+ * @init: platform specific pci subsystem init
+ * @init_irq: platform specific pci irq init
+ * @fixup_irqs: platform specific pci irq fixup
+ */
+struct x86_init_pci {
+ int (*arch_init)(void);
+ int (*init)(void);
+ void (*init_irq)(void);
+ void (*fixup_irqs)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -101,6 +123,8 @@ struct x86_init_ops {
struct x86_init_oem oem;
struct x86_init_paging paging;
struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
+ struct x86_init_pci pci;
};
/**
@@ -116,11 +140,16 @@ struct x86_cpuinit_ops {
* @calibrate_tsc: calibrate TSC
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
+ * @is_untracked_pat_range exclude from PAT logic
+ * @nmi_init enable NMI on cpus
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long nowtime);
+ void (*iommu_shutdown)(void);
+ bool (*is_untracked_pat_range)(u64 start, u64 end);
+ void (*nmi_init)(void);
};
extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..396ff4cc8ed4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
-enum xen_domain_type {
- XEN_NATIVE, /* running on bare hardware */
- XEN_PV_DOMAIN, /* running in a PV domain */
- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
-};
-
-#ifdef CONFIG_XEN
-extern enum xen_domain_type xen_domain_type;
-#else
-#define xen_domain_type XEN_NATIVE
-#endif
-
-#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain() && \
- xen_domain_type == XEN_PV_DOMAIN)
-#define xen_hvm_domain() (xen_domain() && \
- xen_domain_type == XEN_HVM_DOMAIN)
-
-#ifdef CONFIG_XEN_DOM0
-#include <xen/interface/xen.h>
-
-#define xen_initial_domain() (xen_pv_domain() && \
- xen_start_info->flags & SIF_INITDOMAIN)
-#else /* !CONFIG_XEN_DOM0 */
-#define xen_initial_domain() (0)
-#endif /* CONFIG_XEN_DOM0 */
-
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 727acc152344..ddc04ccad03b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,9 +27,11 @@
extern unsigned int xstate_size;
extern u64 pcntxt_mask;
extern struct xsave_struct *init_xstate_buf;
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void xsave_cntxt_init(void);
extern void xsave_init(void);
+extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
void __user *fpstate,
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..4c58352209e0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o pci-nommu.o
+obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -87,9 +87,9 @@ obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
-obj-y += cstate.o processor.o
+obj-y += cstate.o
endif
$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..0061ea263061 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <asm/pci_x86.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -49,6 +50,7 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
+# include <asm/numa_64.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +448,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
{
*irq = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+ setup_IO_APIC_irq_extra(gsi);
+#endif
+
return 0;
}
@@ -473,7 +481,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
- acpi_gsi_to_irq(plat_gsi, &irq);
+ irq = plat_gsi;
+
return irq;
}
@@ -481,6 +490,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
* ACPI based hotplug support for CPU
*/
#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
+
+ nid = acpi_get_node(handle);
+ if (nid == -1 || !node_online(nid))
+ return;
+#ifdef CONFIG_X86_64
+ apicid_to_node[physid] = nid;
+ numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+ apicid_2_node[physid] = nid;
+ cpu_to_node_map[cpu] = nid;
+#endif
+
+#endif
+}
static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
@@ -539,7 +568,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
goto free_new_map;
}
+ acpi_processor_set_pdc(handle);
+
cpu = cpumask_first(new_map);
+ acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
retval = 0;
@@ -624,6 +656,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
}
hpet_address = hpet_tbl->address.address;
+ hpet_blockid = hpet_tbl->sequence;
/*
* Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1155,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
if (!acpi_sci_override_gsi)
acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
- /* Fill in identity legacy mapings where no override */
+ /* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
count =
@@ -1184,9 +1217,6 @@ static void __init acpi_process_madt(void)
if (!error) {
acpi_lapic = 1;
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
/*
* Parse MADT IO-APIC entries
*/
@@ -1196,8 +1226,6 @@ static void __init acpi_process_madt(void)
acpi_ioapic = 1;
smp_found_config = 1;
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
}
}
if (error == -EINVAL) {
@@ -1268,23 +1296,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Limit ACPI to CPU enumeration for HT
- */
-static int __init force_acpi_ht(const struct dmi_system_id *d)
-{
- if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
- d->ident);
- disable_acpi();
- acpi_ht = 1;
- } else {
- printk(KERN_NOTICE
- "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
- }
- return 0;
-}
-
-/*
* Force ignoring BIOS IRQ0 pin2 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1320,90 +1331,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
},
/*
- * Boxes that need acpi=ht
- */
- {
- .callback = force_acpi_ht,
- .ident = "FSC Primergy T850",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "HP VISUALIZE NT Workstation",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "Compaq Workstation W8000",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS P2B-DS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ASUS CUR-DLS",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "ABIT i440BX-W83977",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
- DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM Bladecenter",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eServer xSeries 360",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 330",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
- },
- },
- {
- .callback = force_acpi_ht,
- .ident = "IBM eserver xSeries 440",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
- },
- },
-
- /*
* Boxes that need ACPI PCI IRQ routing disabled
*/
{
@@ -1528,16 +1455,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* if acpi_blacklisted() acpi_disabled = 1;
* acpi_irq_model=...
* ...
- *
- * return value: (currently ignored)
- * 0: success
- * !0: failure
*/
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
{
- int error;
-
dmi_check_system(acpi_dmi_table);
/*
@@ -1545,15 +1466,14 @@ int __init acpi_boot_table_init(void)
* One exception: acpi=ht continues far enough to enumerate LAPICs
*/
if (acpi_disabled && !acpi_ht)
- return 1;
+ return;
/*
* Initialize the ACPI boot-time table parser.
*/
- error = acpi_table_init();
- if (error) {
+ if (acpi_table_init()) {
disable_acpi();
- return error;
+ return;
}
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1561,18 +1481,15 @@ int __init acpi_boot_table_init(void)
/*
* blacklist may disable ACPI entirely
*/
- error = acpi_blacklisted();
- if (error) {
+ if (acpi_blacklisted()) {
if (acpi_force) {
printk(KERN_WARNING PREFIX "acpi=force override\n");
} else {
printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
disable_acpi();
- return error;
+ return;
}
}
-
- return 0;
}
int __init early_acpi_boot_init(void)
@@ -1618,6 +1535,9 @@ int __init acpi_boot_init(void)
acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
return 0;
}
@@ -1642,8 +1562,10 @@ static int __init parse_acpi(char *arg)
}
/* Limit ACPI just to boot-time to enable HT */
else if (strcmp(arg, "ht") == 0) {
- if (!acpi_force)
+ if (!acpi_force) {
+ printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
disable_acpi();
+ }
acpi_ht = 1;
}
/* acpi=rsdt use RSDT instead of XSDT */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
* P4, Core and beyond CPUs
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
+ (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
flags->bm_control = 0;
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2005 Intel Corporation
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * - Added _PDC for platforms with Intel CPUs
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/acpi.h>
-
-#include <acpi/processor.h>
-#include <asm/acpi.h>
-
-static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
-{
- struct acpi_object_list *obj_list;
- union acpi_object *obj;
- u32 *buf;
-
- /* allocate and initialize pdc. It will be used later. */
- obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
- if (!obj_list) {
- printk(KERN_ERR "Memory allocation error\n");
- return;
- }
-
- obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
- if (!obj) {
- printk(KERN_ERR "Memory allocation error\n");
- kfree(obj_list);
- return;
- }
-
- buf = kmalloc(12, GFP_KERNEL);
- if (!buf) {
- printk(KERN_ERR "Memory allocation error\n");
- kfree(obj);
- kfree(obj_list);
- return;
- }
-
- buf[0] = ACPI_PDC_REVISION_ID;
- buf[1] = 1;
- buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
-
- /*
- * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
- * that OSPM is capable of native ACPI throttling software
- * coordination using BIOS supplied _TSD info.
- */
- buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
- if (cpu_has(c, X86_FEATURE_EST))
- buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
-
- if (cpu_has(c, X86_FEATURE_ACPI))
- buf[2] |= ACPI_PDC_T_FFH;
-
- /*
- * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
- */
- if (!cpu_has(c, X86_FEATURE_MWAIT))
- buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
-
- obj->type = ACPI_TYPE_BUFFER;
- obj->buffer.length = 12;
- obj->buffer.pointer = (u8 *) buf;
- obj_list->count = 1;
- obj_list->pointer = obj;
- pr->pdc = obj_list;
-
- return;
-}
-
-
-/* Initialize _PDC data based on the CPU vendor */
-void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
-{
- struct cpuinfo_x86 *c = &cpu_data(pr->id);
-
- pr->pdc = NULL;
- if (c->x86_vendor == X86_VENDOR_INTEL)
- init_intel_pdc(pr, c);
-
- return;
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
-
-void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
-{
- if (pr->pdc) {
- kfree(pr->pdc->pointer->buffer.pointer);
- kfree(pr->pdc->pointer);
- kfree(pr->pdc);
- pr->pdc = NULL;
- }
-}
-
-EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..060fff8f5c5b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
*(.note*)
}
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
- header->pmode_efer_low = nx_enabled;
- if (header->pmode_efer_low & 1) {
- /* This is strange, why not save efer, always? */
- rdmsr(MSR_EFER, header->pmode_efer_low,
- header->pmode_efer_high);
- }
+ if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
+ &header->pmode_efer_high))
+ header->pmode_efer_low = header->pmode_efer_high = 0;
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
*
* We allocate a page from the first 1MB of memory for the wakeup
* routine for when we come back from a sleep state. The
* runtime allocator allows specification of <16MB pages, but not
* <1MB pages.
*/
-void __init acpi_reserve_bootmem(void)
+void __init acpi_reserve_wakeup_memory(void)
{
+ unsigned long mem;
+
if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
"ACPI: Wakeup code way too big, S3 disabled.\n");
return;
}
- acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+ mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
- if (!acpi_realmode) {
+ if (mem == -1L) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
return;
}
-
- acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
+ acpi_realmode = (unsigned long) phys_to_virt(mem);
+ acpi_wakeup_address = mem;
+ reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
}
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
#endif
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
+ if (strncmp(str, "sci_force_enable", 16) == 0)
+ acpi_set_sci_en_on_resume();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c0ce9c..3a4bf35c179b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
+#include <linux/stop_machine.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -205,7 +206,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
- char insnbuf[MAX_PATCH_LEN];
+ u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
for (a = start; a < end; a++) {
@@ -223,6 +224,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
}
#endif
memcpy(insnbuf, a->replacement, a->replacementlen);
+ if (*insnbuf == 0xe8 && a->replacementlen == 5)
+ *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
text_poke_early(instr, insnbuf, a->instrlen);
@@ -390,6 +393,24 @@ void alternatives_smp_switch(int smp)
mutex_unlock(&smp_alt);
}
+/* Return 1 if the address range is reserved for smp-alternatives */
+int alternatives_text_reserved(void *start, void *end)
+{
+ struct smp_alt_module *mod;
+ u8 **ptr;
+ u8 *text_start = start;
+ u8 *text_end = end;
+
+ list_for_each_entry(mod, &smp_alt_modules, next) {
+ if (mod->text > text_end || mod->text_end < text_start)
+ continue;
+ for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
+ if (text_start <= *ptr && text_end >= *ptr)
+ return 1;
+ }
+
+ return 0;
+}
#endif
#ifdef CONFIG_PARAVIRT
@@ -552,3 +573,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
local_irq_restore(flags);
return addr;
}
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+ struct text_poke_params *tpp = data;
+
+ if (atomic_dec_and_test(&stop_machine_first)) {
+ text_poke(tpp->addr, tpp->opcode, tpp->len);
+ smp_wmb(); /* Make sure other cpus see that this has run */
+ wrote_text = 1;
+ } else {
+ while (!wrote_text)
+ cpu_relax();
+ smp_mb(); /* Load wrote_text before following execution */
+ }
+
+ flush_icache_range((unsigned long)tpp->addr,
+ (unsigned long)tpp->addr + tpp->len);
+ return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+ struct text_poke_params tpp;
+
+ tpp.addr = addr;
+ tpp.opcode = opcode;
+ tpp.len = len;
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ return addr;
+}
+
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..adb0ba025702 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -19,7 +19,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/amd_iommu_proto.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -56,20 +57,152 @@ struct iommu_cmd {
u32 data[4];
};
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e);
-static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address, int end_lvl,
- u64 **pte_page, gfp_t gfp);
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages);
static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size);
static void update_domain(struct protection_domain *domain);
+/****************************************************************************
+ *
+ * Helper functions
+ *
+ ****************************************************************************/
+
+static inline u16 get_device_id(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ return calc_devid(pdev->bus->number, pdev->devfn);
+}
+
+static struct iommu_dev_data *get_dev_data(struct device *dev)
+{
+ return dev->archdata.iommu;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+ struct dma_ops_domain *entry, *ret = NULL;
+ unsigned long flags;
+ u16 alias = amd_iommu_alias_table[devid];
+
+ if (list_empty(&iommu_pd_list))
+ return NULL;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+ list_for_each_entry(entry, &iommu_pd_list, list) {
+ if (entry->target_dev == devid ||
+ entry->target_dev == alias) {
+ ret = entry;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ return ret;
+}
+
+/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+ u16 devid;
+
+ if (!dev || !dev->dma_mask)
+ return false;
+
+ /* No device or no PCI device */
+ if (!dev || dev->bus != &pci_bus_type)
+ return false;
+
+ devid = get_device_id(dev);
+
+ /* Out of our scope? */
+ if (devid > amd_iommu_last_bdf)
+ return false;
+
+ if (amd_iommu_rlookup_table[devid] == NULL)
+ return false;
+
+ return true;
+}
+
+static int iommu_init_device(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+ struct pci_dev *pdev;
+ u16 devid, alias;
+
+ if (dev->archdata.iommu)
+ return 0;
+
+ dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
+ if (!dev_data)
+ return -ENOMEM;
+
+ dev_data->dev = dev;
+
+ devid = get_device_id(dev);
+ alias = amd_iommu_alias_table[devid];
+ pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
+ if (pdev)
+ dev_data->alias = &pdev->dev;
+
+ atomic_set(&dev_data->bind, 0);
+
+ dev->archdata.iommu = dev_data;
+
+
+ return 0;
+}
+
+static void iommu_uninit_device(struct device *dev)
+{
+ kfree(dev->archdata.iommu);
+}
+
+void __init amd_iommu_uninit_devices(void)
+{
+ struct pci_dev *pdev = NULL;
+
+ for_each_pci_dev(pdev) {
+
+ if (!check_device(&pdev->dev))
+ continue;
+
+ iommu_uninit_device(&pdev->dev);
+ }
+}
+
+int __init amd_iommu_init_devices(void)
+{
+ struct pci_dev *pdev = NULL;
+ int ret = 0;
+
+ for_each_pci_dev(pdev) {
+
+ if (!check_device(&pdev->dev))
+ continue;
+
+ ret = iommu_init_device(&pdev->dev);
+ if (ret)
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+
+ amd_iommu_uninit_devices();
+
+ return ret;
+}
#ifdef CONFIG_AMD_IOMMU_STATS
/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);
static struct dentry *stats_dir;
-static struct dentry *de_isolate;
static struct dentry *de_fflush;
static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
if (stats_dir == NULL)
return;
- de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
- (u32 *)&amd_iommu_isolate);
-
de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
(u32 *)&amd_iommu_unmap_flush);
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
#endif
-/* returns !0 if the IOMMU is caching non-present entries in its TLB */
-static int iommu_has_npcache(struct amd_iommu *iommu)
-{
- return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
-}
-
/****************************************************************************
*
* Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
break;
case EVENT_TYPE_ILL_CMD:
printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ iommu->reset_in_progress = true;
reset_iommu_command_buffer(iommu);
dump_command(address);
break;
@@ -321,11 +445,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
- if (unlikely(i == EXIT_LOOP_COUNT)) {
- spin_unlock(&iommu->lock);
- reset_iommu_command_buffer(iommu);
- spin_lock(&iommu->lock);
- }
+ if (unlikely(i == EXIT_LOOP_COUNT))
+ iommu->reset_in_progress = true;
}
/*
@@ -372,26 +493,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
out:
spin_unlock_irqrestore(&iommu->lock, flags);
+ if (iommu->reset_in_progress)
+ reset_iommu_command_buffer(iommu);
+
return 0;
}
+static void iommu_flush_complete(struct protection_domain *domain)
+{
+ int i;
+
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need to wait for completion of all commands.
+ */
+ iommu_completion_wait(amd_iommus[i]);
+ }
+}
+
/*
* Command send function for invalidating a device table entry
*/
-static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+static int iommu_flush_device(struct device *dev)
{
+ struct amd_iommu *iommu;
struct iommu_cmd cmd;
- int ret;
+ u16 devid;
- BUG_ON(iommu == NULL);
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ /* Build command */
memset(&cmd, 0, sizeof(cmd));
CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
cmd.data[0] = devid;
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
+ return iommu_queue_command(iommu, &cmd);
}
static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +571,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
* It invalidates a single PTE if the range to flush is within a single
* page. Otherwise it flushes the whole TLB of the IOMMU.
*/
-static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
- u64 address, size_t size)
+static void __iommu_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size, int pde)
{
- int s = 0;
- unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
+ int s = 0, i;
+ unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
address &= PAGE_MASK;
@@ -447,142 +588,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
s = 1;
}
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
- return 0;
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
+
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need a TLB flush
+ */
+ iommu_queue_inv_iommu_pages(amd_iommus[i], address,
+ domain->id, pde, s);
+ }
+
+ return;
}
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size)
{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
+ __iommu_flush_pages(domain, address, size, 0);
+}
- iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct protection_domain *domain)
+{
+ __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_tlb_pde(struct protection_domain *domain)
{
- u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-
- INC_STATS_COUNTER(domain_flush_single);
-
- iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
+ __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}
+
/*
- * This function flushes one domain on one IOMMU
+ * This function flushes the DTEs for all devices in domain
*/
-static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
+static void iommu_flush_domain_devices(struct protection_domain *domain)
{
- struct iommu_cmd cmd;
+ struct iommu_dev_data *dev_data;
unsigned long flags;
- __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
- domid, 1, 1);
+ spin_lock_irqsave(&domain->lock, flags);
- spin_lock_irqsave(&iommu->lock, flags);
- __iommu_queue_command(iommu, &cmd);
- __iommu_completion_wait(iommu);
- __iommu_wait_for_completion(iommu);
- spin_unlock_irqrestore(&iommu->lock, flags);
+ list_for_each_entry(dev_data, &domain->dev_list, list)
+ iommu_flush_device(dev_data->dev);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
}
-static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
+static void iommu_flush_all_domain_devices(void)
{
- int i;
+ struct protection_domain *domain;
+ unsigned long flags;
- for (i = 1; i < MAX_DOMAIN_ID; ++i) {
- if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
- continue;
- flush_domain_on_iommu(iommu, i);
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+
+ list_for_each_entry(domain, &amd_iommu_pd_list, list) {
+ iommu_flush_domain_devices(domain);
+ iommu_flush_complete(domain);
}
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ iommu_flush_all_domain_devices();
}
/*
- * This function is used to flush the IO/TLB for a given protection domain
- * on every IOMMU in the system
+ * This function uses heavy locking and may disable irqs for some time. But
+ * this is no issue because it is only called during resume.
*/
-static void iommu_flush_domain(u16 domid)
+void amd_iommu_flush_all_domains(void)
{
- struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ unsigned long flags;
- INC_STATS_COUNTER(domain_flush_all);
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- for_each_iommu(iommu)
- flush_domain_on_iommu(iommu, domid);
+ list_for_each_entry(domain, &amd_iommu_pd_list, list) {
+ spin_lock(&domain->lock);
+ iommu_flush_tlb_pde(domain);
+ iommu_flush_complete(domain);
+ spin_unlock(&domain->lock);
+ }
+
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}
-void amd_iommu_flush_all_domains(void)
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
{
- struct amd_iommu *iommu;
+ pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
- for_each_iommu(iommu)
- flush_all_domains_on_iommu(iommu);
+ if (iommu->reset_in_progress)
+ panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+ amd_iommu_reset_cmd_buffer(iommu);
+ amd_iommu_flush_all_devices();
+ amd_iommu_flush_all_domains();
+
+ iommu->reset_in_progress = false;
}
-static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
+ */
+static bool increase_address_space(struct protection_domain *domain,
+ gfp_t gfp)
{
- int i;
+ u64 *pte;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (iommu != amd_iommu_rlookup_table[i])
- continue;
+ if (domain->mode == PAGE_MODE_6_LEVEL)
+ /* address space already 64 bit large */
+ return false;
- iommu_queue_inv_dev_entry(iommu, i);
- iommu_completion_wait(iommu);
- }
+ pte = (void *)get_zeroed_page(gfp);
+ if (!pte)
+ return false;
+
+ *pte = PM_LEVEL_PDE(domain->mode,
+ virt_to_phys(domain->pt_root));
+ domain->pt_root = pte;
+ domain->mode += 1;
+ domain->updated = true;
+
+ return true;
}
-static void flush_devices_by_domain(struct protection_domain *domain)
+static u64 *alloc_pte(struct protection_domain *domain,
+ unsigned long address,
+ int end_lvl,
+ u64 **pte_page,
+ gfp_t gfp)
{
- struct amd_iommu *iommu;
- int i;
+ u64 *pte, *page;
+ int level;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
- (amd_iommu_pd_table[i] != domain))
- continue;
+ while (address > PM_LEVEL_SIZE(domain->mode))
+ increase_address_space(domain, gfp);
- iommu = amd_iommu_rlookup_table[i];
- if (!iommu)
- continue;
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+
+ while (level > end_lvl) {
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+ }
- iommu_queue_inv_dev_entry(iommu, i);
- iommu_completion_wait(iommu);
+ level -= 1;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+
+ if (pte_page && level == end_lvl)
+ *pte_page = pte;
+
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
}
+
+ return pte;
}
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64 *fetch_pte(struct protection_domain *domain,
+ unsigned long address, int map_size)
{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+ int level;
+ u64 *pte;
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- iommu->reset_in_progress = true;
+ while (level > map_size) {
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
- amd_iommu_reset_cmd_buffer(iommu);
- flush_all_devices_for_iommu(iommu);
- flush_all_domains_on_iommu(iommu);
+ level -= 1;
- iommu->reset_in_progress = false;
-}
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[PM_LEVEL_INDEX(level, address)];
-void amd_iommu_flush_all_devices(void)
-{
- flush_devices_by_domain(NULL);
-}
+ if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+ pte = NULL;
+ break;
+ }
+ }
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
+ return pte;
+}
/*
* Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +865,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
}
/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
* This function actually applies the mapping to the page table of the
* dma_ops domain.
*/
@@ -704,6 +893,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
}
/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+ struct unity_map_entry *entry;
+ int ret;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ if (!iommu_for_unity_map(iommu, entry))
+ continue;
+ ret = dma_ops_unity_map(iommu->default_dom, entry);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* Inits the unity mappings required for a specific device
*/
static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +951,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
*/
/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges.
*/
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size)
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages)
{
- int level;
- u64 *pte;
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > map_size) {
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- level -= 1;
+ unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
+ if (start_page + pages > last_page)
+ pages = last_page - start_page;
- if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
- pte = NULL;
- break;
- }
+ for (i = start_page; i < start_page + pages; ++i) {
+ int index = i / APERTURE_RANGE_PAGES;
+ int page = i % APERTURE_RANGE_PAGES;
+ __set_bit(page, dom->aperture[index]->bitmap);
}
-
- return pte;
}
/*
@@ -775,12 +975,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
* aperture in case of dma_ops domain allocation or address allocation
* failure.
*/
-static int alloc_new_range(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
+static int alloc_new_range(struct dma_ops_domain *dma_dom,
bool populate, gfp_t gfp)
{
int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i;
+ struct amd_iommu *iommu;
+ unsigned long i;
#ifdef CONFIG_IOMMU_STRESS
populate = false;
@@ -819,14 +1019,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
dma_dom->aperture_size += APERTURE_RANGE_SIZE;
/* Intialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ for_each_iommu(iommu) {
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start >= dma_dom->aperture[index]->offset
+ && iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage;
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length,
+ PAGE_SIZE);
+ startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
}
/*
@@ -928,7 +1131,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
}
if (unlikely(address == -1))
- address = bad_dma_address;
+ address = DMA_ERROR_CODE;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -959,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
- iommu_area_free(range->bitmap, address, pages);
+ bitmap_clear(range->bitmap, address, pages);
}
@@ -973,6 +1176,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
*
****************************************************************************/
+/*
+ * This function adds a protection domain to the global protection domain list
+ */
+static void add_domain_to_list(struct protection_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+ list_add(&domain->list, &amd_iommu_pd_list);
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
+/*
+ * This function removes a protection domain to the global
+ * protection domain list
+ */
+static void del_domain_from_list(struct protection_domain *domain)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+ list_del(&domain->list);
+ spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
+}
+
static u16 domain_id_alloc(void)
{
unsigned long flags;
@@ -1000,26 +1228,6 @@ static void domain_id_free(int id)
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
static void free_pagetable(struct protection_domain *domain)
{
int i, j;
@@ -1061,6 +1269,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
if (!dom)
return;
+ del_domain_from_list(&dom->domain);
+
free_pagetable(&dom->domain);
for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1288,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
+static struct dma_ops_domain *dma_ops_domain_alloc(void)
{
struct dma_ops_domain *dma_dom;
@@ -1091,6 +1301,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->domain.id = domain_id_alloc();
if (dma_dom->domain.id == 0)
goto free_dma_dom;
+ INIT_LIST_HEAD(&dma_dom->domain.dev_list);
dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1312,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
dma_dom->need_flush = false;
dma_dom->target_dev = 0xffff;
- if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+ add_domain_to_list(&dma_dom->domain);
+
+ if (alloc_new_range(dma_dom, true, GFP_KERNEL))
goto free_dma_dom;
/*
@@ -1129,22 +1342,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
return domain->flags & PD_DMA_OPS_MASK;
}
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(u16 devid)
-{
- struct protection_domain *dom;
- unsigned long flags;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = amd_iommu_pd_table[devid];
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
static void set_dte_entry(u16 devid, struct protection_domain *domain)
{
u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1353,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
amd_iommu_dev_table[devid].data[2] = domain->id;
amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+}
+
+static void clear_dte_entry(u16 devid)
+{
+ /* remove entry from the device table seen by the hardware */
+ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[1] = 0;
+ amd_iommu_dev_table[devid].data[2] = 0;
+
+ amd_iommu_apply_erratum_63(devid);
+}
+
+static void do_attach(struct device *dev, struct protection_domain *domain)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ u16 devid;
+
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ dev_data = get_dev_data(dev);
+
+ /* Update data structures */
+ dev_data->domain = domain;
+ list_add(&dev_data->list, &domain->dev_list);
+ set_dte_entry(devid, domain);
+
+ /* Do reference counting */
+ domain->dev_iommu[iommu->index] += 1;
+ domain->dev_cnt += 1;
- amd_iommu_pd_table[devid] = domain;
+ /* Flush the DTE entry */
+ iommu_flush_device(dev);
+}
+
+static void do_detach(struct device *dev)
+{
+ struct iommu_dev_data *dev_data;
+ struct amd_iommu *iommu;
+ u16 devid;
+
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
+ dev_data = get_dev_data(dev);
+
+ /* decrease reference counters */
+ dev_data->domain->dev_iommu[iommu->index] -= 1;
+ dev_data->domain->dev_cnt -= 1;
+
+ /* Update data structures */
+ dev_data->domain = NULL;
+ list_del(&dev_data->list);
+ clear_dte_entry(devid);
+
+ /* Flush the DTE entry */
+ iommu_flush_device(dev);
}
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void __attach_device(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static int __attach_device(struct device *dev,
+ struct protection_domain *domain)
{
+ struct iommu_dev_data *dev_data, *alias_data;
+
+ dev_data = get_dev_data(dev);
+ alias_data = get_dev_data(dev_data->alias);
+
+ if (!alias_data)
+ return -EINVAL;
+
/* lock domain */
spin_lock(&domain->lock);
- /* update DTE entry */
- set_dte_entry(devid, domain);
+ /* Some sanity checks */
+ if (alias_data->domain != NULL &&
+ alias_data->domain != domain)
+ return -EBUSY;
- domain->dev_cnt += 1;
+ if (dev_data->domain != NULL &&
+ dev_data->domain != domain)
+ return -EBUSY;
+
+ /* Do real assignment */
+ if (dev_data->alias != dev) {
+ alias_data = get_dev_data(dev_data->alias);
+ if (alias_data->domain == NULL)
+ do_attach(dev_data->alias, domain);
+
+ atomic_inc(&alias_data->bind);
+ }
+
+ if (dev_data->domain == NULL)
+ do_attach(dev, domain);
+
+ atomic_inc(&dev_data->bind);
/* ready */
spin_unlock(&domain->lock);
+
+ return 0;
}
/*
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void attach_device(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static int attach_device(struct device *dev,
+ struct protection_domain *domain)
{
unsigned long flags;
+ int ret;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __attach_device(iommu, domain, devid);
+ ret = __attach_device(dev, domain);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
/*
@@ -1199,96 +1477,130 @@ static void attach_device(struct amd_iommu *iommu,
* left the caches in the IOMMU dirty. So we have to flush
* here to evict all dirty stuff.
*/
- iommu_queue_inv_dev_entry(iommu, devid);
- iommu_flush_tlb_pde(iommu, domain->id);
+ iommu_flush_tlb_pde(domain);
+
+ return ret;
}
/*
* Removes a device from a protection domain (unlocked)
*/
-static void __detach_device(struct protection_domain *domain, u16 devid)
+static void __detach_device(struct device *dev)
{
+ struct iommu_dev_data *dev_data = get_dev_data(dev);
+ struct iommu_dev_data *alias_data;
+ struct protection_domain *domain;
+ unsigned long flags;
- /* lock domain */
- spin_lock(&domain->lock);
+ BUG_ON(!dev_data->domain);
- /* remove domain from the lookup table */
- amd_iommu_pd_table[devid] = NULL;
+ domain = dev_data->domain;
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
+ spin_lock_irqsave(&domain->lock, flags);
- /* decrease reference counter */
- domain->dev_cnt -= 1;
+ if (dev_data->alias != dev) {
+ alias_data = get_dev_data(dev_data->alias);
+ if (atomic_dec_and_test(&alias_data->bind))
+ do_detach(dev_data->alias);
+ }
- /* ready */
- spin_unlock(&domain->lock);
+ if (atomic_dec_and_test(&dev_data->bind))
+ do_detach(dev);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
/*
* If we run in passthrough mode the device must be assigned to the
- * passthrough domain if it is detached from any other domain
+ * passthrough domain if it is detached from any other domain.
+ * Make sure we can deassign from the pt_domain itself.
*/
- if (iommu_pass_through) {
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
- __attach_device(iommu, pt_domain, devid);
- }
+ if (iommu_pass_through &&
+ (dev_data->domain == NULL && domain != pt_domain))
+ __attach_device(dev, pt_domain);
}
/*
* Removes a device from a protection domain (with devtable_lock held)
*/
-static void detach_device(struct protection_domain *domain, u16 devid)
+static void detach_device(struct device *dev)
{
unsigned long flags;
/* lock device table */
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(domain, devid);
+ __detach_device(dev);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
+static struct protection_domain *domain_for_device(struct device *dev)
+{
+ struct protection_domain *dom;
+ struct iommu_dev_data *dev_data, *alias_data;
+ unsigned long flags;
+ u16 devid, alias;
+
+ devid = get_device_id(dev);
+ alias = amd_iommu_alias_table[devid];
+ dev_data = get_dev_data(dev);
+ alias_data = get_dev_data(dev_data->alias);
+ if (!alias_data)
+ return NULL;
+
+ read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ dom = dev_data->domain;
+ if (dom == NULL &&
+ alias_data->domain != NULL) {
+ __attach_device(dev, alias_data->domain);
+ dom = alias_data->domain;
+ }
+
+ read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return dom;
+}
+
static int device_change_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct device *dev = data;
- struct pci_dev *pdev = to_pci_dev(dev);
- u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+ u16 devid;
struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
struct amd_iommu *iommu;
unsigned long flags;
- if (devid > amd_iommu_last_bdf)
- goto out;
-
- devid = amd_iommu_alias_table[devid];
-
- iommu = amd_iommu_rlookup_table[devid];
- if (iommu == NULL)
- goto out;
-
- domain = domain_for_device(devid);
+ if (!check_device(dev))
+ return 0;
- if (domain && !dma_ops_domain(domain))
- WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
- "to a non-dma-ops domain\n", dev_name(dev));
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
switch (action) {
case BUS_NOTIFY_UNBOUND_DRIVER:
+
+ domain = domain_for_device(dev);
+
if (!domain)
goto out;
if (iommu_pass_through)
break;
- detach_device(domain, devid);
+ detach_device(dev);
break;
case BUS_NOTIFY_ADD_DEVICE:
+
+ iommu_init_device(dev);
+
+ domain = domain_for_device(dev);
+
/* allocate a protection domain if a device is added */
dma_domain = find_protection_domain(devid);
if (dma_domain)
goto out;
- dma_domain = dma_ops_domain_alloc(iommu);
+ dma_domain = dma_ops_domain_alloc();
if (!dma_domain)
goto out;
dma_domain->target_dev = devid;
@@ -1298,11 +1610,15 @@ static int device_change_notifier(struct notifier_block *nb,
spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
break;
+ case BUS_NOTIFY_DEL_DEVICE:
+
+ iommu_uninit_device(dev);
+
default:
goto out;
}
- iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_flush_device(dev);
iommu_completion_wait(iommu);
out:
@@ -1313,6 +1629,11 @@ static struct notifier_block device_nb = {
.notifier_call = device_change_notifier,
};
+void amd_iommu_init_notifier(void)
+{
+ bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
/*****************************************************************************
*
* The next functions belong to the dma_ops mapping/unmapping code.
@@ -1320,106 +1641,46 @@ static struct notifier_block device_nb = {
*****************************************************************************/
/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- if (!dev || !dev->dma_mask)
- return false;
-
- return true;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
* In the dma_ops path we only have the struct device. This function
* finds the corresponding IOMMU, the protection domain and the
* requestor id for a given device.
* If the device is not yet associated with a domain this is also done
* in this function.
*/
-static int get_device_resources(struct device *dev,
- struct amd_iommu **iommu,
- struct protection_domain **domain,
- u16 *bdf)
+static struct protection_domain *get_domain(struct device *dev)
{
+ struct protection_domain *domain;
struct dma_ops_domain *dma_dom;
- struct pci_dev *pcidev;
- u16 _bdf;
-
- *iommu = NULL;
- *domain = NULL;
- *bdf = 0xffff;
+ u16 devid = get_device_id(dev);
- if (dev->bus != &pci_bus_type)
- return 0;
-
- pcidev = to_pci_dev(dev);
- _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+ if (!check_device(dev))
+ return ERR_PTR(-EINVAL);
- /* device not translated by any IOMMU in the system? */
- if (_bdf > amd_iommu_last_bdf)
- return 0;
+ domain = domain_for_device(dev);
+ if (domain != NULL && !dma_ops_domain(domain))
+ return ERR_PTR(-EBUSY);
- *bdf = amd_iommu_alias_table[_bdf];
+ if (domain != NULL)
+ return domain;
- *iommu = amd_iommu_rlookup_table[*bdf];
- if (*iommu == NULL)
- return 0;
- *domain = domain_for_device(*bdf);
- if (*domain == NULL) {
- dma_dom = find_protection_domain(*bdf);
- if (!dma_dom)
- dma_dom = (*iommu)->default_dom;
- *domain = &dma_dom->domain;
- attach_device(*iommu, *domain, *bdf);
- DUMP_printk("Using protection domain %d for device %s\n",
- (*domain)->id, dev_name(dev));
- }
-
- if (domain_for_device(_bdf) == NULL)
- attach_device(*iommu, *domain, _bdf);
+ /* Device not bount yet - bind it */
+ dma_dom = find_protection_domain(devid);
+ if (!dma_dom)
+ dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
+ attach_device(dev, &dma_dom->domain);
+ DUMP_printk("Using protection domain %d for device %s\n",
+ dma_dom->domain.id, dev_name(dev));
- return 1;
+ return &dma_dom->domain;
}
static void update_device_table(struct protection_domain *domain)
{
- unsigned long flags;
- int i;
+ struct iommu_dev_data *dev_data;
- for (i = 0; i <= amd_iommu_last_bdf; ++i) {
- if (amd_iommu_pd_table[i] != domain)
- continue;
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- set_dte_entry(i, domain);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ u16 devid = get_device_id(dev_data->dev);
+ set_dte_entry(devid, domain);
}
}
@@ -1429,76 +1690,13 @@ static void update_domain(struct protection_domain *domain)
return;
update_device_table(domain);
- flush_devices_by_domain(domain);
- iommu_flush_domain(domain->id);
+ iommu_flush_domain_devices(domain);
+ iommu_flush_tlb_pde(domain);
domain->updated = false;
}
/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- gfp_t gfp)
-{
- u64 *pte;
-
- if (domain->mode == PAGE_MODE_6_LEVEL)
- /* address space already 64 bit large */
- return false;
-
- pte = (void *)get_zeroed_page(gfp);
- if (!pte)
- return false;
-
- *pte = PM_LEVEL_PDE(domain->mode,
- virt_to_phys(domain->pt_root));
- domain->pt_root = pte;
- domain->mode += 1;
- domain->updated = true;
-
- return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- int end_lvl,
- u64 **pte_page,
- gfp_t gfp)
-{
- u64 *pte, *page;
- int level;
-
- while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > end_lvl) {
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
- }
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
* This function fetches the PTE for a given address in the aperture
*/
static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1726,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
-static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
+static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
unsigned long address,
phys_addr_t paddr,
int direction)
@@ -1542,7 +1739,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
pte = dma_ops_get_pte(dom, address);
if (!pte)
- return bad_dma_address;
+ return DMA_ERROR_CODE;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1563,8 +1760,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
/*
* The generic unmapping function for on page in the DMA address space.
*/
-static void dma_ops_domain_unmap(struct amd_iommu *iommu,
- struct dma_ops_domain *dom,
+static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
unsigned long address)
{
struct aperture_range *aperture;
@@ -1595,7 +1791,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
- struct amd_iommu *iommu,
struct dma_ops_domain *dma_dom,
phys_addr_t paddr,
size_t size,
@@ -1623,7 +1818,7 @@ static dma_addr_t __map_single(struct device *dev,
retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address)) {
+ if (unlikely(address == DMA_ERROR_CODE)) {
/*
* setting next_address here will let the address
* allocator only scan the new allocated range in the
@@ -1631,11 +1826,11 @@ retry:
*/
dma_dom->next_address = dma_dom->aperture_size;
- if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+ if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
goto out;
/*
- * aperture was sucessfully enlarged by 128 MB, try
+ * aperture was successfully enlarged by 128 MB, try
* allocation again
*/
goto retry;
@@ -1643,8 +1838,8 @@ retry:
start = address;
for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
- if (ret == bad_dma_address)
+ ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
+ if (ret == DMA_ERROR_CODE)
goto out_unmap;
paddr += PAGE_SIZE;
@@ -1655,10 +1850,10 @@ retry:
ADD_STATS_COUNTER(alloced_io_mem, size);
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(iommu, dma_dom->domain.id);
+ iommu_flush_tlb(&dma_dom->domain);
dma_dom->need_flush = false;
- } else if (unlikely(iommu_has_npcache(iommu)))
- iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+ } else if (unlikely(amd_iommu_np_cache))
+ iommu_flush_pages(&dma_dom->domain, address, size);
out:
return address;
@@ -1667,20 +1862,19 @@ out_unmap:
for (--i; i >= 0; --i) {
start -= PAGE_SIZE;
- dma_ops_domain_unmap(iommu, dma_dom, start);
+ dma_ops_domain_unmap(dma_dom, start);
}
dma_ops_free_addresses(dma_dom, address, pages);
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
/*
* Does the reverse of the __map_single function. Must be called with
* the domain lock held too
*/
-static void __unmap_single(struct amd_iommu *iommu,
- struct dma_ops_domain *dma_dom,
+static void __unmap_single(struct dma_ops_domain *dma_dom,
dma_addr_t dma_addr,
size_t size,
int dir)
@@ -1688,7 +1882,7 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == bad_dma_address) ||
+ if ((dma_addr == DMA_ERROR_CODE) ||
(dma_addr + size > dma_dom->aperture_size))
return;
@@ -1697,7 +1891,7 @@ static void __unmap_single(struct amd_iommu *iommu,
start = dma_addr;
for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(iommu, dma_dom, start);
+ dma_ops_domain_unmap(dma_dom, start);
start += PAGE_SIZE;
}
@@ -1706,7 +1900,7 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
+ iommu_flush_pages(&dma_dom->domain, dma_addr, size);
dma_dom->need_flush = false;
}
}
@@ -1720,36 +1914,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
dma_addr_t addr;
u64 dma_mask;
phys_addr_t paddr = page_to_phys(page) + offset;
INC_STATS_COUNTER(cnt_map_single);
- if (!check_device(dev))
- return bad_dma_address;
-
- dma_mask = *dev->dma_mask;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (iommu == NULL || domain == NULL)
- /* device not handled by any AMD IOMMU */
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL)
return (dma_addr_t)paddr;
+ else if (IS_ERR(domain))
+ return DMA_ERROR_CODE;
- if (!dma_ops_domain(domain))
- return bad_dma_address;
+ dma_mask = *dev->dma_mask;
spin_lock_irqsave(&domain->lock, flags);
- addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+
+ addr = __map_single(dev, domain->priv, paddr, size, dir, false,
dma_mask);
- if (addr == bad_dma_address)
+ if (addr == DMA_ERROR_CODE)
goto out;
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1951,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
enum dma_data_direction dir, struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
INC_STATS_COUNTER(cnt_unmap_single);
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- /* device not handled by any AMD IOMMU */
- return;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
return;
spin_lock_irqsave(&domain->lock, flags);
- __unmap_single(iommu, domain->priv, dma_addr, size, dir);
+ __unmap_single(domain->priv, dma_addr, size, dir);
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1814,9 +1995,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
int i;
struct scatterlist *s;
phys_addr_t paddr;
@@ -1825,25 +2004,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
INC_STATS_COUNTER(cnt_map_sg);
- if (!check_device(dev))
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL)
+ return map_sg_no_iommu(dev, sglist, nelems, dir);
+ else if (IS_ERR(domain))
return 0;
dma_mask = *dev->dma_mask;
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
-
- if (!dma_ops_domain(domain))
- return 0;
-
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
paddr = sg_phys(s);
- s->dma_address = __map_single(dev, iommu, domain->priv,
+ s->dma_address = __map_single(dev, domain->priv,
paddr, s->length, dir, false,
dma_mask);
@@ -1854,7 +2028,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +2037,7 @@ out:
unmap:
for_each_sg(sglist, s, mapped_elems, i) {
if (s->dma_address)
- __unmap_single(iommu, domain->priv, s->dma_address,
+ __unmap_single(domain->priv, s->dma_address,
s->dma_length, dir);
s->dma_address = s->dma_length = 0;
}
@@ -1882,30 +2056,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
struct scatterlist *s;
- u16 devid;
int i;
INC_STATS_COUNTER(cnt_unmap_sg);
- if (!check_device(dev) ||
- !get_device_resources(dev, &iommu, &domain, &devid))
- return;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
return;
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
- __unmap_single(iommu, domain->priv, s->dma_address,
+ __unmap_single(domain->priv, s->dma_address,
s->dma_length, dir);
s->dma_address = s->dma_length = 0;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1918,49 +2087,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
{
unsigned long flags;
void *virt_addr;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
phys_addr_t paddr;
u64 dma_mask = dev->coherent_dma_mask;
INC_STATS_COUNTER(cnt_alloc_coherent);
- if (!check_device(dev))
+ domain = get_domain(dev);
+ if (PTR_ERR(domain) == -EINVAL) {
+ virt_addr = (void *)__get_free_pages(flag, get_order(size));
+ *dma_addr = __pa(virt_addr);
+ return virt_addr;
+ } else if (IS_ERR(domain))
return NULL;
- if (!get_device_resources(dev, &iommu, &domain, &devid))
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ dma_mask = dev->coherent_dma_mask;
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ flag |= __GFP_ZERO;
- flag |= __GFP_ZERO;
virt_addr = (void *)__get_free_pages(flag, get_order(size));
if (!virt_addr)
return NULL;
paddr = virt_to_phys(virt_addr);
- if (!iommu || !domain) {
- *dma_addr = (dma_addr_t)paddr;
- return virt_addr;
- }
-
- if (!dma_ops_domain(domain))
- goto out_free;
-
if (!dma_mask)
dma_mask = *dev->dma_mask;
spin_lock_irqsave(&domain->lock, flags);
- *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ *dma_addr = __map_single(dev, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
+ if (*dma_addr == DMA_ERROR_CODE) {
spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
}
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1980,28 +2144,19 @@ static void free_coherent(struct device *dev, size_t size,
void *virt_addr, dma_addr_t dma_addr)
{
unsigned long flags;
- struct amd_iommu *iommu;
struct protection_domain *domain;
- u16 devid;
INC_STATS_COUNTER(cnt_free_coherent);
- if (!check_device(dev))
- return;
-
- get_device_resources(dev, &iommu, &domain, &devid);
-
- if (!iommu || !domain)
- goto free_mem;
-
- if (!dma_ops_domain(domain))
+ domain = get_domain(dev);
+ if (IS_ERR(domain))
goto free_mem;
spin_lock_irqsave(&domain->lock, flags);
- __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+ __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- iommu_completion_wait(iommu);
+ iommu_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2015,22 +2170,7 @@ free_mem:
*/
static int amd_iommu_dma_supported(struct device *dev, u64 mask)
{
- u16 bdf;
- struct pci_dev *pcidev;
-
- /* No device or no PCI device */
- if (!dev || dev->bus != &pci_bus_type)
- return 0;
-
- pcidev = to_pci_dev(dev);
-
- bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
-
- /* Out of our scope? */
- if (bdf > amd_iommu_last_bdf)
- return 0;
-
- return 1;
+ return check_device(dev);
}
/*
@@ -2044,25 +2184,28 @@ static void prealloc_protection_domains(void)
{
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
- struct amd_iommu *iommu;
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = calc_devid(dev->bus->number, dev->devfn);
- if (devid > amd_iommu_last_bdf)
- continue;
- devid = amd_iommu_alias_table[devid];
- if (domain_for_device(devid))
+
+ /* Do we handle this device? */
+ if (!check_device(&dev->dev))
continue;
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
+
+ /* Is there already any domain for it? */
+ if (domain_for_device(&dev->dev))
continue;
- dma_dom = dma_ops_domain_alloc(iommu);
+
+ devid = get_device_id(&dev->dev);
+
+ dma_dom = dma_ops_domain_alloc();
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
dma_dom->target_dev = devid;
+ attach_device(&dev->dev, &dma_dom->domain);
+
list_add_tail(&dma_dom->list, &iommu_pd_list);
}
}
@@ -2080,6 +2223,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
/*
* The function which clues the AMD IOMMU driver into dma_ops.
*/
+
+void __init amd_iommu_init_api(void)
+{
+ register_iommu(&amd_iommu_ops);
+}
+
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
@@ -2091,7 +2240,7 @@ int __init amd_iommu_init_dma_ops(void)
* protection domain will be assigned to the default one.
*/
for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc(iommu);
+ iommu->default_dom = dma_ops_domain_alloc();
if (iommu->default_dom == NULL)
return -ENOMEM;
iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2250,12 @@ int __init amd_iommu_init_dma_ops(void)
}
/*
- * If device isolation is enabled, pre-allocate the protection
- * domains for each device.
+ * Pre-allocate the protection domains for each device.
*/
- if (amd_iommu_isolate)
- prealloc_protection_domains();
+ prealloc_protection_domains();
iommu_detected = 1;
- force_iommu = 1;
- bad_dma_address = 0;
+ swiotlb = 0;
#ifdef CONFIG_GART_IOMMU
gart_iommu_aperture_disabled = 1;
gart_iommu_aperture = 0;
@@ -2118,10 +2264,6 @@ int __init amd_iommu_init_dma_ops(void)
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
- register_iommu(&amd_iommu_ops);
-
- bus_register_notifier(&pci_bus_type, &device_nb);
-
amd_iommu_stats_init();
return 0;
@@ -2148,14 +2290,17 @@ free_domains:
static void cleanup_domain(struct protection_domain *domain)
{
+ struct iommu_dev_data *dev_data, *next;
unsigned long flags;
- u16 devid;
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
- if (amd_iommu_pd_table[devid] == domain)
- __detach_device(domain, devid);
+ list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
+ struct device *dev = dev_data->dev;
+
+ do_detach(dev);
+ atomic_set(&dev_data->bind, 0);
+ }
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
@@ -2165,6 +2310,8 @@ static void protection_domain_free(struct protection_domain *domain)
if (!domain)
return;
+ del_domain_from_list(domain);
+
if (domain->id)
domain_id_free(domain->id);
@@ -2183,6 +2330,9 @@ static struct protection_domain *protection_domain_alloc(void)
domain->id = domain_id_alloc();
if (!domain->id)
goto out_err;
+ INIT_LIST_HEAD(&domain->dev_list);
+
+ add_domain_to_list(domain);
return domain;
@@ -2239,26 +2389,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
static void amd_iommu_detach_device(struct iommu_domain *dom,
struct device *dev)
{
- struct protection_domain *domain = dom->priv;
+ struct iommu_dev_data *dev_data = dev->archdata.iommu;
struct amd_iommu *iommu;
- struct pci_dev *pdev;
u16 devid;
- if (dev->bus != &pci_bus_type)
+ if (!check_device(dev))
return;
- pdev = to_pci_dev(dev);
+ devid = get_device_id(dev);
- devid = calc_devid(pdev->bus->number, pdev->devfn);
-
- if (devid > 0)
- detach_device(domain, devid);
+ if (dev_data->domain != NULL)
+ detach_device(dev);
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
return;
- iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_flush_device(dev);
iommu_completion_wait(iommu);
}
@@ -2266,35 +2413,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
struct device *dev)
{
struct protection_domain *domain = dom->priv;
- struct protection_domain *old_domain;
+ struct iommu_dev_data *dev_data;
struct amd_iommu *iommu;
- struct pci_dev *pdev;
+ int ret;
u16 devid;
- if (dev->bus != &pci_bus_type)
+ if (!check_device(dev))
return -EINVAL;
- pdev = to_pci_dev(dev);
-
- devid = calc_devid(pdev->bus->number, pdev->devfn);
+ dev_data = dev->archdata.iommu;
- if (devid >= amd_iommu_last_bdf ||
- devid != amd_iommu_alias_table[devid])
- return -EINVAL;
+ devid = get_device_id(dev);
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
return -EINVAL;
- old_domain = domain_for_device(devid);
- if (old_domain)
- detach_device(old_domain, devid);
+ if (dev_data->domain)
+ detach_device(dev);
- attach_device(iommu, domain, devid);
+ ret = attach_device(dev, domain);
iommu_completion_wait(iommu);
- return 0;
+ return ret;
}
static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2482,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova += PAGE_SIZE;
}
- iommu_flush_domain(domain->id);
+ iommu_flush_tlb_pde(domain);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,10 +2533,11 @@ static struct iommu_ops amd_iommu_ops = {
int __init amd_iommu_init_passthrough(void)
{
+ struct amd_iommu *iommu;
struct pci_dev *dev = NULL;
- u16 devid, devid2;
+ u16 devid;
- /* allocate passthroug domain */
+ /* allocate passthrough domain */
pt_domain = protection_domain_alloc();
if (!pt_domain)
return -ENOMEM;
@@ -2402,20 +2545,17 @@ int __init amd_iommu_init_passthrough(void)
pt_domain->mode |= PAGE_MODE_NONE;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- struct amd_iommu *iommu;
- devid = calc_devid(dev->bus->number, dev->devfn);
- if (devid > amd_iommu_last_bdf)
+ if (!check_device(&dev->dev))
continue;
- devid2 = amd_iommu_alias_table[devid];
+ devid = get_device_id(&dev->dev);
- iommu = amd_iommu_rlookup_table[devid2];
+ iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
continue;
- __attach_device(iommu, pt_domain, devid);
- __attach_device(iommu, pt_domain, devid2);
+ attach_device(&dev->dev, pt_domain);
}
pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..9dc91b431470 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -25,10 +25,12 @@
#include <linux/interrupt.h>
#include <linux/msi.h>
#include <asm/pci-direct.h>
+#include <asm/amd_iommu_proto.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/x86_init.h>
/*
* definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-#ifdef CONFIG_IOMMU_STRESS
-bool amd_iommu_isolate = false;
-#else
-bool amd_iommu_isolate = true; /* if true, device isolation is
- enabled */
-#endif
-
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
system */
+/* Array to assign indices to IOMMUs*/
+struct amd_iommu *amd_iommus[MAX_IOMMUS];
+int amd_iommus_present;
+
+/* IOMMUs have a non-present cache? */
+bool amd_iommu_np_cache __read_mostly;
+
+/*
+ * Set to true if ACPI table parsing and hardware intialization went properly
+ */
+static bool amd_iommu_initialized;
+
+/*
+ * List of protection domains - used during resume
+ */
+LIST_HEAD(amd_iommu_pd_list);
+spinlock_t amd_iommu_pd_lock;
+
/*
* Pointer to the device table which is shared by all AMD IOMMUs
* it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
struct amd_iommu **amd_iommu_rlookup_table;
/*
- * The pd table (protection domain table) is used to find the protection domain
- * data structure a device belongs to. Indexed with the PCI device id too.
- */
-struct protection_domain **amd_iommu_pd_table;
-
-/*
* AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
* to know which ones are already in use.
*/
@@ -240,7 +247,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
}
-static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@@ -519,6 +526,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
}
+static int get_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 5) & 0x07;
+ int _bit = bit & 0x1f;
+
+ return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
+}
+
+
+void amd_iommu_apply_erratum_63(u16 devid)
+{
+ int sysmgt;
+
+ sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
+ (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
+
+ if (sysmgt == 0x01)
+ set_dev_entry_bit(devid, DEV_ENTRY_IW);
+}
+
/* Writes the specific IOMMU for a device into the rlookup table */
static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
{
@@ -547,6 +574,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
if (flags & ACPI_DEVFLAG_LINT1)
set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+ amd_iommu_apply_erratum_63(devid);
+
set_iommu_for_device(iommu, devid);
}
@@ -816,7 +845,18 @@ static void __init free_iommu_all(void)
static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
{
spin_lock_init(&iommu->lock);
+
+ /* Add IOMMU to internal data structures */
list_add_tail(&iommu->list, &amd_iommu_list);
+ iommu->index = amd_iommus_present++;
+
+ if (unlikely(iommu->index >= MAX_IOMMUS)) {
+ WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
+ return -ENOSYS;
+ }
+
+ /* Index is fine - add IOMMU to the array */
+ amd_iommus[iommu->index] = iommu;
/*
* Copy data from ACPI table entry to the iommu struct
@@ -846,6 +886,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
init_iommu_from_acpi(iommu, h);
init_iommu_devices(iommu);
+ if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
+ amd_iommu_np_cache = true;
+
return pci_enable_device(iommu->dev);
}
@@ -891,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
}
WARN_ON(p != end);
+ amd_iommu_initialized = true;
+
return 0;
}
@@ -903,7 +948,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
*
****************************************************************************/
-static int __init iommu_setup_msi(struct amd_iommu *iommu)
+static int iommu_setup_msi(struct amd_iommu *iommu)
{
int r;
@@ -1154,19 +1199,10 @@ static struct sys_device device_amd_iommu = {
* functions. Finally it prints some information about AMD IOMMUs and
* the driver state and enables the hardware.
*/
-int __init amd_iommu_init(void)
+static int __init amd_iommu_init(void)
{
int i, ret = 0;
-
- if (no_iommu) {
- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
- return 0;
- }
-
- if (!amd_iommu_detected)
- return -ENODEV;
-
/*
* First parse ACPI tables to find the largest Bus/Dev/Func
* we need to handle. Upon this information the shared data
@@ -1203,15 +1239,6 @@ int __init amd_iommu_init(void)
if (amd_iommu_rlookup_table == NULL)
goto free;
- /*
- * Protection Domain table - maps devices to protection domains
- * This table has the same size as the rlookup_table
- */
- amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_pd_table == NULL)
- goto free;
-
amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
GFP_KERNEL | __GFP_ZERO,
get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1260,8 @@ int __init amd_iommu_init(void)
*/
amd_iommu_pd_alloc_bitmap[0] = 1;
+ spin_lock_init(&amd_iommu_pd_lock);
+
/*
* now the data structures are allocated and basically initialized
* start the real acpi table scan
@@ -1241,6 +1270,9 @@ int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", init_iommu_all) != 0)
goto free;
+ if (!amd_iommu_initialized)
+ goto free;
+
if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
goto free;
@@ -1252,39 +1284,43 @@ int __init amd_iommu_init(void)
if (ret)
goto free;
+ ret = amd_iommu_init_devices();
+ if (ret)
+ goto free;
+
if (iommu_pass_through)
ret = amd_iommu_init_passthrough();
else
ret = amd_iommu_init_dma_ops();
+
if (ret)
goto free;
+ amd_iommu_init_api();
+
+ amd_iommu_init_notifier();
+
enable_iommus();
if (iommu_pass_through)
goto out;
- printk(KERN_INFO "AMD-Vi: device isolation ");
- if (amd_iommu_isolate)
- printk("enabled\n");
- else
- printk("disabled\n");
-
if (amd_iommu_unmap_flush)
printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
else
printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+ x86_platform.iommu_shutdown = disable_iommus;
out:
return ret;
free:
+
+ amd_iommu_uninit_devices();
+
free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
get_order(MAX_DOMAIN_ID/8));
- free_pages((unsigned long)amd_iommu_pd_table,
- get_order(rlookup_table_size));
-
free_pages((unsigned long)amd_iommu_rlookup_table,
get_order(rlookup_table_size));
@@ -1301,11 +1337,6 @@ free:
goto out;
}
-void amd_iommu_shutdown(void)
-{
- disable_iommus();
-}
-
/****************************************************************************
*
* Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1351,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
void __init amd_iommu_detect(void)
{
- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
+ x86_init.iommu.iommu_init = amd_iommu_init;
+
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
}
}
@@ -1350,10 +1381,6 @@ static int __init parse_amd_iommu_dump(char *str)
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
- if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = true;
- if (strncmp(str, "share", 5) == 0)
- amd_iommu_isolate = false;
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..4b7099526d2c
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,784 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers are
+ * used as clocksource. The overall allocation looks like:
+ * - timer 0 - NR_CPUs for per cpu timer
+ * - one timer for clocksource
+ * - one timer for watchdog driver.
+ * It is also worth notice that APB timer does not support true one-shot mode,
+ * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+
+#define APBT_MASK CLOCKSOURCE_MASK(32)
+#define APBT_SHIFT 22
+#define APBT_CLOCKEVENT_RATING 150
+#define APBT_CLOCKSOURCE_RATING 250
+#define APBT_MIN_DELTA_USEC 200
+
+#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
+#define APBT_CLOCKEVENT0_NUM (0)
+#define APBT_CLOCKEVENT1_NUM (1)
+#define APBT_CLOCKSOURCE_NUM (2)
+
+static unsigned long apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+static int phy_cs_timer_id;
+
+/*
+ * Common DW APB timer info
+ */
+static uint64_t apbt_freq;
+
+static void apbt_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt);
+static int apbt_next_event(unsigned long delta,
+ struct clock_event_device *evt);
+static cycle_t apbt_read_clocksource(struct clocksource *cs);
+static void apbt_restart_clocksource(struct clocksource *cs);
+
+struct apbt_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int tick;
+ unsigned int count;
+ unsigned int flags;
+ char name[10];
+};
+
+int disable_apbt_percpu __cpuinitdata;
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+static struct apbt_dev *apbt_devs;
+#endif
+
+static inline unsigned long apbt_readl_reg(unsigned long a)
+{
+ return readl(apbt_virt_address + a);
+}
+
+static inline void apbt_writel_reg(unsigned long d, unsigned long a)
+{
+ writel(d, apbt_virt_address + a);
+}
+
+static inline unsigned long apbt_readl(int n, unsigned long a)
+{
+ return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_writel(int n, unsigned long d, unsigned long a)
+{
+ writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_set_mapping(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+
+ if (apbt_virt_address) {
+ pr_debug("APBT base already mapped\n");
+ return;
+ }
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return;
+ }
+ apbt_address = (unsigned long)mtmr->phys_addr;
+ if (!apbt_address) {
+ printk(KERN_WARNING "No timer base from SFI, use default\n");
+ apbt_address = APBT_DEFAULT_BASE;
+ }
+ apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ if (apbt_virt_address) {
+ pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
+ (void *)apbt_address, (void *)apbt_virt_address);
+ } else {
+ pr_debug("Failed mapping APBT phy address at %p\n",\
+ (void *)apbt_address);
+ goto panic_noapbt;
+ }
+ apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+ sfi_free_mtmr(mtmr);
+
+ /* Now figure out the physical timer id for clocksource device */
+ mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+ if (mtmr == NULL)
+ goto panic_noapbt;
+
+ /* Now figure out the physical timer id */
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
+ / APBTMRS_REG_SIZE;
+ pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+ return;
+
+panic_noapbt:
+ panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+ iounmap(apbt_virt_address);
+ apbt_virt_address = NULL;
+}
+
+/*
+ * APBT timer interrupt enable / disable
+ */
+static inline int is_apbt_capable(void)
+{
+ return apbt_virt_address ? 1 : 0;
+}
+
+static struct clocksource clocksource_apbt = {
+ .name = "apbt",
+ .rating = APBT_CLOCKSOURCE_RATING,
+ .read = apbt_read_clocksource,
+ .mask = APBT_MASK,
+ .shift = APBT_SHIFT,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = apbt_restart_clocksource,
+};
+
+/* boot APB clock event device */
+static struct clock_event_device apbt_clockevent = {
+ .name = "apbt0",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .set_mode = apbt_set_mode,
+ .set_next_event = apbt_next_event,
+ .shift = APBT_SHIFT,
+ .irq = 0,
+ .rating = APBT_CLOCKEVENT_RATING,
+};
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ disable_apbt_percpu = 0;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ disable_apbt_percpu = 1;
+ else {
+ pr_warning("X86 MRST timer option %s not recognised"
+ " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * start count down from 0xffff_ffff. this is done by toggling the enable bit
+ * then load initial load count to ~0.
+ */
+static void apbt_start_counter(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
+ /* enable, mask interrupt */
+ ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+ ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ /* read it once to get cached counter value initialized */
+ apbt_read_clocksource(&clocksource_apbt);
+}
+
+static irqreturn_t apbt_interrupt_handler(int irq, void *data)
+{
+ struct apbt_dev *dev = (struct apbt_dev *)data;
+ struct clock_event_device *aevt = &dev->evt;
+
+ if (!aevt->event_handler) {
+ printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
+ dev->num);
+ return IRQ_NONE;
+ }
+ aevt->event_handler(aevt);
+ return IRQ_HANDLED;
+}
+
+static void apbt_restart_clocksource(struct clocksource *cs)
+{
+ apbt_start_counter(phy_cs_timer_id);
+}
+
+/* Setup IRQ routing via IOAPIC */
+#ifdef CONFIG_SMP
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ struct irq_chip *chip;
+ struct irq_desc *desc;
+
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+ desc = irq_to_desc(adev->irq);
+ chip = get_irq_chip(adev->irq);
+ disable_irq(adev->irq);
+ desc->status |= IRQ_MOVE_PCNTXT;
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+ /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
+ set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
+ enable_irq(adev->irq);
+ if (system_state == SYSTEM_BOOTING)
+ if (request_irq(adev->irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ adev->name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ adev->num);
+ }
+}
+#endif
+
+static void apbt_enable_int(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+ /* clear pending intr */
+ apbt_readl(n, APBTMR_N_EOI);
+ ctrl &= ~APBTMR_CONTROL_INT;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+static void apbt_disable_int(int n)
+{
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+ ctrl |= APBTMR_CONTROL_INT;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+
+static int __init apbt_clockevent_register(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
+
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return -ENODEV;
+ }
+
+ /*
+ * We need to calculate the scaled math multiplication factor for
+ * nanosecond to apbt tick conversion.
+ * mult = (nsec/cycle)*2^APBT_SHIFT
+ */
+ apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
+ , NSEC_PER_SEC, APBT_SHIFT);
+
+ /* Calculate the min / max delta */
+ apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+ &apbt_clockevent);
+ apbt_clockevent.min_delta_ns = clockevent_delta2ns(
+ APBT_MIN_DELTA_USEC*apbt_freq,
+ &apbt_clockevent);
+ /*
+ * Start apbt with the boot cpu mask and make it
+ * global if not used for per cpu timer.
+ */
+ apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
+ adev->num = smp_processor_id();
+ memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+
+ if (disable_apbt_percpu) {
+ apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
+ global_clock_event = &adev->evt;
+ printk(KERN_DEBUG "%s clockevent registered as global\n",
+ global_clock_event->name);
+ }
+
+ if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ apbt_clockevent.name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ apbt_clockevent.irq);
+ }
+
+ clockevents_register_device(&adev->evt);
+ /* Start APBT 0 interrupts */
+ apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+
+ sfi_free_mtmr(mtmr);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+/* Should be called with per cpu */
+void apbt_setup_secondary_clock(void)
+{
+ struct apbt_dev *adev;
+ struct clock_event_device *aevt;
+ int cpu;
+
+ /* Don't register boot CPU clockevent */
+ cpu = smp_processor_id();
+ if (cpu == boot_cpu_id)
+ return;
+ /*
+ * We need to calculate the scaled math multiplication factor for
+ * nanosecond to apbt tick conversion.
+ * mult = (nsec/cycle)*2^APBT_SHIFT
+ */
+ printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
+ adev = &per_cpu(cpu_apbt_dev, cpu);
+ aevt = &adev->evt;
+
+ memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
+ aevt->cpumask = cpumask_of(cpu);
+ aevt->name = adev->name;
+ aevt->mode = CLOCK_EVT_MODE_UNUSED;
+
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
+ cpu, aevt->name, *(u32 *)aevt->cpumask);
+
+ apbt_setup_irq(adev);
+
+ clockevents_register_device(aevt);
+
+ apbt_enable_int(cpu);
+
+ return;
+}
+
+/*
+ * this notify handler process CPU hotplug events. in case of S0i3, nonboot
+ * cpus are disabled/enabled frequently, for performance reasons, we keep the
+ * per cpu timer irq registered so that we do need to do free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable percpu clockevent device
+ * without the notifier chain. currently, cpu 0 may get interrupts from other
+ * cpu timers during the offline process due to the ordering of notification.
+ * the extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_DEAD:
+ apbt_disable_int(cpu);
+ if (system_state == SYSTEM_RUNNING)
+ pr_debug("skipping APBT CPU %lu offline\n", cpu);
+ else if (adev) {
+ pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+ free_irq(adev->irq, adev);
+ }
+ break;
+ default:
+ pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+ }
+ return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+ if (disable_apbt_percpu)
+ return 0;
+ /* This notifier should be called after workqueue is ready */
+ hotcpu_notifier(apbt_cpuhp_notify, -20);
+ return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static void apbt_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long ctrl;
+ uint64_t delta;
+ int timer_num;
+ struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+ timer_num = adev->num;
+ pr_debug("%s CPU %d timer %d mode=%d\n",
+ __func__, first_cpu(*evt->cpumask), timer_num, mode);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
+ delta >>= apbt_clockevent.shift;
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /*
+ * DW APB p. 46, have to disable timer before load counter,
+ * may cause sync problem.
+ */
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ udelay(1);
+ pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
+ apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+ /* APB timer does not have one-shot mode, use free running mode */
+ case CLOCK_EVT_MODE_ONESHOT:
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ /*
+ * set free running mode, this mode will let timer reload max
+ * timeout which will give time (3min on 25MHz clock) to rearm
+ * the next event, therefore emulate the one-shot mode.
+ */
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /* write again to set free running mode */
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+
+ /*
+ * DW APB p. 46, load counter with all 1s before starting free
+ * running mode.
+ */
+ apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
+ ctrl &= ~APBTMR_CONTROL_INT;
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ apbt_disable_int(timer_num);
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ break;
+
+ case CLOCK_EVT_MODE_RESUME:
+ apbt_enable_int(timer_num);
+ break;
+ }
+}
+
+static int apbt_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ unsigned long ctrl;
+ int timer_num;
+
+ struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+ timer_num = adev->num;
+ /* Disable timer */
+ ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ /* write new count */
+ apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+ ctrl |= APBTMR_CONTROL_ENABLE;
+ apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+ return 0;
+}
+
+/*
+ * APB timer clock is not in sync with pclk on Langwell, which translates to
+ * unreliable read value caused by sampling error. the error does not add up
+ * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
+ * would go backwards. the following code is trying to prevent time traveling
+ * backwards. little bit paranoid.
+ */
+static cycle_t apbt_read_clocksource(struct clocksource *cs)
+{
+ unsigned long t0, t1, t2;
+ static unsigned long last_read;
+
+bad_count:
+ t1 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ t2 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ if (unlikely(t1 < t2)) {
+ pr_debug("APBT: read current count error %lx:%lx:%lx\n",
+ t1, t2, t2 - t1);
+ goto bad_count;
+ }
+ /*
+ * check against cached last read, makes sure time does not go back.
+ * it could be a normal rollover but we will do tripple check anyway
+ */
+ if (unlikely(t2 > last_read)) {
+ /* check if we have a normal rollover */
+ unsigned long raw_intr_status =
+ apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
+ /*
+ * cs timer interrupt is masked but raw intr bit is set if
+ * rollover occurs. then we read EOI reg to clear it.
+ */
+ if (raw_intr_status & (1 << phy_cs_timer_id)) {
+ apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
+ goto out;
+ }
+ pr_debug("APB CS going back %lx:%lx:%lx ",
+ t2, last_read, t2 - last_read);
+bad_count_x3:
+ pr_debug(KERN_INFO "tripple check enforced\n");
+ t0 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ udelay(1);
+ t1 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ udelay(1);
+ t2 = apbt_readl(phy_cs_timer_id,
+ APBTMR_N_CURRENT_VALUE);
+ if ((t2 > t1) || (t1 > t0)) {
+ printk(KERN_ERR "Error: APB CS tripple check failed\n");
+ goto bad_count_x3;
+ }
+ }
+out:
+ last_read = t2;
+ return (cycle_t)~t2;
+}
+
+static int apbt_clocksource_register(void)
+{
+ u64 start, now;
+ cycle_t t1;
+
+ /* Start the counter, use timer 2 as source, timer 0/1 for event */
+ apbt_start_counter(phy_cs_timer_id);
+
+ /* Verify whether apbt counter works */
+ t1 = apbt_read_clocksource(&clocksource_apbt);
+ rdtscll(start);
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ rdtscll(now);
+ } while ((now - start) < 200000UL);
+
+ /* APBT is the only always on clocksource, it has to work! */
+ if (t1 == apbt_read_clocksource(&clocksource_apbt))
+ panic("APBT counter not counting. APBT disabled\n");
+
+ /*
+ * initialize and register APBT clocksource
+ * convert that to ns/clock cycle
+ * mult = (ns/c) * 2^APBT_SHIFT
+ */
+ clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
+ (unsigned long) apbt_freq, APBT_SHIFT);
+ clocksource_register(&clocksource_apbt);
+
+ return 0;
+}
+
+/*
+ * Early setup the APBT timer, only use timer 0 for booting then switch to
+ * per CPU timer if possible.
+ * returns 1 if per cpu apbt is setup
+ * returns 0 if no per cpu apbt is chosen
+ * panic if set up failed, this is the only platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ struct sfi_timer_table_entry *p_mtmr;
+ unsigned int percpu_timer;
+ struct apbt_dev *adev;
+#endif
+
+ if (apb_timer_block_enabled)
+ return;
+ apbt_set_mapping();
+ if (apbt_virt_address) {
+ pr_debug("Found APBT version 0x%lx\n",\
+ apbt_readl_reg(APBTMRS_COMP_VERSION));
+ } else
+ goto out_noapbt;
+ /*
+ * Read the frequency and check for a sane value, for ESL model
+ * we extend the possible clock range to allow time scaling.
+ */
+
+ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+ pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+ goto out_noapbt;
+ }
+ if (apbt_clocksource_register()) {
+ pr_debug("APBT has failed to register clocksource\n");
+ goto out_noapbt;
+ }
+ if (!apbt_clockevent_register())
+ apb_timer_block_enabled = 1;
+ else {
+ pr_debug("APBT has failed to register clockevent\n");
+ goto out_noapbt;
+ }
+#ifdef CONFIG_SMP
+ /* kernel cmdline disable apb timer, so we will use lapic timers */
+ if (disable_apbt_percpu) {
+ printk(KERN_INFO "apbt: disabled per cpu timer\n");
+ return;
+ }
+ pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+ if (num_possible_cpus() <= sfi_mtimer_num) {
+ percpu_timer = 1;
+ apbt_num_timers_used = num_possible_cpus();
+ } else {
+ percpu_timer = 0;
+ apbt_num_timers_used = 1;
+ adev = &per_cpu(cpu_apbt_dev, 0);
+ adev->flags &= ~APBT_DEV_USED;
+ }
+ pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+ /* here we set up per CPU timer data structure */
+ apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
+ GFP_KERNEL);
+ if (!apbt_devs) {
+ printk(KERN_ERR "Failed to allocate APB timer devices\n");
+ return;
+ }
+ for (i = 0; i < apbt_num_timers_used; i++) {
+ adev = &per_cpu(cpu_apbt_dev, i);
+ adev->num = i;
+ adev->cpu = i;
+ p_mtmr = sfi_get_mtmr(i);
+ if (p_mtmr) {
+ adev->tick = p_mtmr->freq_hz;
+ adev->irq = p_mtmr->irq;
+ } else
+ printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+ adev->count = 0;
+ sprintf(adev->name, "apbt%d", i);
+ }
+#endif
+
+ return;
+
+out_noapbt:
+ apbt_clear_mapping();
+ apb_timer_block_enabled = 0;
+ panic("failed to enable APB timer\n");
+}
+
+static inline void apbt_disable(int n)
+{
+ if (is_apbt_capable()) {
+ unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+ ctrl &= ~APBTMR_CONTROL_ENABLE;
+ apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+ }
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate()
+{
+ int i, scale;
+ u64 old, new;
+ cycle_t t1, t2;
+ unsigned long khz = 0;
+ u32 loop, shift;
+
+ apbt_set_mapping();
+ apbt_start_counter(phy_cs_timer_id);
+
+ /* check if the timer can count down, otherwise return */
+ old = apbt_read_clocksource(&clocksource_apbt);
+ i = 10000;
+ while (--i) {
+ if (old != apbt_read_clocksource(&clocksource_apbt))
+ break;
+ }
+ if (!i)
+ goto failed;
+
+ /* count 16 ms */
+ loop = (apbt_freq * 1000) << 4;
+
+ /* restart the timer to ensure it won't get to 0 in the calibration */
+ apbt_start_counter(phy_cs_timer_id);
+
+ old = apbt_read_clocksource(&clocksource_apbt);
+ old += loop;
+
+ t1 = __native_read_tsc();
+
+ do {
+ new = apbt_read_clocksource(&clocksource_apbt);
+ } while (new < old);
+
+ t2 = __native_read_tsc();
+
+ shift = 5;
+ if (unlikely(loop >> shift == 0)) {
+ printk(KERN_INFO
+ "APBT TSC calibration failed, not enough resolution\n");
+ return 0;
+ }
+ scale = (int)div_u64((t2 - t1), loop >> shift);
+ khz = (scale * apbt_freq * 1000) >> shift;
+ printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+ return khz;
+failed:
+ return 0;
+}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..3704997e8b25 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -279,7 +280,8 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- int i, fix, slot;
+ u32 agp_aper_base = 0, agp_aper_order = 0;
+ int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +291,8 @@ void __init early_gart_iommu_check(void)
return;
/* This is mostly duplicate of iommu_hole_init */
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
fix = 0;
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
int bus;
@@ -341,10 +345,10 @@ void __init early_gart_iommu_check(void)
}
}
- if (!fix)
+ if (valid_agp)
return;
- /* different nodes have different setting, disable them all at first*/
+ /* disable them all at first */
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
int bus;
int dev_base, dev_limit;
@@ -400,6 +404,7 @@ void __init gart_iommu_hole_init(void)
iommu_detected = 1;
gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,8 +461,6 @@ out:
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
- } else if (swiotlb && !valid_agp) {
- /* Do nothing */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..00187f1fcfb7 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
/*
* The highest APIC ID seen during enumeration.
- *
- * On AMD, this determines the messaging protocol we can use: if all APIC IDs
- * are in the 0 ... 7 range, then we can use logical addressing which
- * has some performance advantages (better broadcasting).
- *
- * If there's an APIC ID above 8, we use physical addressing.
*/
unsigned int max_physical_apicid;
@@ -241,28 +235,13 @@ static int modern_apic(void)
}
/*
- * bare function to substitute write operation
- * and it's _that_ fast :)
- */
-static void native_apic_write_dummy(u32 reg, u32 v)
-{
- WARN_ON_ONCE((cpu_has_apic || !disable_apic));
-}
-
-static u32 native_apic_read_dummy(u32 reg)
-{
- WARN_ON_ONCE((cpu_has_apic && !disable_apic));
- return 0;
-}
-
-/*
- * right after this call apic->write/read doesn't do anything
- * note that there is no restore operation it works one way
+ * right after this call apic become NOOP driven
+ * so apic->write/read doesn't do anything
*/
void apic_disable(void)
{
- apic->read = native_apic_read_dummy;
- apic->write = native_apic_write_dummy;
+ pr_info("APIC: switched to apic NOOP\n");
+ apic = &apic_noop;
}
void native_apic_wait_icr_idle(void)
@@ -459,7 +438,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0xffffffff);
+ apic_write(APIC_TMICT, 0);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
@@ -602,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld) \n",
+ "PM-Timer: %lu (%ld)\n",
(unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
@@ -662,7 +641,7 @@ static int __init calibrate_APIC_clock(void)
calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+ apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
calibration_result);
@@ -1356,7 +1335,7 @@ void enable_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
- pr_info("Enabling x2apic\n");
+ printk_once(KERN_INFO "Enabling x2apic\n");
wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
}
}
@@ -1392,14 +1371,11 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret = 0;
+ int dmar_table_init_ret;
-#ifdef CONFIG_INTR_REMAP
dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret)
- pr_debug("dmar_table_init() failed with %d:\n",
- dmar_table_init_ret);
-#endif
+ if (dmar_table_init_ret && !x2apic_supported())
+ return;
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
@@ -1414,7 +1390,7 @@ void __init enable_IR_x2apic(void)
}
local_irq_save(flags);
- mask_8259A();
+ legacy_pic->mask_all();
mask_IO_APIC_setup(ioapic_entries);
if (dmar_table_init_ret)
@@ -1446,7 +1422,7 @@ void __init enable_IR_x2apic(void)
nox2apic:
if (!ret) /* IR enabling failed */
restore_IO_APIC_setup(ioapic_entries);
- unmask_8259A();
+ legacy_pic->restore_mask();
local_irq_restore(flags);
out:
@@ -1665,9 +1641,7 @@ int __init APIC_init_uniprocessor(void)
#endif
enable_IR_x2apic();
-#ifdef CONFIG_X86_64
default_setup_apic_routing();
-#endif
verify_local_APIC();
connect_bsp_APIC();
@@ -1915,18 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
-#ifdef CONFIG_X86_32
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (num_processors > 8)
- def_to_bigsmp = 1;
- break;
- case X86_VENDOR_AMD:
- if (max_physical_apicid >= 8)
- def_to_bigsmp = 1;
- }
-#endif
-
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2056,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev)
}
mask_IO_APIC_setup(ioapic_entries);
- mask_8259A();
+ legacy_pic->mask_all();
}
if (x2apic_mode)
@@ -2100,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev)
if (intr_remapping_enabled) {
reenable_intr_remapping(x2apic_mode);
- unmask_8259A();
+ legacy_pic->restore_mask();
restore_IO_APIC_setup(ioapic_entries);
free_ioapic_entries(ioapic_entries);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
};
/*
- * Physflat mode is used when there are more than 8 CPUs on a AMD system.
+ * Physflat mode is used when there are more than 8 CPUs on a system.
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
}
+
+ if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+ printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
+ return 1;
+ }
#endif
return 0;
@@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
+ return per_cpu(x86_cpu_to_apicid, cpu);
}
struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real apic driver via
+ * probe routine.
+ *
+ * Though in case if apic is disabled (for some reason) we try
+ * to not uglify the caller's code and allow to call (some) apic routines
+ * like self-ipi, etc...
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+
+static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_wait_icr_idle(void) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ return -1;
+}
+
+static u32 noop_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static u64 noop_apic_icr_read(void)
+{
+ return 0;
+}
+
+static int noop_cpu_to_logical_apicid(int cpu)
+{
+ return 0;
+}
+
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return 0;
+}
+
+static unsigned int noop_get_apic_id(unsigned long x)
+{
+ return 0;
+}
+
+static int noop_probe(void)
+{
+ /*
+ * NOOP apic should not ever be
+ * enabled via probe routine
+ */
+ return 0;
+}
+
+static int noop_apic_id_registered(void)
+{
+ /*
+ * if we would be really "pedantic"
+ * we should pass read_apic_id() here
+ * but since NOOP suppose APIC ID = 0
+ * lets save a few cycles
+ */
+ return physid_isset(0, phys_cpu_present_map);
+}
+
+static const struct cpumask *noop_target_cpus(void)
+{
+ /* only BSP here */
+ return cpumask_of(0);
+}
+
+static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+static unsigned long noop_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ if (cpu != 0)
+ pr_warning("APIC: Vector allocated for non-BSP cpu\n");
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+int noop_apicid_to_node(int logical_apicid)
+{
+ /* we're always on node 0 */
+ return 0;
+}
+
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+static void noop_apic_write(u32 reg, u32 v)
+{
+ WARN_ON_ONCE(cpu_has_apic && !disable_apic);
+}
+
+struct apic apic_noop = {
+ .name = "noop",
+ .probe = noop_probe,
+ .acpi_madt_oem_check = NULL,
+
+ .apic_id_registered = noop_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = noop_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = noop_check_apicid_used,
+ .check_apicid_present = noop_check_apicid_present,
+
+ .vector_allocation_domain = noop_vector_allocation_domain,
+ .init_apic_ldr = noop_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .apicid_to_node = noop_apicid_to_node,
+
+ .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+
+ .phys_pkg_id = noop_phys_pkg_id,
+
+ .mps_oem_check = NULL,
+
+ .get_apic_id = noop_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ /* should be safe */
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+ .wait_icr_idle = noop_apic_wait_icr_idle,
+ .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
#endif
}
-static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
return BAD_APICID;
}
-static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
-{
- return physid_mask_of_physid(phys_apicid);
-}
-
/* Mapping from cpu number to logical apicid */
static inline int bigsmp_cpu_to_logical_apicid(int cpu)
{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
return cpu_physical_id(cpu);
}
-static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xFFL);
+ physids_promote(0xFFL, retmap);
}
static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- if (cpu < nr_cpu_ids)
- return bigsmp_cpu_to_logical_apicid(cpu);
-
- return BAD_APICID;
+ return bigsmp_cpu_to_logical_apicid(cpu);
}
static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
.apicid_to_node = bigsmp_apicid_to_node,
.cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
.setup_portio_remap = NULL,
.check_phys_apicid_present = bigsmp_check_phys_apicid_present,
.enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
*
* http://www.unisys.com
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
mip_addr = val;
mip = (struct mip_reg *)val;
mip_reg = __va(mip);
- pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
+ pr_debug("host_reg = 0x%lx\n",
(unsigned long)host_reg);
- pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
+ pr_debug("mip_reg = 0x%lx\n",
(unsigned long)mip_reg);
success++;
break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
if (!es7000_plat)
return;
- printk(KERN_INFO "ES7000: Enabling APIC mode.\n");
+ pr_info("Enabling APIC mode.\n");
memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
es7000_mip_reg.off_0x00 = MIP_SW_APIC;
es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void)
return cpumask_of(smp_processor_id());
}
-static unsigned long
-es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
+
static unsigned long es7000_check_apicid_present(int bit)
{
return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
{
int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
- printk(KERN_INFO
- "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+ pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
(apic_version[apic] == 0x14) ?
"Physical Cluster" : "Logical Cluster",
nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
static int cpu_id;
-static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid)
+static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
{
- physid_mask_t mask;
-
- mask = physid_mask_of_physid(cpu_id);
+ physid_set_mask_of_physid(cpu_id, retmap);
++cpu_id;
-
- return mask;
}
/* Mapping from cpu number to logical apicid */
@@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
#endif
}
-static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map)
+static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0xff);
+ physids_promote(0xFFL, retmap);
}
static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..e4e0ddcb1546 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_irq.h>
#include <asm/apic.h>
@@ -75,8 +73,8 @@
*/
int sis_apic_bug = -1;
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -96,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
-/* Number of legacy interrupts */
-static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
@@ -140,49 +136,12 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
return pin;
}
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * Most irqs are mapped 1:1 with pins.
- */
-struct irq_cfg {
- struct irq_pin_list *irq_2_pin;
- cpumask_var_t domain;
- cpumask_var_t old_domain;
- unsigned move_cleanup_count;
- u8 vector;
- u8 move_in_progress : 1;
-};
-
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[] = {
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
#else
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
+static struct irq_cfg irq_cfgx[NR_IRQS];
#endif
- [0] = { .vector = IRQ0_VECTOR, },
- [1] = { .vector = IRQ1_VECTOR, },
- [2] = { .vector = IRQ2_VECTOR, },
- [3] = { .vector = IRQ3_VECTOR, },
- [4] = { .vector = IRQ4_VECTOR, },
- [5] = { .vector = IRQ5_VECTOR, },
- [6] = { .vector = IRQ6_VECTOR, },
- [7] = { .vector = IRQ7_VECTOR, },
- [8] = { .vector = IRQ8_VECTOR, },
- [9] = { .vector = IRQ9_VECTOR, },
- [10] = { .vector = IRQ10_VECTOR, },
- [11] = { .vector = IRQ11_VECTOR, },
- [12] = { .vector = IRQ12_VECTOR, },
- [13] = { .vector = IRQ13_VECTOR, },
- [14] = { .vector = IRQ14_VECTOR, },
- [15] = { .vector = IRQ15_VECTOR, },
-};
-
-void __init io_apic_disable_legacy(void)
-{
- nr_legacy_irqs = 0;
- nr_irqs_gsi = 0;
-}
int __init arch_early_irq_init(void)
{
@@ -192,6 +151,11 @@ int __init arch_early_irq_init(void)
int node;
int i;
+ if (!legacy_pic->nr_legacy_irqs) {
+ nr_irqs_gsi = 0;
+ io_apic_irqs = ~0UL;
+ }
+
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
node= cpu_to_node(boot_cpu_id);
@@ -201,15 +165,21 @@ int __init arch_early_irq_init(void)
desc->chip_data = &cfg[i];
zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- if (i < nr_legacy_irqs)
- cpumask_setall(cfg[i].domain);
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+ */
+ if (i < legacy_pic->nr_legacy_irqs) {
+ cfg[i].vector = IRQ0_VECTOR + i;
+ cpumask_set_cpu(0, cfg[i].domain);
+ }
}
return 0;
}
#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg *irq_cfg(unsigned int irq)
+struct irq_cfg *irq_cfg(unsigned int irq)
{
struct irq_cfg *cfg = NULL;
struct irq_desc *desc;
@@ -361,7 +331,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
/* end for move_irq_desc */
#else
-static struct irq_cfg *irq_cfg(unsigned int irq)
+struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
@@ -422,7 +392,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
struct irq_pin_list *entry;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
unsigned int reg;
int pin;
@@ -431,11 +401,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
@@ -449,10 +419,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return eu.entry;
}
@@ -475,9 +445,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -490,10 +460,10 @@ static void ioapic_mask_entry(int apic, int pin)
unsigned long flags;
union entry_union eu = { .entry.mask = 1 };
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -555,23 +525,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
add_pin_to_irq_node(cfg, node, newapic, newpin);
}
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ unsigned int reg, pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
+}
+
static void io_apic_modify_irq(struct irq_cfg *cfg,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
- int pin;
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
- if (final)
- final(entry);
- }
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ __io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
+{
+ __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
+{
+ __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +583,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
-static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED, NULL);
-}
-
-static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
-}
-
static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
struct irq_cfg *cfg = desc->chip_data;
@@ -614,9 +590,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
BUG_ON(!cfg);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__mask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -624,9 +600,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void mask_IO_APIC_irq(unsigned int irq)
@@ -875,7 +851,7 @@ static int __init find_isa_irq_apic(int irq, int type)
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < nr_legacy_irqs) {
+ if (irq < legacy_pic->nr_legacy_irqs) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -1150,12 +1126,12 @@ void lock_vector_lock(void)
/* Used to the online set of cpus does not change
* during assign_irq_vector.
*/
- spin_lock(&vector_lock);
+ raw_spin_lock(&vector_lock);
}
void unlock_vector_lock(void)
{
- spin_unlock(&vector_lock);
+ raw_spin_unlock(&vector_lock);
}
static int
@@ -1172,12 +1148,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 8;
unsigned int old_vector;
int cpu, err;
cpumask_var_t tmp_mask;
- if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ if (cfg->move_in_progress)
return -EBUSY;
if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1208,7 +1185,7 @@ next:
if (vector >= first_system_vector) {
/* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
}
if (unlikely(current_vector == vector))
continue;
@@ -1237,15 +1214,14 @@ next:
return err;
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
err = __assign_irq_vector(irq, cfg, mask);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
@@ -1279,11 +1255,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
- /* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
struct irq_desc *desc;
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
cfg = desc->chip_data;
@@ -1302,6 +1283,7 @@ void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
+ raw_spin_unlock(&vector_lock);
}
static struct irq_chip ioapic_chip;
@@ -1451,6 +1433,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
cfg = desc->chip_data;
+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0 for legacy
+ * controllers like 8259. Now that IO-APIC can handle this irq, update
+ * the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+ apic->vector_allocation_domain(0, cfg->domain);
+
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
@@ -1472,8 +1462,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
}
ioapic_register_intr(irq, desc, trigger);
- if (irq < nr_legacy_irqs)
- disable_8259A_irq(irq);
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->chip->mask(irq);
ioapic_write_entry(apic_id, pin, entry);
}
@@ -1484,7 +1474,7 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id = 0, pin, idx, irq;
+ int apic_id, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
@@ -1492,14 +1482,7 @@ static void __init setup_IO_APIC_irqs(void)
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- apic_id = mp_find_ioapic(0);
- if (apic_id < 0)
- apic_id = 0;
- }
-#endif
-
+ for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
idx = find_irq_entry(apic_id, pin, mp_INT);
if (idx == -1) {
@@ -1521,6 +1504,9 @@ static void __init setup_IO_APIC_irqs(void)
irq = pin_2_irq(idx, apic_id, pin);
+ if ((apic_id > 0) && (irq > 16))
+ continue;
+
/*
* Skip the timer IRQ if there's a quirk handler
* installed and if it returns 1:
@@ -1550,6 +1536,56 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
+ * for the gsit that is not in first ioapic
+ * but could not use acpi_register_gsi()
+ * like some special sci in IBM x3330
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+ int apic_id = 0, pin, idx, irq;
+ int node = cpu_to_node(boot_cpu_id);
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ apic_id = mp_find_ioapic(gsi);
+ if (apic_id < 0)
+ return;
+
+ pin = mp_find_ioapic_pin(apic_id, gsi);
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1)
+ return;
+
+ irq = pin_2_irq(idx, apic_id, pin);
+#ifdef CONFIG_SPARSE_IRQ
+ desc = irq_to_desc(irq);
+ if (desc)
+ return;
+#endif
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ return;
+ }
+
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+
+ if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[apic_id].apicid, pin);
+ return;
+ }
+ set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
+}
+
+/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1599,9 +1635,6 @@ __apicdebuginit(void) print_IO_APIC(void)
struct irq_desc *desc;
unsigned int irq;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1615,14 +1648,14 @@ __apicdebuginit(void) print_IO_APIC(void)
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
if (reg_01.bits.version >= 0x20)
reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1661,7 +1694,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect: \n");
+ " Stat Dmod Deli Vect:\n");
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
@@ -1708,9 +1741,6 @@ __apicdebuginit(void) print_APIC_field(int base)
{
int i;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG);
for (i = 0; i < 8; i++)
@@ -1724,9 +1754,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
unsigned int i, v, ver, maxlvt;
u64 icr;
- if (apic_verbosity == APIC_QUIET)
- return;
-
printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
v = apic_read(APIC_ID);
@@ -1824,13 +1851,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk("\n");
}
-__apicdebuginit(void) print_all_local_APICs(void)
+__apicdebuginit(void) print_local_APICs(int maxcpu)
{
int cpu;
+ if (!maxcpu)
+ return;
+
preempt_disable();
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
preempt_enable();
}
@@ -1839,12 +1872,12 @@ __apicdebuginit(void) print_PIC(void)
unsigned int v;
unsigned long flags;
- if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
v = inb(0xa1) << 8 | inb(0x21);
printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1858,7 +1891,7 @@ __apicdebuginit(void) print_PIC(void)
outb(0x0a,0xa0);
outb(0x0a,0x20);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
@@ -1866,21 +1899,41 @@ __apicdebuginit(void) print_PIC(void)
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
-__apicdebuginit(int) print_all_ICs(void)
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
print_PIC();
/* don't print out if apic is not there */
if (!cpu_has_apic && !apic_from_smp_config())
return 0;
- print_all_local_APICs();
+ print_local_APICs(show_lapic);
print_IO_APIC();
return 0;
}
-fs_initcall(print_all_ICs);
+fs_initcall(print_ICs);
/* Where if anywhere is the i8259 connect in external int mode */
@@ -1897,13 +1950,13 @@ void __init enable_IO_APIC(void)
* The number of IO-APIC IRQ registers (== #pins):
*/
for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
- if (!nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1960,7 +2013,7 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
- if (!nr_legacy_irqs)
+ if (!legacy_pic->nr_legacy_irqs)
return;
/*
@@ -2031,7 +2084,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
@@ -2039,9 +2092,9 @@ void __init setup_ioapic_ids_from_mpc(void)
for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
/* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
old_id = mp_ioapics[apic_id].apicid;
@@ -2058,7 +2111,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(phys_id_present_map,
+ if (apic->check_apicid_used(&phys_id_present_map,
mp_ioapics[apic_id].apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2126,7 @@ void __init setup_ioapic_ids_from_mpc(void)
mp_ioapics[apic_id].apicid = i;
} else {
physid_mask_t tmp;
- tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid);
+ apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
mp_ioapics[apic_id].apicid);
@@ -2100,16 +2153,16 @@ void __init setup_ioapic_ids_from_mpc(void)
mp_ioapics[apic_id].apicid);
reg_00.bits.ID = mp_ioapics[apic_id].apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic_id, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
printk("could not set ID!\n");
else
@@ -2192,15 +2245,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
unsigned long flags;
struct irq_cfg *cfg;
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < nr_legacy_irqs) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ legacy_pic->chip->mask(irq);
+ if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
cfg = irq_cfg(irq);
__unmask_IO_APIC_irq(cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
@@ -2211,9 +2264,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
}
@@ -2228,20 +2281,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
+void send_cleanup_vector(struct irq_cfg *cfg)
{
cpumask_var_t cleanup_mask;
if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
} else {
cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
free_cpumask_var(cleanup_mask);
}
@@ -2272,31 +2321,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
}
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
/*
* Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
* leaves desc->affinity untouched.
*/
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
+ unsigned int *dest_id)
{
struct irq_cfg *cfg;
unsigned int irq;
if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
+ return -1;
irq = desc->irq;
cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
+ return -1;
cpumask_copy(desc->affinity, mask);
- return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+ *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+ return 0;
}
static int
@@ -2311,15 +2359,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
irq = desc->irq;
cfg = desc->chip_data;
- spin_lock_irqsave(&ioapic_lock, flags);
- dest = set_desc_affinity(desc, mask);
- if (dest != BAD_APICID) {
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ret = set_desc_affinity(desc, mask, &dest);
+ if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
__target_IO_APIC_irq(irq, dest, cfg);
- ret = 0;
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
@@ -2432,8 +2479,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
continue;
cfg = irq_cfg(irq);
- spin_lock(&desc->lock);
- if (!cfg->move_cleanup_count)
+ raw_spin_lock(&desc->lock);
+
+ /*
+ * Check if the irq migration is in progress. If so, we
+ * haven't received the cleanup request yet for this irq.
+ */
+ if (cfg->move_in_progress)
goto unlock;
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
@@ -2452,29 +2504,40 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__get_cpu_var(vector_irq)[vector] = -1;
- cfg->move_cleanup_count--;
unlock:
- spin_unlock(&desc->lock);
+ raw_spin_unlock(&desc->lock);
}
irq_exit();
}
-static void irq_complete_move(struct irq_desc **descp)
+static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
{
struct irq_desc *desc = *descp;
struct irq_cfg *cfg = desc->chip_data;
- unsigned vector, me;
+ unsigned me;
if (likely(!cfg->move_in_progress))
return;
- vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
send_cleanup_vector(cfg);
}
+
+static void irq_complete_move(struct irq_desc **descp)
+{
+ __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
+
+ __irq_complete_move(&desc, cfg->vector);
+}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
@@ -2490,6 +2553,59 @@ static void ack_apic_edge(unsigned int irq)
atomic_t irq_mis_count;
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+*/
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (mp_ioapics[entry->apic].apicver >= 0x20) {
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ if (irq_remapped(irq))
+ io_apic_eoi(entry->apic, entry->pin);
+ else
+ io_apic_eoi(entry->apic, cfg->vector);
+ } else {
+ __mask_and_edge_IO_APIC_irq(entry);
+ __unmask_and_level_IO_APIC_irq(entry);
+ }
+ }
+}
+
+static void eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ack_apic_level(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2641,19 @@ static void ack_apic_level(unsigned int irq)
* level-triggered interrupt. We mask the source for the time of the
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
+ *
+ * Also in the case when cpu goes offline, fixup_irqs() will forward
+ * any unhandled interrupt on the offlined cpu to the new cpu
+ * destination that is handling the corresponding interrupt. This
+ * interrupt forwarding is done via IPI's. Hence, in this case also
+ * level-triggered io-apic interrupt will be seen as an edge
+ * interrupt in the IRR. And we can't rely on the cpu's EOI
+ * to be broadcasted to the IO-APIC's which will clear the remoteIRR
+ * corresponding to the level-triggered interrupt. Hence on IO-APIC's
+ * supporting EOI register, we do an explicit EOI to clear the
+ * remote IRR and on IO-APIC's which don't have an EOI register,
+ * we use the above logic (mask+edge followed by unmask+level) from
+ * Manfred Spraul to clear the remote IRR.
*/
cfg = desc->chip_data;
i = cfg->vector;
@@ -2536,6 +2665,19 @@ static void ack_apic_level(unsigned int irq)
*/
ack_APIC_irq();
+ /*
+ * Tail end of clearing remote IRR bit (either by delivering the EOI
+ * message via io-apic EOI register write or simulating it using
+ * mask+edge followed by unnask+level logic) manually when the
+ * level triggered interrupt is seen as the edge triggered interrupt
+ * at the cpu.
+ */
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+
+ eoi_ioapic_irq(desc);
+ }
+
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2569,41 +2711,9 @@ static void ack_apic_level(unsigned int irq)
move_masked_irq(irq);
unmask_IO_APIC_irq_desc(desc);
}
-
- /* Tail end of version 0x11 I/O APIC bug workaround */
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(cfg);
- __unmask_and_level_IO_APIC_irq(cfg);
- spin_unlock(&ioapic_lock);
- }
}
#ifdef CONFIG_INTR_REMAP
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, cfg->irq_2_pin)
- io_apic_eoi(entry->apic, entry->pin);
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
static void ir_ack_apic_edge(unsigned int irq)
{
ack_APIC_irq();
@@ -2671,8 +2781,8 @@ static inline void init_IO_APIC_traps(void)
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < nr_legacy_irqs)
- make_8259A_irq(irq);
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
@@ -2829,7 +2939,7 @@ static inline void __init check_timer(void)
/*
* get/set the timer IRQ vector:
*/
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
assign_irq_vector(0, cfg, apic->target_cpus());
/*
@@ -2842,7 +2952,7 @@ static inline void __init check_timer(void)
* automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
+ legacy_pic->init(1);
#ifdef CONFIG_X86_32
{
unsigned int ver;
@@ -2901,7 +3011,7 @@ static inline void __init check_timer(void)
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -2924,14 +3034,14 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
setup_nmi();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
goto out;
}
@@ -2939,7 +3049,7 @@ static inline void __init check_timer(void)
* Cleanup, just in case ...
*/
local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
@@ -2958,22 +3068,22 @@ static inline void __init check_timer(void)
lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
local_irq_disable();
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as ExtINT IRQ...\n");
- init_8259A(0);
- make_8259A_irq(0);
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
@@ -3015,7 +3125,7 @@ void __init setup_IO_APIC(void)
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
- io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+ io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
/*
@@ -3026,7 +3136,7 @@ void __init setup_IO_APIC(void)
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
- if (nr_legacy_irqs)
+ if (legacy_pic->nr_legacy_irqs)
check_timer();
}
@@ -3075,13 +3185,13 @@ static int ioapic_resume(struct sys_device *dev)
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
reg_00.bits.ID = mp_ioapics[dev->id].apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
@@ -3144,7 +3254,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
if (irq_want < nr_irqs_gsi)
irq_want = nr_irqs_gsi;
- spin_lock_irqsave(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
@@ -3157,19 +3267,17 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
continue;
desc_new = move_irq_desc(desc_new, node);
+ cfg_new = desc_new->chip_data;
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
}
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (irq > 0)
+ dynamic_irq_init_keep_chip_data(irq);
- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
return irq;
}
@@ -3191,27 +3299,21 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- /* store it, in case dynamic_irq_cleanup clear it */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- desc->chip_data = cfg;
+ dynamic_irq_cleanup_keep_chip_data(irq);
free_irte(irq);
- spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
- spin_unlock_irqrestore(&vector_lock, flags);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __clear_irq_vector(irq, get_irq_chip_data(irq));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
}
/*
* MSI message composition
*/
#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ struct msi_msg *msg, u8 hpet_id)
{
struct irq_cfg *cfg;
int err;
@@ -3245,7 +3347,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
irte.dest_id = IRTE_DEST(dest);
/* Set source-id of interrupt request */
- set_msi_sid(&irte, pdev);
+ if (pdev)
+ set_msi_sid(&irte, pdev);
+ else
+ set_hpet_sid(&irte, hpet_id);
modify_irte(irq, &irte);
@@ -3291,8 +3396,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
struct msi_msg msg;
unsigned int dest;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
+ if (set_desc_affinity(desc, mask, &dest))
return -1;
cfg = desc->chip_data;
@@ -3324,8 +3428,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
if (get_irte(irq, &irte))
return -1;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
+ if (set_desc_affinity(desc, mask, &dest))
return -1;
irte.vector = cfg->vector;
@@ -3410,7 +3513,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(dev, irq, &msg);
+ ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
return ret;
@@ -3507,8 +3610,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
struct msi_msg msg;
unsigned int dest;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
+ if (set_desc_affinity(desc, mask, &dest))
return -1;
cfg = desc->chip_data;
@@ -3543,7 +3645,7 @@ int arch_setup_dmar_msi(unsigned int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(NULL, irq, &msg);
+ ret = msi_compose_msg(NULL, irq, &msg, -1);
if (ret < 0)
return ret;
dmar_msi_write(irq, &msg);
@@ -3563,8 +3665,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
struct msi_msg msg;
unsigned int dest;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
+ if (set_desc_affinity(desc, mask, &dest))
return -1;
cfg = desc->chip_data;
@@ -3583,6 +3684,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
+static struct irq_chip ir_hpet_msi_type = {
+ .name = "IR-HPET_MSI",
+ .unmask = hpet_msi_unmask,
+ .mask = hpet_msi_mask,
+#ifdef CONFIG_INTR_REMAP
+ .ack = ir_ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = ir_set_msi_irq_affinity,
+#endif
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
.unmask = hpet_msi_unmask,
@@ -3594,20 +3708,36 @@ static struct irq_chip hpet_msi_type = {
.retrigger = ioapic_retrigger_irq,
};
-int arch_setup_hpet_msi(unsigned int irq)
+int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
{
int ret;
struct msi_msg msg;
struct irq_desc *desc = irq_to_desc(irq);
- ret = msi_compose_msg(NULL, irq, &msg);
+ if (intr_remapping_enabled) {
+ struct intel_iommu *iommu = map_hpet_to_ir(id);
+ int index;
+
+ if (!iommu)
+ return -1;
+
+ index = alloc_irte(iommu, irq, 1);
+ if (index < 0)
+ return -1;
+ }
+
+ ret = msi_compose_msg(NULL, irq, &msg, id);
if (ret < 0)
return ret;
hpet_msi_write(irq, &msg);
desc->status |= IRQ_MOVE_PCNTXT;
- set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
- "edge");
+ if (irq_remapped(irq))
+ set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
+ handle_edge_irq, "edge");
+ else
+ set_irq_chip_and_handler_name(irq, &hpet_msi_type,
+ handle_edge_irq, "edge");
return 0;
}
@@ -3641,8 +3771,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
struct irq_cfg *cfg;
unsigned int dest;
- dest = set_desc_affinity(desc, mask);
- if (dest == BAD_APICID)
+ if (set_desc_affinity(desc, mask, &dest))
return -1;
cfg = desc->chip_data;
@@ -3708,83 +3837,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-#ifdef CONFIG_X86_UV
-/*
- * Re-target the irq to the specified CPU and enable the specified MMR located
- * on the specified blade to allow the sending of MSIs to the specified CPU.
- */
-int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset)
-{
- const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_cfg *cfg;
- int mmr_pnode;
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- unsigned long flags;
- int err;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- cfg = irq_cfg(irq);
-
- err = assign_irq_vector(irq, cfg, eligible_cpu);
- if (err != 0)
- return err;
-
- spin_lock_irqsave(&vector_lock, flags);
- set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
- irq_name);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- return irq;
-}
-
-/*
- * Disable the specified MMR located on the specified blade so that MSIs are
- * longer allowed to be sent.
- */
-void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
-{
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- int mmr_pnode;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->mask = 1;
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-}
-#endif /* CONFIG_X86_64 */
-
int __init io_apic_get_redir_entries (int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.entries;
}
@@ -3867,7 +3927,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= nr_legacy_irqs) {
+ if (irq >= legacy_pic->nr_legacy_irqs) {
cfg = desc->chip_data;
if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3944,11 +4004,11 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
*/
if (physids_empty(apic_id_map))
- apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map);
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (apic_id >= get_physical_broadcast()) {
printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3960,10 +4020,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(apic_id_map, apic_id)) {
+ if (apic->check_apicid_used(&apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(apic_id_map, i))
+ if (!apic->check_apicid_used(&apic_id_map, i))
break;
}
@@ -3976,16 +4036,16 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
apic_id = i;
}
- tmp = apic->apicid_to_cpu_present(apic_id);
+ apic->apicid_to_cpu_present(apic_id, &tmp);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(ioapic, 0, reg_00.raw);
reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
@@ -4006,9 +4066,9 @@ int __init io_apic_get_version(int ioapic)
union IO_APIC_reg_01 reg_01;
unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
@@ -4040,27 +4100,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic = 0, irq, irq_entry;
+ int pin, ioapic, irq, irq_entry;
struct irq_desc *desc;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
-#ifdef CONFIG_ACPI
- if (!acpi_disabled && acpi_ioapic) {
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- ioapic = 0;
- }
-#endif
-
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
irq = pin_2_irq(irq_entry, ioapic, pin);
+ if ((ioapic > 0) && (irq > 16))
+ continue;
+
desc = irq_to_desc(irq);
/*
@@ -4106,7 +4162,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
for (i = 0; i < nr_ioapics; i++) {
res[i].name = mem;
res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- sprintf(mem, "IOAPIC %u", i);
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
}
@@ -4140,18 +4196,17 @@ void __init ioapic_init_mappings(void)
#ifdef CONFIG_X86_32
fake_ioapic_page:
#endif
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+ ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
- ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
}
@@ -4246,3 +4301,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
nr_ioapics++;
}
+
+/* Enable IOAPIC early just for system timer */
+void __init pre_init_apic_IRQ0(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+
+ printk(KERN_INFO "Early APIC setup for system timer0\n");
+#ifndef CONFIG_SMP
+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+#endif
+ desc = irq_to_desc_alloc_node(0, 0);
+
+ setup_local_APIC();
+
+ cfg = irq_cfg(0);
+ add_pin_to_irq_node(cfg, 0, 0, 0);
+ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+
+ setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..8aa65adbd25d 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
int unknown_nmi_panic;
int nmi_watchdog_enabled;
-static cpumask_t backtrace_mask __read_mostly;
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
@@ -176,7 +177,7 @@ int __init check_nmi_watchdog(void)
error:
if (nmi_watchdog == NMI_IO_APIC) {
if (!timer_through_8259)
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
@@ -360,7 +361,7 @@ void stop_apic_nmi_watchdog(void *unused)
*/
static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(long, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);
void touch_nmi_watchdog(void)
@@ -414,15 +415,15 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
}
/* We can be called before check_nmi_watchdog, hence NULL check. */
- if (cpumask_test_cpu(cpu, &backtrace_mask)) {
- static DEFINE_SPINLOCK(lock); /* Serialise the printks */
+ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+ static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
- spin_lock(&lock);
+ raw_spin_lock(&lock);
printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
show_regs(regs);
dump_stack();
- spin_unlock(&lock);
- cpumask_clear_cpu(cpu, &backtrace_mask);
+ raw_spin_unlock(&lock);
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
rc = 1;
}
@@ -437,8 +438,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
- local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
+ __this_cpu_inc(alert_counter);
+ if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
@@ -446,7 +447,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
regs, panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
- local_set(&__get_cpu_var(alert_counter), 0);
+ __this_cpu_write(alert_counter, 0);
}
/* see if the nmi watchdog went off */
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
{
int i;
- cpumask_copy(&backtrace_mask, cpu_online_mask);
+ cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
printk(KERN_INFO "sending NMI to all CPUs:\n");
apic->send_IPI_all(NMI_VECTOR);
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(&backtrace_mask))
+ if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
mpc_record = 0;
printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
+ "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
printk(KERN_WARNING
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
static __init void early_check_numaq(void)
{
/*
- * Find possible boot-time SMP configuration:
- */
- early_find_smp_config();
-
- /*
* get boot-time SMP configuration:
*/
if (smp_found_config)
@@ -282,6 +277,7 @@ static __init void early_check_numaq(void)
x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
x86_init.timers.tsc_pre_init = numaq_tsc_init;
+ x86_init.pci.init = pci_numaq_init;
}
}
@@ -334,10 +330,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
return cpu_all_mask;
}
-static inline unsigned long
-numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
{
- return physid_isset(apicid, bitmap);
+ return physid_isset(apicid, *map);
}
static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +366,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
return apic != 0 && irq == 0;
}
-static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map)
+static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL);
+ return physids_promote(0xFUL, retmap);
}
static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +397,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
return logical_apicid >> 4;
}
-static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
+static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
{
int node = numaq_apicid_to_node(logical_apicid);
int cpu = __ffs(logical_apicid & 0xf);
- return physid_mask_of_physid(cpu + 4*node);
+ physid_set_mask_of_physid(cpu + 4*node, retmap);
}
/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
}
late_initcall(print_ipi_mode);
-void default_setup_apic_routing(void)
+void __init default_setup_apic_routing(void)
+{
+ int version = apic_version[boot_cpu_physical_apicid];
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
+#ifdef CONFIG_X86_BIGSMP
+ generic_bigsmp_probe();
+#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+}
+
+static void setup_apic_flat_routing(void)
{
#ifdef CONFIG_X86_IO_APIC
printk(KERN_INFO
@@ -103,12 +128,12 @@ struct apic apic_default = {
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = default_setup_apic_routing,
+ .setup_apic_routing = setup_apic_flat_routing,
.multi_timer_check = NULL,
.apicid_to_node = default_apicid_to_node,
.cpu_to_logical_apicid = default_cpu_to_logical_apicid,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = default_apicid_to_cpu_present,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
.setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
}
#endif
- if (apic == &apic_flat) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (num_processors > 8)
- apic = &apic_physflat;
- break;
- case X86_VENDOR_AMD:
- if (max_physical_apicid >= 8)
- apic = &apic_physflat;
- }
- }
+ if (apic == &apic_flat && num_possible_cpus() > 8)
+ apic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
return cpumask_of(0);
}
-static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
+static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
return BAD_APICID;
}
-static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map)
+static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
- return physids_promote(0x0F);
+ physids_promote(0x0FL, retmap);
}
-static physid_mask_t summit_apicid_to_cpu_present(int apicid)
+static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
{
- return physid_mask_of_physid(0);
+ physid_set_mask_of_physid(0, retmap);
}
static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
break;
}
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- return BAD_APICID;
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
}
static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
break;
}
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
+ return per_cpu(x86_cpu_to_apicid, cpu);
}
static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f5f5886a6b53..49dbeaef2a27 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -20,6 +20,8 @@
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/kdebug.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -30,10 +32,27 @@
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/smp.h>
+#include <asm/x86_init.h>
DEFINE_PER_CPU(int, x2apic_extra_bits);
+#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
+
static enum uv_system_type uv_system_type;
+static u64 gru_start_paddr, gru_end_paddr;
+int uv_min_hub_revision_id;
+EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+static DEFINE_SPINLOCK(uv_nmi_lock);
+
+static inline bool is_GRU_range(u64 start, u64 end)
+{
+ return start >= gru_start_paddr && end <= gru_end_paddr;
+}
+
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+ return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
static int early_get_nodeid(void)
{
@@ -43,19 +62,28 @@ static int early_get_nodeid(void)
mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
node_id.v = *mmr;
early_iounmap(mmr, sizeof(*mmr));
+
+ /* Currently, all blades have same revision number */
+ uv_min_hub_revision_id = node_id.s.revision;
+
return node_id.s.node_id;
}
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
+ int nodeid;
+
if (!strcmp(oem_id, "SGI")) {
+ nodeid = early_get_nodeid();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
if (!strcmp(oem_table_id, "UVL"))
uv_system_type = UV_LEGACY_APIC;
else if (!strcmp(oem_table_id, "UVX"))
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
__get_cpu_var(x2apic_extra_bits) =
- early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
+ nodeid << (UV_APIC_PNODE_SHIFT - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
return 1;
}
@@ -92,11 +120,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-
static const struct cpumask *uv_target_cpus(void)
{
- return cpumask_of(0);
+ return cpu_online_mask;
}
static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -212,10 +238,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- if (cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
+ return per_cpu(x86_cpu_to_apicid, cpu);
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -352,25 +375,25 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
alias.v = uv_read_local_mmr(redir_addrs[i].alias);
- if (alias.s.base == 0) {
+ if (alias.s.enable && alias.s.base == 0) {
*size = (1UL << alias.s.m_alias);
redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
return;
}
}
- BUG();
+ *base = *size = 0;
}
enum map_type {map_wb, map_uc};
-static __init void map_high(char *id, unsigned long base, int shift,
- int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift,
+ int bshift, int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
- paddr = base << shift;
- bytes = (1UL << shift) * (max_pnode + 1);
+ paddr = base << pshift;
+ bytes = (1UL << bshift) * (max_pnode + 1);
printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
paddr + bytes);
if (map_type == map_uc)
@@ -385,8 +408,12 @@ static __init void map_gru_high(int max_pnode)
int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
- if (gru.s.enable)
- map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
+ if (gru.s.enable) {
+ map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)gru.s.base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+
+ }
}
static __init void map_mmr_high(int max_pnode)
@@ -396,7 +423,7 @@ static __init void map_mmr_high(int max_pnode)
mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
+ map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
}
static __init void map_mmioh_high(int max_pnode)
@@ -406,7 +433,14 @@ static __init void map_mmioh_high(int max_pnode)
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
+ map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+ max_pnode, map_uc);
+}
+
+static __init void map_low_mmrs(void)
+{
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
}
static __init void uv_rtc_init(void)
@@ -452,7 +486,7 @@ static void uv_heartbeat(unsigned long ignored)
static void __cpuinit uv_heartbeat_enable(int cpu)
{
- if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ while (!uv_cpu_hub_info(cpu)->scir.enabled) {
struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -460,11 +494,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_hub_info(cpu)->scir.enabled = 1;
- }
- /* check boot cpu */
- if (!uv_cpu_hub_info(0)->scir.enabled)
- uv_heartbeat_enable(0);
+ /* also ensure that boot cpu is enabled */
+ cpu = 0;
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -523,6 +556,30 @@ late_initcall(uv_init_heartbeat);
#endif /* !CONFIG_HOTPLUG_CPU */
+/* Direct Legacy VGA I/O traffic to designated IOH */
+int uv_set_vga_state(struct pci_dev *pdev, bool decode,
+ unsigned int command_bits, bool change_bridge)
+{
+ int domain, bus, rc;
+
+ PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
+ pdev->devfn, decode, command_bits, change_bridge);
+
+ if (!change_bridge)
+ return 0;
+
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
+
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
+
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+ PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
+
+ return rc;
+}
+
/*
* Called on each cpu to initialize the per_cpu UV data area.
* FIXME: hotplug not supported yet
@@ -539,6 +596,46 @@ void __cpuinit uv_cpu_init(void)
set_x2apic_extra_bits(uv_hub_info->pnode);
}
+/*
+ * When NMI is received, print a stack trace.
+ */
+int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
+{
+ if (reason != DIE_NMI_IPI)
+ return NOTIFY_OK;
+ /*
+ * Use a lock so only one cpu prints at a time
+ * to prevent intermixed output.
+ */
+ spin_lock(&uv_nmi_lock);
+ pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+ dump_stack();
+ spin_unlock(&uv_nmi_lock);
+
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block uv_dump_stack_nmi_nb = {
+ .notifier_call = uv_handle_nmi
+};
+
+void uv_register_nmi_notifier(void)
+{
+ if (register_die_notifier(&uv_dump_stack_nmi_nb))
+ printk(KERN_WARNING "UV NMI handler failed to register\n");
+}
+
+void uv_nmi_init(void)
+{
+ unsigned int value;
+
+ /*
+ * Unmask NMI on all cpus
+ */
+ value = apic_read(APIC_LVT1) | APIC_DM_NMI;
+ value &= ~APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
void __init uv_system_init(void)
{
@@ -550,6 +647,8 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
+ map_low_mmrs();
+
m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
@@ -602,13 +701,15 @@ void __init uv_system_init(void)
}
uv_bios_init();
- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- &sn_coherency_id, &sn_region_size);
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
+ &sn_region_size, &system_serial_number);
uv_rtc_init();
for_each_present_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
nid = cpu_to_node(cpu);
- pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+ pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
uv_blade_info[blade].nr_possible_cpus++;
@@ -619,25 +720,23 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
- uv_cpu_hub_info(cpu)->n_val = m_val;
+ uv_cpu_hub_info(cpu)->n_val = n_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
uv_cpu_hub_info(cpu)->pnode = pnode;
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
- uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
+ uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
- uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
+ uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
- printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, "
- "lcpu %d, blade %d\n",
- cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
- lcpu, blade);
+ printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
+ cpu, apicid, pnode, nid, lcpu, blade);
}
/* Add blade/pnode info for nodes without cpus */
@@ -658,5 +757,9 @@ void __init uv_system_init(void)
uv_cpu_init();
uv_scir_register_cpu_notifier();
+ uv_register_nmi_notifier();
proc_mkdir("sgi_uv", NULL);
+
+ /* register Legacy VGA I/O redirection handler */
+ pci_register_set_vga_state(uv_set_vga_state);
}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
#include <linux/module.h>
#include <linux/poll.h>
-#include <linux/smp_lock.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
static struct apm_user *user_list;
static DEFINE_SPINLOCK(user_list_lock);
+static DEFINE_MUTEX(apm_mutex);
/*
* Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
return -EPERM;
switch (cmd) {
case APM_IOC_STANDBY:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->standbys_read > 0) {
as->standbys_read--;
as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_STANDBY, as);
if (standbys_pending <= 0)
standby();
- unlock_kernel();
+ mutex_unlock(&apm_mutex);
break;
case APM_IOC_SUSPEND:
- lock_kernel();
+ mutex_lock(&apm_mutex);
if (as->suspends_read > 0) {
as->suspends_read--;
as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
queue_event(APM_USER_SUSPEND, as);
if (suspends_pending <= 0) {
ret = suspend(1);
+ mutex_unlock(&apm_mutex);
} else {
as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
wait_event_interruptible(apm_suspend_waitqueue,
as->suspend_wait == 0);
ret = as->suspend_result;
}
- unlock_kernel();
return ret;
default:
return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
{
struct apm_user *as;
- lock_kernel();
as = kmalloc(sizeof(*as), GFP_KERNEL);
if (as == NULL) {
printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
sizeof(*as));
- unlock_kernel();
return -ENOMEM;
}
as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
user_list = as;
spin_unlock(&user_list_lock);
filp->private_data = as;
- unlock_kernel();
return 0;
}
@@ -1994,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
apm_info.disabled = 1;
printk(KERN_INFO "%s machine detected. "
"Disabling APM.\n", d->ident);
- printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
- printk(KERN_INFO "download from support.intel.com \n");
+ printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+ printk(KERN_INFO "download from support.intel.com\n");
}
return 0;
}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
*/
#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
struct uv_systab *tab = &uv_systab;
+ s64 ret;
if (!tab->function)
/*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
*/
return BIOS_STATUS_UNIMPLEMENTED;
- return efi_call6((void *)__va(tab->function),
- (u64)which, a1, a2, a3, a4, a5);
+ ret = efi_call6((void *)__va(tab->function), (u64)which,
+ a1, a2, a3, a4, a5);
+ return ret;
}
+EXPORT_SYMBOL_GPL(uv_bios_call);
s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
EXPORT_SYMBOL_GPL(sn_coherency_id);
long sn_region_size;
EXPORT_SYMBOL_GPL(sn_region_size);
+long system_serial_number;
+EXPORT_SYMBOL_GPL(system_serial_number);
int uv_type;
+EXPORT_SYMBOL_GPL(uv_type);
s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
- long *region)
+ long *region, long *ssn)
{
s64 ret;
u64 v0, v1;
@@ -97,25 +103,24 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
*coher = part.coherence_id;
if (region)
*region = part.region_size;
+ if (ssn)
+ *ssn = v1;
return ret;
}
+EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
int
-uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
+uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
unsigned long *intr_mmr_offset)
{
- union uv_watchlist_u size_blade;
u64 watchlist;
s64 ret;
- size_blade.size = mq_size;
- size_blade.blade = blade;
-
/*
* bios returns watchlist number or negative error number.
*/
ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
- size_blade.val, (u64)intr_mmr_offset,
+ mq_size, (u64)intr_mmr_offset,
(u64)&watchlist, 0);
if (ret < BIOS_STATUS_SUCCESS)
return ret;
@@ -158,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
}
EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+/*
+ * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
+ * @decode: true to enable target, false to disable target
+ * @domain: PCI domain number
+ * @bus: PCI bus number
+ *
+ * Returns:
+ * 0: Success
+ * -EINVAL: Invalid domain or bus number
+ * -ENOSYS: Capability not available
+ * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
+ */
+int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
+{
+ return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
+ (u64)decode, (u64)domain, (u64)bus, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
+
#ifdef CONFIG_EFI
void uv_bios_init(void)
@@ -189,4 +213,3 @@ void uv_bios_init(void)
void uv_bios_init(void) { }
#endif
-
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
# Don't trace early stages of a secondary CPU boot
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
endif
# Make sure load_percpu_segment has no stackprotector
@@ -18,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
-obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
-
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
{ 0, 0, 0, 0 }
};
@@ -74,6 +78,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
+ static bool printed;
if (c->cpuid_level < 0xb)
return;
@@ -127,12 +132,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
+ if (!printed) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
+ c->phys_proc_id);
+ if (c->x86_max_cores > 1)
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+ c->cpu_core_id);
+ printed = 1;
+ }
return;
#endif
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
/*
* Fixup core topology information for AMD multi-node processors.
- * Assumption 1: Number of cores in each internal node is the same.
- * Assumption 2: Mixed systems with both single-node and dual-node
- * processors are not supported.
+ * Assumption: Number of cores in each internal node is the same.
*/
#ifdef CONFIG_X86_HT
static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_PCI
- u32 t, cpn;
- u8 n, n_id;
+ unsigned long long value;
+ u32 nodes, cores_per_node;
int cpu = smp_processor_id();
+ if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
+ return;
+
/* fixup topology information only once for a core */
if (cpu_has(c, X86_FEATURE_AMD_DCM))
return;
- /* check for multi-node processor on boot cpu */
- t = read_pci_config(0, 24, 3, 0xe8);
- if (!(t & (1 << 29)))
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+
+ nodes = ((value >> 3) & 7) + 1;
+ if (nodes == 1)
return;
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ cores_per_node = c->x86_max_cores / nodes;
- /* cores per node: each internal node has half the number of cores */
- cpn = c->x86_max_cores >> 1;
-
- /* even-numbered NB_id of this dual-node processor */
- n = c->phys_proc_id << 1;
-
- /*
- * determine internal node id and assign cores fifty-fifty to
- * each node of the dual-node processor
- */
- t = read_pci_config(0, 24 + n, 3, 0xe8);
- n = (t>>30) & 0x3;
- if (n == 0) {
- if (c->cpu_core_id < cpn)
- n_id = 0;
- else
- n_id = 1;
- } else {
- if (c->cpu_core_id < cpn)
- n_id = 1;
- else
- n_id = 0;
- }
-
- /* compute entire NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
+ /* store NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = value & 7;
- /* fixup core id to be in range from 0 to cpn */
- c->cpu_core_id = c->cpu_core_id % cpn;
-#endif
+ /* fixup core id to be in range from 0 to (cores_per_node - 1) */
+ c->cpu_core_id = c->cpu_core_id % cores_per_node;
}
#endif
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
node = nearby_node(apicid);
}
numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
/* Multi core CPU? */
if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
}
enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
#else
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
}
}
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
if (n >= 0x80000005) {
cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
- printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
- edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
c->x86_cache_size = (ecx>>24) + (edx>>24);
#ifdef CONFIG_X86_64
/* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
#endif
c->x86_cache_size = l2size;
-
- printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
- l2size, ecx & 0xFF);
}
void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_HT
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
+ static bool printed;
if (!cpu_has(c, X86_FEATURE_HT))
return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
smp_num_siblings = (ebx & 0xff0000) >> 16;
if (smp_num_siblings == 1) {
- printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
+ printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
goto out;
}
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
((1 << core_bits) - 1);
out:
- if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
c->cpu_core_id);
+ printed = 1;
}
#endif
}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
const struct cpu_dev *const *cdev;
int count = 0;
+#ifdef PROCESSOR_SELECT
printk(KERN_INFO "KERNEL supported cpus:\n");
+#endif
+
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
- unsigned int j;
if (count >= X86_VENDOR_NUM)
break;
cpu_devs[count] = cpudev;
count++;
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
+#ifdef PROCESSOR_SELECT
+ {
+ unsigned int j;
+
+ for (j = 0; j < 2; j++) {
+ if (!cpudev->c_ident[j])
+ continue;
+ printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
+ cpudev->c_ident[j]);
+ }
}
+#endif
}
-
early_identify_cpu(&boot_cpu_data);
}
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
-#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
- mcheck_init(c);
-#endif
+ mcheck_cpu_init(c);
select_idle_routine(c);
@@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void)
void __cpuinit cpu_init(void)
{
- struct orig_ist *orig_ist;
+ struct orig_ist *oist;
struct task_struct *me;
struct tss_struct *t;
unsigned long v;
@@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void)
cpu = stack_smp_processor_id();
t = &per_cpu(init_tss, cpu);
- orig_ist = &per_cpu(orig_ist, cpu);
+ oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
panic("CPU#%d already initialized!\n", cpu);
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ pr_debug("Initializing CPU#%d\n", cpu);
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
@@ -1136,19 +1138,19 @@ void __cpuinit cpu_init(void)
wrmsrl(MSR_KERNEL_GS_BASE, 0);
barrier();
- check_efer();
+ x86_configure_nx();
if (cpu != 0)
enable_x2apic();
/*
* set up and load the per-CPU TSS
*/
- if (!orig_ist->ist[0]) {
+ if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu);
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
- orig_ist->ist[v] = t->x86_tss.ist[v] =
+ oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
}
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
-extern void display_cacheinfo(struct cpuinfo_x86 *c);
+extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
#endif
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index dca325c03999..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
-/*
- * CPU x86 architecture debug code
- *
- * Copyright(C) 2009 Jaswinder Singh Rajput
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-
-#include <asm/cpu_debug.h>
-#include <asm/paravirt.h>
-#include <asm/system.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/desc.h>
-
-static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
-static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
-static DEFINE_PER_CPU(int, cpu_priv_count);
-
-static DEFINE_MUTEX(cpu_debug_lock);
-
-static struct dentry *cpu_debugfs_dir;
-
-static struct cpu_debug_base cpu_base[] = {
- { "mc", CPU_MC, 0 },
- { "monitor", CPU_MONITOR, 0 },
- { "time", CPU_TIME, 0 },
- { "pmc", CPU_PMC, 1 },
- { "platform", CPU_PLATFORM, 0 },
- { "apic", CPU_APIC, 0 },
- { "poweron", CPU_POWERON, 0 },
- { "control", CPU_CONTROL, 0 },
- { "features", CPU_FEATURES, 0 },
- { "lastbranch", CPU_LBRANCH, 0 },
- { "bios", CPU_BIOS, 0 },
- { "freq", CPU_FREQ, 0 },
- { "mtrr", CPU_MTRR, 0 },
- { "perf", CPU_PERF, 0 },
- { "cache", CPU_CACHE, 0 },
- { "sysenter", CPU_SYSENTER, 0 },
- { "therm", CPU_THERM, 0 },
- { "misc", CPU_MISC, 0 },
- { "debug", CPU_DEBUG, 0 },
- { "pat", CPU_PAT, 0 },
- { "vmx", CPU_VMX, 0 },
- { "call", CPU_CALL, 0 },
- { "base", CPU_BASE, 0 },
- { "ver", CPU_VER, 0 },
- { "conf", CPU_CONF, 0 },
- { "smm", CPU_SMM, 0 },
- { "svm", CPU_SVM, 0 },
- { "osvm", CPU_OSVM, 0 },
- { "tss", CPU_TSS, 0 },
- { "cr", CPU_CR, 0 },
- { "dt", CPU_DT, 0 },
- { "registers", CPU_REG_ALL, 0 },
-};
-
-static struct cpu_file_base cpu_file[] = {
- { "index", CPU_REG_ALL, 0 },
- { "value", CPU_REG_ALL, 1 },
-};
-
-/* CPU Registers Range */
-static struct cpu_debug_range cpu_reg_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, },
- { 0x00000006, 0x00000007, CPU_MONITOR, },
- { 0x00000010, 0x00000010, CPU_TIME, },
- { 0x00000011, 0x00000013, CPU_PMC, },
- { 0x00000017, 0x00000017, CPU_PLATFORM, },
- { 0x0000001B, 0x0000001B, CPU_APIC, },
- { 0x0000002A, 0x0000002B, CPU_POWERON, },
- { 0x0000002C, 0x0000002C, CPU_FREQ, },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, },
- { 0x00000040, 0x00000047, CPU_LBRANCH, },
- { 0x00000060, 0x00000067, CPU_LBRANCH, },
- { 0x00000079, 0x00000079, CPU_BIOS, },
- { 0x00000088, 0x0000008A, CPU_CACHE, },
- { 0x0000008B, 0x0000008B, CPU_BIOS, },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, },
- { 0x000000C1, 0x000000C4, CPU_PMC, },
- { 0x000000CD, 0x000000CD, CPU_FREQ, },
- { 0x000000E7, 0x000000E8, CPU_PERF, },
- { 0x000000FE, 0x000000FE, CPU_MTRR, },
-
- { 0x00000116, 0x0000011E, CPU_CACHE, },
- { 0x00000174, 0x00000176, CPU_SYSENTER, },
- { 0x00000179, 0x0000017B, CPU_MC, },
- { 0x00000186, 0x00000189, CPU_PMC, },
- { 0x00000198, 0x00000199, CPU_PERF, },
- { 0x0000019A, 0x0000019A, CPU_TIME, },
- { 0x0000019B, 0x0000019D, CPU_THERM, },
- { 0x000001A0, 0x000001A0, CPU_MISC, },
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, },
- { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, },
- { 0x00000250, 0x00000250, CPU_MTRR, },
- { 0x00000258, 0x00000259, CPU_MTRR, },
- { 0x00000268, 0x0000026F, CPU_MTRR, },
- { 0x00000277, 0x00000277, CPU_PAT, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, },
-
- { 0x00000300, 0x00000311, CPU_PMC, },
- { 0x00000345, 0x00000345, CPU_PMC, },
- { 0x00000360, 0x00000371, CPU_PMC, },
- { 0x0000038D, 0x00000390, CPU_PMC, },
- { 0x000003A0, 0x000003BE, CPU_PMC, },
- { 0x000003C0, 0x000003CD, CPU_PMC, },
- { 0x000003E0, 0x000003E1, CPU_PMC, },
- { 0x000003F0, 0x000003F2, CPU_PMC, },
-
- { 0x00000400, 0x00000417, CPU_MC, },
- { 0x00000480, 0x0000048B, CPU_VMX, },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, },
- { 0xC0000081, 0xC0000084, CPU_CALL, },
- { 0xC0000100, 0xC0000102, CPU_BASE, },
- { 0xC0000103, 0xC0000103, CPU_TIME, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, },
- { 0xC0010010, 0xC0010010, CPU_CONF, },
- { 0xC0010015, 0xC0010015, CPU_CONF, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, },
- { 0xC001001F, 0xC001001F, CPU_CONF, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, },
- { 0xC0010044, 0xC0010048, CPU_MC, },
- { 0xC0010050, 0xC0010056, CPU_SMM, },
- { 0xC0010058, 0xC0010058, CPU_CONF, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, },
- { 0xC0010061, 0xC0010068, CPU_SMM, },
- { 0xC0010069, 0xC001006B, CPU_SMM, },
- { 0xC0010070, 0xC0010071, CPU_SMM, },
- { 0xC0010111, 0xC0010113, CPU_SMM, },
- { 0xC0010114, 0xC0010118, CPU_SVM, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, },
- { 0xC0011022, 0xC0011023, CPU_CONF, },
-};
-
-static int is_typeflag_valid(unsigned cpu, unsigned flag)
-{
- int i;
-
- /* Standard Registers should be always valid */
- if (flag >= CPU_TSS)
- return 1;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (cpu_reg_range[i].flag == flag)
- return 1;
- }
-
- /* Invalid */
- return 0;
-}
-
-static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
- int index, unsigned flag)
-{
- if (cpu_reg_range[index].flag == flag) {
- *min = cpu_reg_range[index].min;
- *max = cpu_reg_range[index].max;
- } else
- *max = 0;
-
- return *max;
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_cpu_data(struct seq_file *seq, unsigned type,
- u32 low, u32 high)
-{
- struct cpu_private *priv;
- u64 val = high;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- val = (val << 32) | low;
- seq_printf(seq, "0x%llx\n", val);
- } else
- seq_printf(seq, " %08x: %08x_%08x\n",
- type, high, low);
- } else
- printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
-}
-
-/* This function can also be called with seq = NULL for printk */
-static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
-{
- unsigned msr, msr_min, msr_max;
- struct cpu_private *priv;
- u32 low, high;
- int i;
-
- if (seq) {
- priv = seq->private;
- if (priv->file) {
- if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
- &low, &high))
- print_cpu_data(seq, priv->reg, low, high);
- return;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
- continue;
-
- for (msr = msr_min; msr <= msr_max; msr++) {
- if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
- continue;
- print_cpu_data(seq, msr, low, high);
- }
- }
-}
-
-static void print_tss(void *arg)
-{
- struct pt_regs *regs = task_pt_regs(current);
- struct seq_file *seq = arg;
- unsigned int seg;
-
- seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
- seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
- seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
- seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
-
- seq_printf(seq, " RSI\t: %016lx\n", regs->si);
- seq_printf(seq, " RDI\t: %016lx\n", regs->di);
- seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
- seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
-
-#ifdef CONFIG_X86_64
- seq_printf(seq, " R08\t: %016lx\n", regs->r8);
- seq_printf(seq, " R09\t: %016lx\n", regs->r9);
- seq_printf(seq, " R10\t: %016lx\n", regs->r10);
- seq_printf(seq, " R11\t: %016lx\n", regs->r11);
- seq_printf(seq, " R12\t: %016lx\n", regs->r12);
- seq_printf(seq, " R13\t: %016lx\n", regs->r13);
- seq_printf(seq, " R14\t: %016lx\n", regs->r14);
- seq_printf(seq, " R15\t: %016lx\n", regs->r15);
-#endif
-
- asm("movl %%cs,%0" : "=r" (seg));
- seq_printf(seq, " CS\t: %04x\n", seg);
- asm("movl %%ds,%0" : "=r" (seg));
- seq_printf(seq, " DS\t: %04x\n", seg);
- seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
- asm("movl %%es,%0" : "=r" (seg));
- seq_printf(seq, " ES\t: %04x\n", seg);
- asm("movl %%fs,%0" : "=r" (seg));
- seq_printf(seq, " FS\t: %04x\n", seg);
- asm("movl %%gs,%0" : "=r" (seg));
- seq_printf(seq, " GS\t: %04x\n", seg);
-
- seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
-
- seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
-}
-
-static void print_cr(void *arg)
-{
- struct seq_file *seq = arg;
-
- seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
- seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
- seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
- seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
-#ifdef CONFIG_X86_64
- seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
-#endif
-}
-
-static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
-{
- seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
-}
-
-static void print_dt(void *seq)
-{
- struct desc_ptr dt;
- unsigned long ldt;
-
- /* IDT */
- store_idt((struct desc_ptr *)&dt);
- print_desc_ptr("IDT", seq, dt);
-
- /* GDT */
- store_gdt((struct desc_ptr *)&dt);
- print_desc_ptr("GDT", seq, dt);
-
- /* LDT */
- store_ldt(ldt);
- seq_printf(seq, " LDT\t: %016lx\n", ldt);
-
- /* TR */
- store_tr(ldt);
- seq_printf(seq, " TR\t: %016lx\n", ldt);
-}
-
-static void print_dr(void *arg)
-{
- struct seq_file *seq = arg;
- unsigned long dr;
- int i;
-
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
- get_debugreg(dr, i);
- seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
- }
-
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static void print_apic(void *arg)
-{
- struct seq_file *seq = arg;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(seq, " LAPIC\t:\n");
- seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
- seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
- seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
- seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
- seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
- seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
- seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
- seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
- seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
- seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
- seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
- seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
- seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
- seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
- seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
- seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
- seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
- seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
- seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
- seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
- seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- unsigned int i, v, maxeilvt;
-
- v = apic_read(APIC_EFEAT);
- maxeilvt = (v >> 16) & 0xff;
- seq_printf(seq, " EFEAT\t\t: %08x\n", v);
- seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
-
- for (i = 0; i < maxeilvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
- }
- }
-#endif /* CONFIG_X86_LOCAL_APIC */
- seq_printf(seq, "\n MSR\t:\n");
-}
-
-static int cpu_seq_show(struct seq_file *seq, void *v)
-{
- struct cpu_private *priv = seq->private;
-
- if (priv == NULL)
- return -EINVAL;
-
- switch (cpu_base[priv->type].flag) {
- case CPU_TSS:
- smp_call_function_single(priv->cpu, print_tss, seq, 1);
- break;
- case CPU_CR:
- smp_call_function_single(priv->cpu, print_cr, seq, 1);
- break;
- case CPU_DT:
- smp_call_function_single(priv->cpu, print_dt, seq, 1);
- break;
- case CPU_DEBUG:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_dr, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- case CPU_APIC:
- if (priv->file == CPU_INDEX_BIT)
- smp_call_function_single(priv->cpu, print_apic, seq, 1);
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
-
- default:
- print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
- break;
- }
- seq_printf(seq, "\n");
-
- return 0;
-}
-
-static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
-{
- if (*pos == 0) /* One time is enough ;-) */
- return seq;
-
- return NULL;
-}
-
-static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- (*pos)++;
-
- return cpu_seq_start(seq, pos);
-}
-
-static void cpu_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations cpu_seq_ops = {
- .start = cpu_seq_start,
- .next = cpu_seq_next,
- .stop = cpu_seq_stop,
- .show = cpu_seq_show,
-};
-
-static int cpu_seq_open(struct inode *inode, struct file *file)
-{
- struct cpu_private *priv = inode->i_private;
- struct seq_file *seq;
- int err;
-
- err = seq_open(file, &cpu_seq_ops);
- if (!err) {
- seq = file->private_data;
- seq->private = priv;
- }
-
- return err;
-}
-
-static int write_msr(struct cpu_private *priv, u64 val)
-{
- u32 low, high;
-
- high = (val >> 32) & 0xffffffff;
- low = val & 0xffffffff;
-
- if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
- return 0;
-
- return -EPERM;
-}
-
-static int write_cpu_register(struct cpu_private *priv, const char *buf)
-{
- int ret = -EPERM;
- u64 val;
-
- ret = strict_strtoull(buf, 0, &val);
- if (ret < 0)
- return ret;
-
- /* Supporting only MSRs */
- if (priv->type < CPU_TSS_BIT)
- return write_msr(priv, val);
-
- return ret;
-}
-
-static ssize_t cpu_write(struct file *file, const char __user *ubuf,
- size_t count, loff_t *off)
-{
- struct seq_file *seq = file->private_data;
- struct cpu_private *priv = seq->private;
- char buf[19];
-
- if ((priv == NULL) || (count >= sizeof(buf)))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, count))
- return -EFAULT;
-
- buf[count] = 0;
-
- if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
- if (!write_cpu_register(priv, buf))
- return count;
-
- return -EACCES;
-}
-
-static const struct file_operations cpu_fops = {
- .owner = THIS_MODULE,
- .open = cpu_seq_open,
- .read = seq_read,
- .write = cpu_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
- unsigned file, struct dentry *dentry)
-{
- struct cpu_private *priv = NULL;
-
- /* Already intialized */
- if (file == CPU_INDEX_BIT)
- if (per_cpu(cpu_arr[type].init, cpu))
- return 0;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (priv == NULL)
- return -ENOMEM;
-
- priv->cpu = cpu;
- priv->type = type;
- priv->reg = reg;
- priv->file = file;
- mutex_lock(&cpu_debug_lock);
- per_cpu(priv_arr[type], cpu) = priv;
- per_cpu(cpu_priv_count, cpu)++;
- mutex_unlock(&cpu_debug_lock);
-
- if (file)
- debugfs_create_file(cpu_file[file].name, S_IRUGO,
- dentry, (void *)priv, &cpu_fops);
- else {
- debugfs_create_file(cpu_base[type].name, S_IRUGO,
- per_cpu(cpu_arr[type].dentry, cpu),
- (void *)priv, &cpu_fops);
- mutex_lock(&cpu_debug_lock);
- per_cpu(cpu_arr[type].init, cpu) = 1;
- mutex_unlock(&cpu_debug_lock);
- }
-
- return 0;
-}
-
-static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
- struct dentry *dentry)
-{
- unsigned file;
- int err = 0;
-
- for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
- err = cpu_create_file(cpu, type, reg, file, dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned reg, reg_min, reg_max;
- int i, err = 0;
- char reg_dir[12];
- u32 low, high;
-
- for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
- if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
- cpu_base[type].flag))
- continue;
-
- for (reg = reg_min; reg <= reg_max; reg++) {
- if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
- continue;
-
- sprintf(reg_dir, "0x%x", reg);
- cpu_dentry = debugfs_create_dir(reg_dir, dentry);
- err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
- if (err)
- return err;
- }
- }
-
- return err;
-}
-
-static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
-{
- struct dentry *cpu_dentry = NULL;
- unsigned type;
- int err = 0;
-
- for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
- if (!is_typeflag_valid(cpu, cpu_base[type].flag))
- continue;
- cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
- per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
-
- if (type < CPU_TSS_BIT)
- err = cpu_init_msr(cpu, type, cpu_dentry);
- else
- err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
- cpu_dentry);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int cpu_init_cpu(void)
-{
- struct dentry *cpu_dentry = NULL;
- struct cpuinfo_x86 *cpui;
- char cpu_dir[12];
- unsigned cpu;
- int err = 0;
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
- cpui = &cpu_data(cpu);
- if (!cpu_has(cpui, X86_FEATURE_MSR))
- continue;
-
- sprintf(cpu_dir, "cpu%d", cpu);
- cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
- err = cpu_init_allreg(cpu, cpu_dentry);
-
- pr_info("cpu%d(%d) debug files %d\n",
- cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
- if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
- pr_err("Register files count %d exceeds limit %d\n",
- per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
- per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
- err = -ENFILE;
- }
- if (err)
- return err;
- }
-
- return err;
-}
-
-static int __init cpu_debug_init(void)
-{
- cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
-
- return cpu_init_cpu();
-}
-
-static void __exit cpu_debug_exit(void)
-{
- int i, cpu;
-
- if (cpu_debugfs_dir)
- debugfs_remove_recursive(cpu_debugfs_dir);
-
- for (cpu = 0; cpu < nr_cpu_ids; cpu++)
- for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
- kfree(per_cpu(priv_arr[i], cpu));
-}
-
-module_init(cpu_debug_init);
-module_exit(cpu_debug_exit);
-
-MODULE_AUTHOR("Jaswinder Singh Rajput");
-MODULE_DESCRIPTION("CPU Debug module");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
comment "CPUFreq processor drivers"
+config X86_PCC_CPUFREQ
+ tristate "Processor Clocking Control interface driver"
+ depends on ACPI && ACPI_PROCESSOR
+ help
+ This driver adds support for the PCC interface.
+
+ For details, take a look at:
+ <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called pcc-cpufreq.
+
+ If in doubt, say N.
+
config X86_ACPI_CPUFREQ
tristate "ACPI Processor P-States driver"
select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7d5c3b0ea8da..1b1920fa7c80 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,9 +68,9 @@ struct acpi_cpufreq_data {
unsigned int cpu_feature;
};
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
+static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-static DEFINE_PER_CPU(struct aperfmperf, old_perf);
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
/* acpi_perf_data is a pointer to percpu data. */
static struct acpi_processor_performance *acpi_perf_data;
@@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd)
static void drv_read(struct drv_cmd *cmd)
{
+ int err;
cmd->val = 0;
- smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1);
+ err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
+ WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
}
static void drv_write(struct drv_cmd *cmd)
@@ -214,14 +216,14 @@ static u32 get_cur_val(const struct cpumask *mask)
if (unlikely(cpumask_empty(mask)))
return 0;
- switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) {
+ switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data;
+ perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
@@ -268,8 +270,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
return 0;
- ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
- per_cpu(old_perf, cpu) = perf;
+ ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+ per_cpu(acfreq_old_perf, cpu) = perf;
retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
@@ -278,7 +280,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
+ struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
unsigned int freq;
unsigned int cached_freq;
@@ -322,7 +324,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int target_freq, unsigned int relation)
{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+ struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
struct acpi_processor_performance *perf;
struct cpufreq_freqs freqs;
struct drv_cmd cmd;
@@ -416,7 +418,7 @@ out:
static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+ struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
dprintk("acpi_cpufreq_verify\n");
@@ -526,15 +528,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
{
- /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf
+ /* Intel Xeon Processor 7100 Series Specification Update
+ * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
* AL30: A Machine Check Exception (MCE) Occurring during an
* Enhanced Intel SpeedStep Technology Ratio Change May Cause
- * Both Processor Cores to Lock Up when HT is enabled*/
+ * Both Processor Cores to Lock Up. */
if (c->x86_vendor == X86_VENDOR_INTEL) {
if ((c->x86 == 15) &&
(c->x86_model == 6) &&
- (c->x86_mask == 8) && smt_capable())
+ (c->x86_mask == 8)) {
+ printk(KERN_INFO "acpi-cpufreq: Intel(R) "
+ "Xeon(R) 7100 Errata AL30, processors may "
+ "lock up on frequency changes: disabling "
+ "acpi-cpufreq.\n");
return -ENODEV;
+ }
}
return 0;
}
@@ -549,13 +557,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
unsigned int result = 0;
struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
struct acpi_processor_performance *perf;
+#ifdef CONFIG_SMP
+ static int blacklisted;
+#endif
dprintk("acpi_cpufreq_cpu_init\n");
#ifdef CONFIG_SMP
- result = acpi_cpufreq_blacklist(c);
- if (result)
- return result;
+ if (blacklisted)
+ return blacklisted;
+ blacklisted = acpi_cpufreq_blacklist(c);
+ if (blacklisted)
+ return blacklisted;
#endif
data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
@@ -563,7 +576,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
return -ENOMEM;
data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
- per_cpu(drv_data, cpu) = data;
+ per_cpu(acfreq_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +727,20 @@ err_unreg:
acpi_processor_unregister_performance(perf, cpu);
err_free:
kfree(data);
- per_cpu(drv_data, cpu) = NULL;
+ per_cpu(acfreq_data, cpu) = NULL;
return result;
}
static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+ struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
dprintk("acpi_cpufreq_cpu_exit\n");
if (data) {
cpufreq_frequency_table_put_attr(policy->cpu);
- per_cpu(drv_data, policy->cpu) = NULL;
+ per_cpu(acfreq_data, policy->cpu) = NULL;
acpi_processor_unregister_performance(data->acpi_data,
policy->cpu);
kfree(data);
@@ -738,7 +751,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
+ struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
dprintk("acpi_cpufreq_resume\n");
@@ -753,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
};
static struct cpufreq_driver acpi_cpufreq_driver = {
- .verify = acpi_cpufreq_verify,
- .target = acpi_cpufreq_target,
- .init = acpi_cpufreq_cpu_init,
- .exit = acpi_cpufreq_cpu_exit,
- .resume = acpi_cpufreq_resume,
- .name = "acpi-cpufreq",
- .owner = THIS_MODULE,
- .attr = acpi_cpufreq_attr,
+ .verify = acpi_cpufreq_verify,
+ .target = acpi_cpufreq_target,
+ .bios_limit = acpi_processor_get_bios_limit,
+ .init = acpi_cpufreq_cpu_init,
+ .exit = acpi_cpufreq_cpu_exit,
+ .resume = acpi_cpufreq_resume,
+ .name = "acpi-cpufreq",
+ .owner = THIS_MODULE,
+ .attr = acpi_cpufreq_attr,
};
static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
break;
case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V1;
+ longhaul_version = TYPE_LONGHAUL_V2;
if (c->x86_mask < 8) {
cpu_model = CPU_SAMUEL2;
cpuname = "C3 'Samuel 2' [C5B]";
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
/* Find ACPI data for processor */
acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, &longhaul_walk_callback,
+ ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
NULL, (void *)&pr);
/* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ff36d2979a90
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,620 @@
+/*
+ * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
+ *
+ * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
+ * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
+ * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
+ * INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/cpufreq.h>
+#include <linux/compiler.h>
+
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+
+#include <acpi/processor.h>
+
+#define PCC_VERSION "1.00.00"
+#define POLL_LOOPS 300
+
+#define CMD_COMPLETE 0x1
+#define CMD_GET_FREQ 0x0
+#define CMD_SET_FREQ 0x1
+
+#define BUF_SZ 4
+
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+ "pcc-cpufreq", msg)
+
+struct pcc_register_resource {
+ u8 descriptor;
+ u16 length;
+ u8 space_id;
+ u8 bit_width;
+ u8 bit_offset;
+ u8 access_size;
+ u64 address;
+} __attribute__ ((packed));
+
+struct pcc_memory_resource {
+ u8 descriptor;
+ u16 length;
+ u8 space_id;
+ u8 resource_usage;
+ u8 type_specific;
+ u64 granularity;
+ u64 minimum;
+ u64 maximum;
+ u64 translation_offset;
+ u64 address_length;
+} __attribute__ ((packed));
+
+static struct cpufreq_driver pcc_cpufreq_driver;
+
+struct pcc_header {
+ u32 signature;
+ u16 length;
+ u8 major;
+ u8 minor;
+ u32 features;
+ u16 command;
+ u16 status;
+ u32 latency;
+ u32 minimum_time;
+ u32 maximum_time;
+ u32 nominal;
+ u32 throttled_frequency;
+ u32 minimum_frequency;
+};
+
+static void __iomem *pcch_virt_addr;
+static struct pcc_header __iomem *pcch_hdr;
+
+static DEFINE_SPINLOCK(pcc_lock);
+
+static struct acpi_generic_address doorbell;
+
+static u64 doorbell_preserve;
+static u64 doorbell_write;
+
+static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
+ 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
+
+struct pcc_cpu {
+ u32 input_offset;
+ u32 output_offset;
+};
+
+static struct pcc_cpu *pcc_cpu_info;
+
+static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
+{
+ cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
+ return 0;
+}
+
+static inline void pcc_cmd(void)
+{
+ u64 doorbell_value;
+ int i;
+
+ acpi_read(&doorbell_value, &doorbell);
+ acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
+ &doorbell);
+
+ for (i = 0; i < POLL_LOOPS; i++) {
+ if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
+ break;
+ }
+}
+
+static inline void pcc_clear_mapping(void)
+{
+ if (pcch_virt_addr)
+ iounmap(pcch_virt_addr);
+ pcch_virt_addr = NULL;
+}
+
+static unsigned int pcc_get_freq(unsigned int cpu)
+{
+ struct pcc_cpu *pcc_cpu_data;
+ unsigned int curr_freq;
+ unsigned int freq_limit;
+ u16 status;
+ u32 input_buffer;
+ u32 output_buffer;
+
+ spin_lock(&pcc_lock);
+
+ dprintk("get: get_freq for CPU %d\n", cpu);
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ input_buffer = 0x1;
+ iowrite32(input_buffer,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+ iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
+
+ pcc_cmd();
+
+ output_buffer =
+ ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
+
+ /* Clear the input buffer - we are done with the current command */
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+
+ status = ioread16(&pcch_hdr->status);
+ if (status != CMD_COMPLETE) {
+ dprintk("get: FAILED: for CPU %d, status is %d\n",
+ cpu, status);
+ goto cmd_incomplete;
+ }
+ iowrite16(0, &pcch_hdr->status);
+ curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
+ / 100) * 1000);
+
+ dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
+ "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
+ cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
+ output_buffer, curr_freq);
+
+ freq_limit = (output_buffer >> 8) & 0xff;
+ if (freq_limit != 0xff) {
+ dprintk("get: frequency for cpu %d is being temporarily"
+ " capped at %d\n", cpu, curr_freq);
+ }
+
+ spin_unlock(&pcc_lock);
+ return curr_freq;
+
+cmd_incomplete:
+ iowrite16(0, &pcch_hdr->status);
+ spin_unlock(&pcc_lock);
+ return -EINVAL;
+}
+
+static int pcc_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct pcc_cpu *pcc_cpu_data;
+ struct cpufreq_freqs freqs;
+ u16 status;
+ u32 input_buffer;
+ int cpu;
+
+ spin_lock(&pcc_lock);
+ cpu = policy->cpu;
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ dprintk("target: CPU %d should go to target freq: %d "
+ "(virtual) input_offset is 0x%x\n",
+ cpu, target_freq,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+
+ freqs.new = target_freq;
+ freqs.cpu = cpu;
+ cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+ input_buffer = 0x1 | (((target_freq * 100)
+ / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
+ iowrite32(input_buffer,
+ (pcch_virt_addr + pcc_cpu_data->input_offset));
+ iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
+
+ pcc_cmd();
+
+ /* Clear the input buffer - we are done with the current command */
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+
+ status = ioread16(&pcch_hdr->status);
+ if (status != CMD_COMPLETE) {
+ dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
+ cpu, status);
+ goto cmd_incomplete;
+ }
+ iowrite16(0, &pcch_hdr->status);
+
+ cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+ dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
+ spin_unlock(&pcc_lock);
+
+ return 0;
+
+cmd_incomplete:
+ iowrite16(0, &pcch_hdr->status);
+ spin_unlock(&pcc_lock);
+ return -EINVAL;
+}
+
+static int pcc_get_offset(int cpu)
+{
+ acpi_status status;
+ struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+ union acpi_object *pccp, *offset;
+ struct pcc_cpu *pcc_cpu_data;
+ struct acpi_processor *pr;
+ int ret = 0;
+
+ pr = per_cpu(processors, cpu);
+ pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
+
+ status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ pccp = buffer.pointer;
+ if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
+ ret = -ENODEV;
+ goto out_free;
+ };
+
+ offset = &(pccp->package.elements[0]);
+ if (!offset || offset->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcc_cpu_data->input_offset = offset->integer.value;
+
+ offset = &(pccp->package.elements[1]);
+ if (!offset || offset->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcc_cpu_data->output_offset = offset->integer.value;
+
+ memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
+ memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
+
+ dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
+ "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
+ cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
+out_free:
+ kfree(buffer.pointer);
+ return ret;
+}
+
+static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
+{
+ acpi_status status;
+ struct acpi_object_list input;
+ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
+ union acpi_object in_params[4];
+ union acpi_object *out_obj;
+ u32 capabilities[2];
+ u32 errors;
+ u32 supported;
+ int ret = 0;
+
+ input.count = 4;
+ input.pointer = in_params;
+ input.count = 4;
+ input.pointer = in_params;
+ in_params[0].type = ACPI_TYPE_BUFFER;
+ in_params[0].buffer.length = 16;
+ in_params[0].buffer.pointer = OSC_UUID;
+ in_params[1].type = ACPI_TYPE_INTEGER;
+ in_params[1].integer.value = 1;
+ in_params[2].type = ACPI_TYPE_INTEGER;
+ in_params[2].integer.value = 2;
+ in_params[3].type = ACPI_TYPE_BUFFER;
+ in_params[3].buffer.length = 8;
+ in_params[3].buffer.pointer = (u8 *)&capabilities;
+
+ capabilities[0] = OSC_QUERY_ENABLE;
+ capabilities[1] = 0x1;
+
+ status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ if (!output.length)
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ supported = *((u32 *)(out_obj->buffer.pointer + 4));
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ kfree(output.pointer);
+ capabilities[0] = 0x0;
+ capabilities[1] = 0x1;
+
+ status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ if (!output.length)
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ supported = *((u32 *)(out_obj->buffer.pointer + 4));
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+out_free:
+ kfree(output.pointer);
+ return ret;
+}
+
+static int __init pcc_cpufreq_probe(void)
+{
+ acpi_status status;
+ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
+ struct pcc_memory_resource *mem_resource;
+ struct pcc_register_resource *reg_resource;
+ union acpi_object *out_obj, *member;
+ acpi_handle handle, osc_handle;
+ int ret = 0;
+
+ status = acpi_get_handle(NULL, "\\_SB", &handle);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ status = acpi_get_handle(handle, "_OSC", &osc_handle);
+ if (ACPI_SUCCESS(status)) {
+ ret = pcc_cpufreq_do_osc(&osc_handle);
+ if (ret)
+ dprintk("probe: _OSC evaluation did not succeed\n");
+ /* Firmware's use of _OSC is optional */
+ ret = 0;
+ }
+
+ status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ out_obj = output.pointer;
+ if (out_obj->type != ACPI_TYPE_PACKAGE) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ member = &out_obj->package.elements[0];
+ if (member->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
+
+ dprintk("probe: mem_resource descriptor: 0x%x,"
+ " length: %d, space_id: %d, resource_usage: %d,"
+ " type_specific: %d, granularity: 0x%llx,"
+ " minimum: 0x%llx, maximum: 0x%llx,"
+ " translation_offset: 0x%llx, address_length: 0x%llx\n",
+ mem_resource->descriptor, mem_resource->length,
+ mem_resource->space_id, mem_resource->resource_usage,
+ mem_resource->type_specific, mem_resource->granularity,
+ mem_resource->minimum, mem_resource->maximum,
+ mem_resource->translation_offset,
+ mem_resource->address_length);
+
+ if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
+ ret = -ENODEV;
+ goto out_free;
+ }
+
+ pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
+ mem_resource->address_length);
+ if (pcch_virt_addr == NULL) {
+ dprintk("probe: could not map shared mem region\n");
+ goto out_free;
+ }
+ pcch_hdr = pcch_virt_addr;
+
+ dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
+ dprintk("probe: PCCH header is at physical address: 0x%llx,"
+ " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
+ " supported features: 0x%x, command field: 0x%x,"
+ " status field: 0x%x, nominal latency: %d us\n",
+ mem_resource->minimum, ioread32(&pcch_hdr->signature),
+ ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
+ ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
+ ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
+ ioread32(&pcch_hdr->latency));
+
+ dprintk("probe: min time between commands: %d us,"
+ " max time between commands: %d us,"
+ " nominal CPU frequency: %d MHz,"
+ " minimum CPU frequency: %d MHz,"
+ " minimum CPU frequency without throttling: %d MHz\n",
+ ioread32(&pcch_hdr->minimum_time),
+ ioread32(&pcch_hdr->maximum_time),
+ ioread32(&pcch_hdr->nominal),
+ ioread32(&pcch_hdr->throttled_frequency),
+ ioread32(&pcch_hdr->minimum_frequency));
+
+ member = &out_obj->package.elements[1];
+ if (member->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
+
+ doorbell.space_id = reg_resource->space_id;
+ doorbell.bit_width = reg_resource->bit_width;
+ doorbell.bit_offset = reg_resource->bit_offset;
+ doorbell.access_width = 64;
+ doorbell.address = reg_resource->address;
+
+ dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
+ "bit_offset is %d, access_width is %d, address is 0x%llx\n",
+ doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
+ doorbell.access_width, reg_resource->address);
+
+ member = &out_obj->package.elements[2];
+ if (member->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ doorbell_preserve = member->integer.value;
+
+ member = &out_obj->package.elements[3];
+ if (member->type != ACPI_TYPE_INTEGER) {
+ ret = -ENODEV;
+ goto pcch_free;
+ }
+
+ doorbell_write = member->integer.value;
+
+ dprintk("probe: doorbell_preserve: 0x%llx,"
+ " doorbell_write: 0x%llx\n",
+ doorbell_preserve, doorbell_write);
+
+ pcc_cpu_info = alloc_percpu(struct pcc_cpu);
+ if (!pcc_cpu_info) {
+ ret = -ENOMEM;
+ goto pcch_free;
+ }
+
+ printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
+ " limits: %d MHz, %d MHz\n", PCC_VERSION,
+ ioread32(&pcch_hdr->minimum_frequency),
+ ioread32(&pcch_hdr->nominal));
+ kfree(output.pointer);
+ return ret;
+pcch_free:
+ pcc_clear_mapping();
+out_free:
+ kfree(output.pointer);
+ return ret;
+}
+
+static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+ unsigned int cpu = policy->cpu;
+ unsigned int result = 0;
+
+ if (!pcch_virt_addr) {
+ result = -1;
+ goto pcch_null;
+ }
+
+ result = pcc_get_offset(cpu);
+ if (result) {
+ dprintk("init: PCCP evaluation failed\n");
+ goto free;
+ }
+
+ policy->max = policy->cpuinfo.max_freq =
+ ioread32(&pcch_hdr->nominal) * 1000;
+ policy->min = policy->cpuinfo.min_freq =
+ ioread32(&pcch_hdr->minimum_frequency) * 1000;
+ policy->cur = pcc_get_freq(cpu);
+
+ dprintk("init: policy->max is %d, policy->min is %d\n",
+ policy->max, policy->min);
+
+ return 0;
+free:
+ pcc_clear_mapping();
+ free_percpu(pcc_cpu_info);
+pcch_null:
+ return result;
+}
+
+static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+ return 0;
+}
+
+static struct cpufreq_driver pcc_cpufreq_driver = {
+ .flags = CPUFREQ_CONST_LOOPS,
+ .get = pcc_get_freq,
+ .verify = pcc_cpufreq_verify,
+ .target = pcc_cpufreq_target,
+ .init = pcc_cpufreq_cpu_init,
+ .exit = pcc_cpufreq_cpu_exit,
+ .name = "pcc-cpufreq",
+ .owner = THIS_MODULE,
+};
+
+static int __init pcc_cpufreq_init(void)
+{
+ int ret;
+
+ if (acpi_disabled)
+ return 0;
+
+ ret = pcc_cpufreq_probe();
+ if (ret) {
+ dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
+ return ret;
+ }
+
+ ret = cpufreq_register_driver(&pcc_cpufreq_driver);
+
+ return ret;
+}
+
+static void __exit pcc_cpufreq_exit(void)
+{
+ cpufreq_unregister_driver(&pcc_cpufreq_driver);
+
+ pcc_clear_mapping();
+
+ free_percpu(pcc_cpu_info);
+}
+
+MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
+MODULE_VERSION(PCC_VERSION);
+MODULE_DESCRIPTION("Processor Clocking Control interface driver");
+MODULE_LICENSE("GPL");
+
+late_initcall(pcc_cpufreq_init);
+module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
}
/* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ policy->cpuinfo.transition_latency = 200000;
policy->cur = busfreq * max_multiplier;
result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
};
static struct cpufreq_driver powernow_driver = {
- .verify = powernow_verify,
- .target = powernow_target,
- .get = powernow_get,
- .init = powernow_cpu_init,
- .exit = powernow_cpu_exit,
- .name = "powernow-k7",
- .owner = THIS_MODULE,
- .attr = powernow_table_attr,
+ .verify = powernow_verify,
+ .target = powernow_target,
+ .get = powernow_get,
+#ifdef CONFIG_X86_POWERNOW_K7_ACPI
+ .bios_limit = acpi_processor_get_bios_limit,
+#endif
+ .init = powernow_cpu_init,
+ .exit = powernow_cpu_exit,
+ .name = "powernow-k7",
+ .owner = THIS_MODULE,
+ .attr = powernow_table_attr,
};
static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..d360b56e9825 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
unsigned int index)
{
- acpi_integer control;
+ u64 control;
if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
struct cpufreq_frequency_table *powernow_table;
int ret_val = -ENODEV;
- acpi_integer control, status;
+ u64 control, status;
if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
u32 fid;
u32 vid;
u32 freq, index;
- acpi_integer status, control;
+ u64 status, control;
if (data->exttype) {
status = data->acpi_data.states[i].status;
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
* set it to 1 to avoid problems in the future.
* For all others it's a BIOS bug.
*/
- if (!boot_cpu_data.x86 == 0x11)
+ if (boot_cpu_data.x86 != 0x11)
printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
"latency\n");
max_latency = 1;
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
static int powernowk8_target(struct cpufreq_policy *pol,
unsigned targfreq, unsigned relation)
{
- cpumask_t oldmask;
+ cpumask_var_t oldmask;
struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
u32 checkfid;
u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
checkfid = data->currfid;
checkvid = data->currvid;
- /* only run on specific CPU from here on */
- oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
+ /* only run on specific CPU from here on. */
+ /* This is poor form: use a workqueue or smp_call_function_single */
+ if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_copy(oldmask, tsk_cpus_allowed(current));
+ set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
if (smp_processor_id() != pol->cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
ret = 0;
err_out:
- set_cpus_allowed_ptr(current, &oldmask);
+ set_cpus_allowed_ptr(current, oldmask);
+ free_cpumask_var(oldmask);
return ret;
}
@@ -1351,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
kfree(data->powernow_table);
kfree(data);
+ per_cpu(powernow_data, pol->cpu) = NULL;
return 0;
}
@@ -1370,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
int err;
if (!data)
- return -EINVAL;
+ return 0;
smp_call_function_single(cpu, query_values_on_cpu, &err, true);
if (err)
@@ -1393,14 +1399,15 @@ static struct freq_attr *powernow_k8_attr[] = {
};
static struct cpufreq_driver cpufreq_amd64_driver = {
- .verify = powernowk8_verify,
- .target = powernowk8_target,
- .init = powernowk8_cpu_init,
- .exit = __devexit_p(powernowk8_cpu_exit),
- .get = powernowk8_get,
- .name = "powernow-k8",
- .owner = THIS_MODULE,
- .attr = powernow_k8_attr,
+ .verify = powernowk8_verify,
+ .target = powernowk8_target,
+ .bios_limit = acpi_processor_get_bios_limit,
+ .init = powernowk8_cpu_init,
+ .exit = __devexit_p(powernowk8_cpu_exit),
+ .get = powernowk8_get,
+ .name = "powernow-k8",
+ .owner = THIS_MODULE,
+ .attr = powernow_k8_attr,
};
/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
/* speedstep_processor
*/
-static unsigned int speedstep_processor;
+static enum speedstep_processor speedstep_processor;
static u32 pmbase;
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
return 0;
}
-struct get_freq_data {
- unsigned int speed;
- unsigned int processor;
-};
-
-static void get_freq_data(void *_data)
+static void get_freq_data(void *_speed)
{
- struct get_freq_data *data = _data;
+ unsigned int *speed = _speed;
- data->speed = speedstep_get_frequency(data->processor);
+ *speed = speedstep_get_frequency(speedstep_processor);
}
static unsigned int speedstep_get(unsigned int cpu)
{
- struct get_freq_data data = { .processor = cpu };
+ unsigned int speed;
/* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
+ if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
BUG();
- dprintk("detected %u kHz as current frequency\n", data.speed);
- return data.speed;
+ dprintk("detected %u kHz as current frequency\n", speed);
+ return speed;
}
/**
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
* GET PROCESSOR CORE SPEED IN KHZ *
*********************************************************************/
-static unsigned int pentium3_get_frequency(unsigned int processor)
+static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
{
/* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(unsigned int processor)
+unsigned int speedstep_get_frequency(enum speedstep_processor processor)
{
switch (processor) {
case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
* DETECT SPEEDSTEP SPEEDS *
*********************************************************************/
-unsigned int speedstep_get_freqs(unsigned int processor,
+unsigned int speedstep_get_freqs(enum speedstep_processor processor,
unsigned int *low_speed,
unsigned int *high_speed,
unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
/* processors */
-
-#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */
-#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */
-#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */
-#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */
-
+enum speedstep_processor {
+ SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
+ SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
+ SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
+ SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
/* the following processors are not speedstep-capable and are not auto-detected
* in speedstep_detect_processor(). However, their speed can be detected using
* the speedstep_get_frequency() call. */
-#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */
-#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */
-#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */
+ SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
+ SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
+ SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
+};
/* speedstep states -- only two of them */
@@ -31,10 +31,10 @@
/* detect a speedstep-capable processor */
-extern unsigned int speedstep_detect_processor (void);
+extern enum speedstep_processor speedstep_detect_processor(void);
/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(unsigned int processor);
+extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
* SPEEDSTEP_LOW; the second argument is zero so that no
* cpufreq_notify_transition calls are initiated.
*/
-extern unsigned int speedstep_get_freqs(unsigned int processor,
+extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
unsigned int *low_speed,
unsigned int *high_speed,
unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
static unsigned int smi_sig;
/* info about the processor */
-static unsigned int speedstep_processor;
+static enum speedstep_processor speedstep_processor;
/*
* There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
/* Handle the GX (Formally known as the GX2) */
if (c->x86 == 5 && c->x86_model == 5)
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
else
init_cyrix(c);
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..7e1cca13af35 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,8 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
- sched_clock_stable = 1;
+ if (!check_tsc_unstable())
+ sched_clock_stable = 1;
}
/*
@@ -263,11 +263,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
+ if (node == NUMA_NO_NODE)
node = first_node(node_online_map);
+ else if (!node_online(node)) {
+ /* reuse the value from init_cpu_to_node() */
+ node = cpu_to_node(cpu);
+ }
numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 804c40e2bc3e..b3eeb66c0a51 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <linux/smp.h>
#include <asm/k8.h>
+#include <asm/smp.h>
#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
short size;
};
+#define MB(x) ((x) * 1024)
+
/* All the cache descriptor types we care about (no TLB or
trace cache entries) */
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
{ 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
{ 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
{ 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
{ 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
{ 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
{ 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */
- { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
+ { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
+ { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
+ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
+ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
+ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
{ 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,31 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
{ 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
{ 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
+ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
+ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
+ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
+ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
+ { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
+ { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
+ { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
+ { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
+ { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
{ 0x00, 0, 0}
};
@@ -147,7 +153,8 @@ struct _cpuid4_info {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
};
@@ -157,7 +164,8 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- unsigned long can_disable;
+ bool can_disable;
+ unsigned int l3_indices;
};
unsigned short num_cache_leaves;
@@ -287,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
+struct _cache_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct _cpuid4_info *, char *);
+ ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
+};
+
+#ifdef CONFIG_CPU_SUP_AMD
+static unsigned int __cpuinit amd_calc_l3_indices(void)
+{
+ /*
+ * We're called over smp_call_function_single() and therefore
+ * are on the correct cpu.
+ */
+ int cpu = smp_processor_id();
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(dev, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ sc0 = !(val & BIT(0));
+ sc1 = !(val & BIT(4));
+ sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
static void __cpuinit
amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
@@ -296,13 +334,104 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
if (boot_cpu_data.x86 == 0x11)
return;
- /* see erratum #382 */
- if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ /* see errata #382 and #388 */
+ if ((boot_cpu_data.x86 == 0x10) &&
+ ((boot_cpu_data.x86_model < 0x8) ||
+ (boot_cpu_data.x86_mask < 0x1)))
return;
- this_leaf->can_disable = 1;
+ this_leaf->can_disable = true;
+ this_leaf->l3_indices = amd_calc_l3_indices();
+}
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!dev)
+ return -EINVAL;
+
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "0x%08x\n", reg);
+}
+
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
+}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
+{
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = amd_get_nb_id(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+
+#define SUBCACHE_MASK (3UL << 20)
+#define SUBCACHE_INDEX 0xfff
+
+ if (!this_leaf->can_disable)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ if (strict_strtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+ return -EINVAL;
+
+ val |= BIT(30);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ /*
+ * We need to WBINVD on a core on the node containing the L3 cache which
+ * indices we disable therefore a simple wbinvd() is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+ pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+ return count;
}
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
+}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
+
+#else /* CONFIG_CPU_SUP_AMD */
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+{
+};
+#endif /* CONFIG_CPU_SUP_AMD */
+
static int
__cpuinit cpuid4_cache_lookup_regs(int index,
struct _cpuid4_info_regs *this_leaf)
@@ -488,22 +617,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
- if (trace)
- printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
- else if (l1i)
- printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
-
- if (l1d)
- printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
- else
- printk(KERN_CONT "\n");
-
- if (l2)
- printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
-
- if (l3)
- printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
-
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
@@ -512,26 +625,27 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
#ifdef CONFIG_SYSFS
/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
+static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
+#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
#ifdef CONFIG_SMP
static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
{
struct _cpuid4_info *this_leaf, *sibling_leaf;
unsigned long num_threads_sharing;
- int index_msb, i;
+ int index_msb, i, sibling;
struct cpuinfo_x86 *c = &cpu_data(cpu);
if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
- struct cpuinfo_x86 *d;
- for_each_online_cpu(i) {
- if (!per_cpu(cpuid4_info, i))
+ for_each_cpu(i, c->llc_shared_map) {
+ if (!per_cpu(ici_cpuid4_info, i))
continue;
- d = &cpu_data(i);
this_leaf = CPUID4_INFO_IDX(i, index);
- cpumask_copy(to_cpumask(this_leaf->shared_cpu_map),
- d->llc_shared_map);
+ for_each_cpu(sibling, c->llc_shared_map) {
+ if (!cpu_online(sibling))
+ continue;
+ set_bit(sibling, this_leaf->shared_cpu_map);
+ }
}
return;
}
@@ -548,7 +662,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
c->apicid >> index_msb) {
cpumask_set_cpu(i,
to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(cpuid4_info, i)) {
+ if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
sibling_leaf =
CPUID4_INFO_IDX(i, index);
cpumask_set_cpu(cpu, to_cpumask(
@@ -587,8 +701,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
for (i = 0; i < num_cache_leaves; i++)
cache_remove_shared_cpu_map(cpu, i);
- kfree(per_cpu(cpuid4_info, cpu));
- per_cpu(cpuid4_info, cpu) = NULL;
+ kfree(per_cpu(ici_cpuid4_info, cpu));
+ per_cpu(ici_cpuid4_info, cpu) = NULL;
}
static int
@@ -627,15 +741,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
if (num_cache_leaves == 0)
return -ENOENT;
- per_cpu(cpuid4_info, cpu) = kzalloc(
+ per_cpu(ici_cpuid4_info, cpu) = kzalloc(
sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(cpuid4_info, cpu) == NULL)
+ if (per_cpu(ici_cpuid4_info, cpu) == NULL)
return -ENOMEM;
smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
if (retval) {
- kfree(per_cpu(cpuid4_info, cpu));
- per_cpu(cpuid4_info, cpu) = NULL;
+ kfree(per_cpu(ici_cpuid4_info, cpu));
+ per_cpu(ici_cpuid4_info, cpu) = NULL;
}
return retval;
@@ -647,7 +761,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, cache_kobject);
+static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
struct _index_kobject {
struct kobject kobj;
@@ -656,8 +770,8 @@ struct _index_kobject {
};
/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
+static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
+#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
#define show_one_plus(file_name, object, val) \
static ssize_t show_##file_name \
@@ -723,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned int reg = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!dev)
- return -EINVAL;
-
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
- return sprintf(buf, "%x\n", reg);
-}
-
-#define SHOW_CACHE_DISABLE(index) \
-static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return show_cache_disable(this_leaf, buf, index); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
-{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned long val = 0;
- unsigned int scrubber = 0;
-
- if (!this_leaf->can_disable)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- val |= 0xc0000000;
-
- pci_read_config_dword(dev, 0x58, &scrubber);
- scrubber &= ~0x1f000000;
- pci_write_config_dword(dev, 0x58, scrubber);
-
- pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
- wbinvd();
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- return count;
-}
-
-#define STORE_CACHE_DISABLE(index) \
-static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
-{ \
- return store_cache_disable(this_leaf, buf, count, index); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
-};
-
#define define_one_ro(_name) \
static struct _cache_attr _name = \
__ATTR(_name, 0444, show_##_name, NULL)
@@ -813,23 +851,28 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
+#define DEFAULT_SYSFS_CACHE_ATTRS \
+ &type.attr, \
+ &level.attr, \
+ &coherency_line_size.attr, \
+ &physical_line_partition.attr, \
+ &ways_of_associativity.attr, \
+ &number_of_sets.attr, \
+ &size.attr, \
+ &shared_cpu_map.attr, \
+ &shared_cpu_list.attr
static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
+ DEFAULT_SYSFS_CACHE_ATTRS,
+ NULL
+};
+
+static struct attribute *default_l3_attrs[] = {
+ DEFAULT_SYSFS_CACHE_ATTRS,
+#ifdef CONFIG_CPU_SUP_AMD
&cache_disable_0.attr,
&cache_disable_1.attr,
+#endif
NULL
};
@@ -860,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
return ret;
}
-static struct sysfs_ops sysfs_ops = {
+static const struct sysfs_ops sysfs_ops = {
.show = show,
.store = store,
};
@@ -876,10 +919,10 @@ static struct kobj_type ktype_percpu_entry = {
static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
{
- kfree(per_cpu(cache_kobject, cpu));
- kfree(per_cpu(index_kobject, cpu));
- per_cpu(cache_kobject, cpu) = NULL;
- per_cpu(index_kobject, cpu) = NULL;
+ kfree(per_cpu(ici_cache_kobject, cpu));
+ kfree(per_cpu(ici_index_kobject, cpu));
+ per_cpu(ici_cache_kobject, cpu) = NULL;
+ per_cpu(ici_index_kobject, cpu) = NULL;
free_cache_attributes(cpu);
}
@@ -895,14 +938,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
return err;
/* Allocate all required memory */
- per_cpu(cache_kobject, cpu) =
+ per_cpu(ici_cache_kobject, cpu) =
kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(cache_kobject, cpu) == NULL))
+ if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
goto err_out;
- per_cpu(index_kobject, cpu) = kzalloc(
+ per_cpu(ici_index_kobject, cpu) = kzalloc(
sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(index_kobject, cpu) == NULL))
+ if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
goto err_out;
return 0;
@@ -920,13 +963,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
unsigned int cpu = sys_dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
+ struct _cpuid4_info *this_leaf;
int retval;
retval = cpuid4_cache_sysfs_init(cpu);
if (unlikely(retval < 0))
return retval;
- retval = kobject_init_and_add(per_cpu(cache_kobject, cpu),
+ retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
&ktype_percpu_entry,
&sys_dev->kobj, "%s", "cache");
if (retval < 0) {
@@ -938,14 +982,22 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_object = INDEX_KOBJECT_PTR(cpu, i);
this_object->cpu = cpu;
this_object->index = i;
+
+ this_leaf = CPUID4_INFO_IDX(cpu, i);
+
+ if (this_leaf->can_disable)
+ ktype_cache.default_attrs = default_l3_attrs;
+ else
+ ktype_cache.default_attrs = default_attrs;
+
retval = kobject_init_and_add(&(this_object->kobj),
&ktype_cache,
- per_cpu(cache_kobject, cpu),
+ per_cpu(ici_cache_kobject, cpu),
"index%1lu", i);
if (unlikely(retval)) {
for (j = 0; j < i; j++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(cache_kobject, cpu));
+ kobject_put(per_cpu(ici_cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
return retval;
}
@@ -953,7 +1005,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
}
cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
- kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
+ kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
return 0;
}
@@ -962,7 +1014,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
unsigned int cpu = sys_dev->id;
unsigned long i;
- if (per_cpu(cpuid4_info, cpu) == NULL)
+ if (per_cpu(ici_cpuid4_info, cpu) == NULL)
return;
if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
return;
@@ -970,7 +1022,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
for (i = 0; i < num_cache_leaves; i++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(cache_kobject, cpu));
+ kobject_put(per_cpu(ici_cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..73734baa50f2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
m->finished = 0;
}
-static cpumask_t mce_inject_cpumask;
+static cpumask_var_t mce_inject_cpumask;
static int mce_raise_notify(struct notifier_block *self,
unsigned long val, void *data)
@@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self,
struct die_args *args = (struct die_args *)data;
int cpu = smp_processor_id();
struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
+ if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
return NOTIFY_DONE;
- cpu_clear(cpu, mce_inject_cpumask);
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
raise_exception(m, args->regs);
else if (m->status)
@@ -148,22 +148,22 @@ static void raise_mce(struct mce *m)
unsigned long start;
int cpu;
get_online_cpus();
- mce_inject_cpumask = cpu_online_map;
- cpu_clear(get_cpu(), mce_inject_cpumask);
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
for_each_online_cpu(cpu) {
struct mce *mcpu = &per_cpu(injectm, cpu);
if (!mcpu->finished ||
MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
- cpu_clear(cpu, mce_inject_cpumask);
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
- if (!cpus_empty(mce_inject_cpumask))
- apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
+ if (!cpumask_empty(mce_inject_cpumask))
+ apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
start = jiffies;
- while (!cpus_empty(mce_inject_cpumask)) {
+ while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
printk(KERN_ERR
"Timeout waiting for mce inject NMI %lx\n",
- *cpus_addr(mce_inject_cpumask));
+ *cpumask_bits(mce_inject_cpumask));
break;
}
cpu_relax();
@@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
static int inject_init(void)
{
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
printk(KERN_INFO "Machine check injector initialized\n");
mce_chrdev_ops.write = mce_write;
register_die_notifier(&mce_raise_nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4b2af86e3e8d..3ab9c886b613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,16 @@
#include "mce-internal.h"
+static DEFINE_MUTEX(mce_read_mutex);
+
+#define rcu_dereference_check_mce(p) \
+ rcu_dereference_check((p), \
+ rcu_read_lock_sched_held() || \
+ lockdep_is_held(&mce_read_mutex))
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
int mce_disabled __read_mostly;
#define MISC_MCELOG_MINOR 227
@@ -85,6 +95,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+
+static int default_decode_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ pr_emerg("No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block mce_dec_nb = {
+ .notifier_call = default_decode_mce,
+ .priority = -1,
+};
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -129,10 +159,13 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;
+ /* Emit the trace record: */
+ trace_mce_record(mce);
+
mce->finished = 0;
wmb();
for (;;) {
- entry = rcu_dereference(mcelog.next);
+ entry = rcu_dereference_check_mce(mcelog.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
@@ -165,49 +198,46 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
-void __weak decode_mce(struct mce *m)
-{
- return;
-}
-
static void print_mce(struct mce *m)
{
- printk(KERN_EMERG
- "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
+
if (m->ip) {
- printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ pr_emerg("RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
- printk(KERN_EMERG "TSC %llx ", m->tsc);
+
+ pr_emerg("TSC %llx ", m->tsc);
if (m->addr)
- printk(KERN_CONT "ADDR %llx ", m->addr);
+ pr_cont("ADDR %llx ", m->addr);
if (m->misc)
- printk(KERN_CONT "MISC %llx ", m->misc);
- printk(KERN_CONT "\n");
- printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid,
- m->apicid);
+ pr_cont("MISC %llx ", m->misc);
+
+ pr_cont("\n");
+ pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
- decode_mce(m);
+ /*
+ * Print out human-readable details about the MCE error,
+ * (if the CPU has an implementation for that)
+ */
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
static void print_mce_head(void)
{
- printk(KERN_EMERG "\nHARDWARE ERROR\n");
+ pr_emerg("\nHARDWARE ERROR\n");
}
static void print_mce_tail(void)
{
- printk(KERN_EMERG "This is not a software problem!\n"
-#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
- "Run through mcelog --ascii to decode and contact your hardware vendor\n"
-#endif
- );
+ pr_emerg("This is not a software problem!\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -221,6 +251,7 @@ static atomic_t mce_fake_paniced;
static void wait_for_panic(void)
{
long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
preempt_disable();
local_irq_enable();
while (timeout-- > 0)
@@ -288,6 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
static int msr_to_offset(u32 msr)
{
unsigned bank = __get_cpu_var(injectm.bank);
+
if (msr == rip_msr)
return offsetof(struct mce, ip);
if (msr == MSR_IA32_MCx_STATUS(bank))
@@ -1111,7 +1143,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_timer(unsigned long data)
+static void mce_start_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
int *n;
@@ -1176,7 +1208,7 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
-static int mce_banks_init(void)
+static int __cpuinit __mcheck_cpu_mce_banks_init(void)
{
int i;
@@ -1195,7 +1227,7 @@ static int mce_banks_init(void)
/*
* Initialize Machine Checks for a CPU.
*/
-static int __cpuinit mce_cap_init(void)
+static int __cpuinit __mcheck_cpu_cap_init(void)
{
unsigned b;
u64 cap;
@@ -1203,7 +1235,8 @@ static int __cpuinit mce_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ if (!banks)
+ printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
if (b > MAX_NR_BANKS) {
printk(KERN_WARNING
@@ -1216,7 +1249,7 @@ static int __cpuinit mce_cap_init(void)
WARN_ON(banks != 0 && b != banks);
banks = b;
if (!mce_banks) {
- int err = mce_banks_init();
+ int err = __mcheck_cpu_mce_banks_init();
if (err)
return err;
@@ -1232,7 +1265,7 @@ static int __cpuinit mce_cap_init(void)
return 0;
}
-static void mce_init(void)
+static void __mcheck_cpu_init_generic(void)
{
mce_banks_t all_banks;
u64 cap;
@@ -1261,7 +1294,7 @@ static void mce_init(void)
}
/* Add per CPU specific workarounds here */
-static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1329,7 +1362,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
return 0;
}
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return;
@@ -1343,7 +1376,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
}
}
-static void mce_cpu_features(struct cpuinfo_x86 *c)
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -1357,18 +1390,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
}
}
-static void mce_init_timer(void)
+static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
int *n = &__get_cpu_var(mce_next_interval);
+ setup_timer(t, mce_start_timer, smp_processor_id());
+
if (mce_ignore_ce)
return;
*n = check_interval * HZ;
if (!*n)
return;
- setup_timer(t, mcheck_timer, smp_processor_id());
t->expires = round_jiffies(jiffies + *n);
add_timer_on(t, smp_processor_id());
}
@@ -1388,27 +1422,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
*/
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
{
if (mce_disabled)
return;
- mce_ancient_init(c);
+ __mcheck_cpu_ancient_init(c);
if (!mce_available(c))
return;
- if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+ if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
mce_disabled = 1;
return;
}
machine_check_vector = do_machine_check;
- mce_init();
- mce_cpu_features(c);
- mce_init_timer();
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+
}
/*
@@ -1457,8 +1492,6 @@ static void collect_tscs(void *data)
rdtscll(cpu_tsc[smp_processor_id()]);
}
-static DEFINE_MUTEX(mce_read_mutex);
-
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
loff_t *off)
{
@@ -1472,7 +1505,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
return -ENOMEM;
mutex_lock(&mce_read_mutex);
- next = rcu_dereference(mcelog.next);
+ next = rcu_dereference_check_mce(mcelog.next);
/* Only supports full reads right now */
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1537,7 +1570,7 @@ timeout:
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
- if (rcu_dereference(mcelog.next))
+ if (rcu_dereference_check_mce(mcelog.next))
return POLLIN | POLLRDNORM;
return 0;
}
@@ -1628,6 +1661,15 @@ static int __init mcheck_enable(char *str)
}
__setup("mce", mcheck_enable);
+int __init mcheck_init(void)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
+
+ mcheck_intel_therm_init();
+
+ return 0;
+}
+
/*
* Sysfs support
*/
@@ -1636,7 +1678,7 @@ __setup("mce", mcheck_enable);
* Disable machine checks on suspend and shutdown. We can't really handle
* them later.
*/
-static int mce_disable(void)
+static int mce_disable_error_reporting(void)
{
int i;
@@ -1651,12 +1693,12 @@ static int mce_disable(void)
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
- return mce_disable();
+ return mce_disable_error_reporting();
}
static int mce_shutdown(struct sys_device *dev)
{
- return mce_disable();
+ return mce_disable_error_reporting();
}
/*
@@ -1666,8 +1708,8 @@ static int mce_shutdown(struct sys_device *dev)
*/
static int mce_resume(struct sys_device *dev)
{
- mce_init();
- mce_cpu_features(&current_cpu_data);
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(&current_cpu_data);
return 0;
}
@@ -1677,8 +1719,8 @@ static void mce_cpu_restart(void *data)
del_timer_sync(&__get_cpu_var(mce_timer));
if (!mce_available(&current_cpu_data))
return;
- mce_init();
- mce_init_timer();
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_timer();
}
/* Reinit MCEs after user configuration changes */
@@ -1704,7 +1746,7 @@ static void mce_enable_ce(void *all)
cmci_reenable();
cmci_recheck();
if (all)
- mce_init_timer();
+ __mcheck_cpu_init_timer();
}
static struct sysdev_class mce_sysclass = {
@@ -1892,7 +1934,7 @@ error2:
sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+ sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
sysdev_unregister(&per_cpu(mce_dev, cpu));
@@ -1917,13 +1959,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void __cpuinit mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
if (!mce_available(&current_cpu_data))
return;
+
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
for (i = 0; i < banks; i++) {
@@ -1934,7 +1977,7 @@ static void mce_disable_cpu(void *h)
}
}
-static void mce_reenable_cpu(void *h)
+static void __cpuinit mce_reenable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -1979,9 +2022,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies +
+ if (!mce_ignore_ce && check_interval) {
+ t->expires = round_jiffies(jiffies +
__get_cpu_var(mce_next_interval));
- add_timer_on(t, cpu);
+ add_timer_on(t, cpu);
+ }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
case CPU_POST_DEAD:
@@ -2004,6 +2049,7 @@ static __init void mce_init_banks(void)
struct mce_bank *b = &mce_banks[i];
struct sysdev_attribute *a = &b->attr;
+ sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
@@ -2013,7 +2059,7 @@ static __init void mce_init_banks(void)
}
}
-static __init int mce_init_device(void)
+static __init int mcheck_init_device(void)
{
int err;
int i = 0;
@@ -2041,7 +2087,7 @@ static __init int mce_init_device(void)
return err;
}
-device_initcall(mce_init_device);
+device_initcall(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
@@ -2089,7 +2135,7 @@ static int fake_panic_set(void *data, u64 val)
DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
fake_panic_set, "%llu\n");
-static int __init mce_debugfs_init(void)
+static int __init mcheck_debugfs_init(void)
{
struct dentry *dmce, *ffake_panic;
@@ -2103,5 +2149,5 @@ static int __init mce_debugfs_init(void)
return 0;
}
-late_initcall(mce_debugfs_init);
+late_initcall(mcheck_debugfs_init);
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..cda932ca3ade 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
return ret;
}
-static struct sysfs_ops threshold_ops = {
+static const struct sysfs_ops threshold_ops = {
.show = show,
.store = store,
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 889f665fe93d..d15df6e49bf0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -94,7 +95,7 @@ static void cmci_discover(int banks, int boot)
/* Already owned by someone else? */
if (val & CMCI_EN) {
- if (test_and_clear_bit(i, owned) || boot)
+ if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
@@ -106,7 +107,7 @@ static void cmci_discover(int banks, int boot)
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & CMCI_EN) {
- if (!test_and_set_bit(i, owned) || boot)
+ if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
} else {
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
+static u32 lvtthmr_init __read_mostly;
+
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
ack_APIC_irq();
}
+/* Thermal monitoring depends on APIC, ACPI and clock modulation */
+static int intel_thermal_supported(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has_apic)
+ return 0;
+ if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+ return 0;
+ return 1;
+}
+
+void __init mcheck_intel_therm_init(void)
+{
+ /*
+ * This function is only called on boot CPU. Save the init thermal
+ * LVT value on BSP and use that value to restore APs' thermal LVT
+ * entry BIOS programmed later
+ */
+ if (intel_thermal_supported(&boot_cpu_data))
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
int tm2 = 0;
u32 l, h;
- /* Thermal monitoring depends on ACPI and clock modulation*/
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+ if (!intel_thermal_supported(c))
return;
/*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
* since it might be delivered via SMI already:
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- h = apic_read(APIC_LVTTHMR);
+
+ /*
+ * The initial value of thermal LVT entries on all APs always reads
+ * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+ * sequence to them and LVT registers are reset to 0s except for
+ * the mask bits which are set to 1s when APs receive INIT IPI.
+ * Always restore the value that BIOS has programmed on AP based on
+ * BSP's info we saved since BIOS is always setting the same value
+ * for all threads/cores
+ */
+ apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+ h = lvtthmr_init;
+
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG
"CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
- cpu, tm2 ? "TM2" : "TM1");
+ printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
+ tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
-obj-y := main.o if.o generic.o state.o cleanup.o
+obj-y := main.o if.o generic.o cleanup.o
obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static struct mtrr_ops amd_mtrr_ops = {
+static const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
return 0;
}
-static struct mtrr_ops centaur_mtrr_ops = {
+static const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c74aad..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
+#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -34,11 +34,6 @@
#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
-static struct res_range __initdata range[RANGE_NUM];
+static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,117 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-static int __init
-add_range(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- /* Out of slots: */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- int i;
-
- /* Try to merge it with old one: */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* Need to add it: */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* Find the new spare: */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -188,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* Take out UC ranges: */
@@ -217,51 +106,43 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
- /* get new range num */
- nr_range = 0;
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- nr_range++;
- }
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
}
/* sort the ranges */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ nr_range = clean_sort_range(range, RANGE_NUM);
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
- range[i].start, range[i].end + 1);
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
+ sum += range[i].end - range[i].start;
return sum;
}
@@ -590,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
@@ -608,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
+ range[i].end - range[i].start);
}
/* Write the last range: */
@@ -689,8 +570,6 @@ static int __init mtrr_need_cleanup(void)
continue;
if (!size)
type = MTRR_NUM_TYPES;
- if (type == MTRR_TYPE_WRPROT)
- type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -713,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct res_range range_new[RANGE_NUM];
+ static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
@@ -840,13 +719,13 @@ int __init mtrr_cleanup(unsigned address_bits)
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
/* Sort the ranges: */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ sort_range(range, nr_range);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM coverred: %ldM\n",
+ printk(KERN_INFO "total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
@@ -1060,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1074,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
post_set();
}
-static struct mtrr_ops cyrix_mtrr_ops = {
+static const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
.set_all = cyrix_set_all,
.set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..9aa5dc76ff4a 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+ printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
mask_lo = tmp;
}
}
@@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void)
static unsigned long cr4;
-static DEFINE_SPINLOCK(set_atomicity_lock);
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
/*
* Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
* changes to the way the kernel boots
*/
- spin_lock(&set_atomicity_lock);
+ raw_spin_lock(&set_atomicity_lock);
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
- spin_unlock(&set_atomicity_lock);
+ raw_spin_unlock(&set_atomicity_lock);
}
static void generic_set_all(void)
@@ -752,7 +752,7 @@ int positive_have_wrcomb(void)
/*
* Generic structure...
*/
-struct mtrr_ops generic_mtrr_ops = {
+const struct mtrr_ops generic_mtrr_ops = {
.use_intel_if = 1,
.set_all = generic_set_all,
.get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index f04e72527604..e006e56f699c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,7 @@
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/string.h>
#include <linux/init.h>
#define LINE_SIZE 80
@@ -96,17 +97,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
unsigned long long base, size;
char *ptr;
char line[LINE_SIZE];
+ int length;
size_t linelen;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!len)
- return -EINVAL;
memset(line, 0, LINE_SIZE);
- if (len > LINE_SIZE)
- len = LINE_SIZE;
- if (copy_from_user(line, buf, len - 1))
+
+ length = len;
+ length--;
+
+ if (length > LINE_SIZE - 1)
+ length = LINE_SIZE - 1;
+
+ if (length < 0)
+ return -EINVAL;
+
+ if (copy_from_user(line, buf, length))
return -EFAULT;
linelen = strlen(line);
@@ -126,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return -EINVAL;
base = simple_strtoull(line + 5, &ptr, 0);
- while (isspace(*ptr))
- ptr++;
+ ptr = skip_spaces(ptr);
if (strncmp(ptr, "size=", 5))
return -EINVAL;
@@ -135,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
size = simple_strtoull(ptr + 5, &ptr, 0);
if ((base & 0xfff) || (size & 0xfff))
return -EINVAL;
- while (isspace(*ptr))
- ptr++;
+ ptr = skip_spaces(ptr);
if (strncmp(ptr, "type=", 5))
return -EINVAL;
- ptr += 5;
- while (isspace(*ptr))
- ptr++;
+ ptr = skip_spaces(ptr + 5);
for (i = 0; i < MTRR_NUM_TYPES; ++i) {
if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
-static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
-struct mtrr_ops *mtrr_if;
+const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
-void set_mtrr_ops(struct mtrr_ops *ops)
+void set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
/**
* ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
extern int generic_validate_add_page(unsigned long base, unsigned long size,
unsigned int type);
-extern struct mtrr_ops generic_mtrr_ops;
+extern const struct mtrr_ops generic_mtrr_ops;
extern int positive_have_wrcomb(void);
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
-extern void set_mtrr_ops(struct mtrr_ops *ops);
+extern void set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
-extern struct mtrr_ops *mtrr_if;
+extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor-cyrix.h>
-#include <asm/processor-flags.h>
-#include <asm/mtrr.h>
-#include <asm/msr.h>
-
-#include "mtrr.h"
-
-/* Put the processor into a state where MTRRs can be safely set */
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
-{
- unsigned int cr0;
-
- /* Disable interrupts locally */
- local_irq_save(ctxt->flags);
-
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- ctxt->cr4val = read_cr4();
- write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
- }
-
- /*
- * Disable and flush caches. Note that wbinvd flushes the TLBs
- * as a side-effect
- */
- cr0 = read_cr0() | X86_CR0_CD;
- wbinvd();
- write_cr0(cr0);
- wbinvd();
-
- if (use_intel()) {
- /* Save MTRR state */
- rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
- } else {
- /*
- * Cyrix ARRs -
- * everything else were excluded at the top
- */
- ctxt->ccr3 = getCx86(CX86_CCR3);
- }
- }
-}
-
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
-{
- if (use_intel()) {
- /* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
- ctxt->deftype_hi);
- } else {
- if (is_cpu(CYRIX)) {
- /* Cyrix ARRs - everything else were excluded at the top */
- setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
- }
- }
-}
-
-/* Restore the processor after a set_mtrr_prepare */
-void set_mtrr_done(struct set_mtrr_context *ctxt)
-{
- if (use_intel() || is_cpu(CYRIX)) {
-
- /* Flush caches and TLBs */
- wbinvd();
-
- /* Restore MTRRdefType */
- if (use_intel()) {
- /* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
- ctxt->deftype_hi);
- } else {
- /*
- * Cyrix ARRs -
- * everything else was excluded at the top
- */
- setCx86(CX86_CCR3, ctxt->ccr3);
- }
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
-
- /* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(ctxt->cr4val);
- }
- /* Re-enable interrupts locally (if enabled previously) */
- local_irq_restore(ctxt->flags);
-}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..60398a0d947c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
*
* For licencing details see kernel-base/COPYING
*/
@@ -22,6 +23,7 @@
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
+#include <linux/bitops.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -68,15 +70,60 @@ struct debug_store {
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
+struct event_constraint {
+ union {
+ unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 idxmsk64;
+ };
+ u64 code;
+ u64 cmask;
+ int weight;
+};
+
+struct amd_nb {
+ int nb_id; /* NorthBridge id */
+ int refcnt; /* reference count */
+ struct perf_event *owners[X86_PMC_IDX_MAX];
+ struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
struct cpu_hw_events {
- struct perf_event *events[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
struct debug_store *ds;
+
+ int n_events;
+ int n_added;
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ u64 tags[X86_PMC_IDX_MAX];
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+ struct amd_nb *amd_nb;
};
+#define __EVENT_CONSTRAINT(c, n, m, w) {\
+ { .idxmsk64 = (n) }, \
+ .code = (c), \
+ .cmask = (m), \
+ .weight = (w), \
+}
+
+#define EVENT_CONSTRAINT(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+
+#define INTEL_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+
+#define FIXED_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+
+#define EVENT_CONSTRAINT_END \
+ EVENT_CONSTRAINT(0, 0, 0)
+
+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->cmask; (e)++)
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -86,8 +133,8 @@ struct x86_pmu {
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
void (*enable_all)(void);
- void (*enable)(struct hw_perf_event *, int);
- void (*disable)(struct hw_perf_event *, int);
+ void (*enable)(struct perf_event *);
+ void (*disable)(struct perf_event *);
unsigned eventsel;
unsigned perfctr;
u64 (*event_map)(int);
@@ -102,78 +149,28 @@ struct x86_pmu {
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
-};
-static struct x86_pmu x86_pmu __read_mostly;
+ struct event_constraint *
+ (*get_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
-static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
- .enabled = 1,
-};
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+ struct event_constraint *event_constraints;
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+ void (*cpu_prepare)(int cpu);
+ void (*cpu_starting)(int cpu);
+ void (*cpu_dying)(int cpu);
+ void (*cpu_dead)(int cpu);
};
-static u64 p6_pmu_event_map(int hw_event)
-{
- return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT 0x0000002EULL
-
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_REG_MASK)
-
- return hw_event & P6_EVNTSEL_MASK;
-}
-
+static struct x86_pmu x86_pmu __read_mostly;
-/*
- * Intel PerfMon v3. Used on Core2 and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+ .enabled = 1,
};
-static u64 intel_pmu_event_map(int hw_event)
-{
- return intel_perfmon_event_map[hw_event];
-}
+static int x86_perf_event_set_period(struct perf_event *event);
/*
* Generalized hw caching related hw_event table, filled
@@ -190,435 +187,18 @@ static u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
-static const u64 nehalem_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
- [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
- [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
- [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static const u64 core2_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static const u64 atom_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK \
- (CORE_EVNTSEL_EVENT_MASK | \
- CORE_EVNTSEL_UNIT_MASK | \
- CORE_EVNTSEL_EDGE_MASK | \
- CORE_EVNTSEL_INV_MASK | \
- CORE_EVNTSEL_REG_MASK)
-
- return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static const u64 amd_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
- [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
- [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
- [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
- [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
- return amd_perfmon_event_map[hw_event];
-}
-
-static u64 amd_pmu_raw_event(u64 hw_event)
-{
-#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
-#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
-#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK \
- (K7_EVNTSEL_EVENT_MASK | \
- K7_EVNTSEL_UNIT_MASK | \
- K7_EVNTSEL_EDGE_MASK | \
- K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_REG_MASK)
-
- return hw_event & K7_EVNTSEL_MASK;
-}
-
/*
* Propagate event elapsed time into the generic event.
* Can only be executed on the CPU where the event is active.
* Returns the delta events processed.
*/
static u64
-x86_perf_event_update(struct perf_event *event,
- struct hw_perf_event *hwc, int idx)
+x86_perf_event_update(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
int shift = 64 - x86_pmu.event_bits;
u64 prev_raw_count, new_raw_count;
+ int idx = hwc->idx;
s64 delta;
if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -718,7 +298,7 @@ static inline bool bts_available(void)
return x86_pmu.enable_bts != NULL;
}
-static inline void init_debug_store_on_cpu(int cpu)
+static void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -730,7 +310,7 @@ static inline void init_debug_store_on_cpu(int cpu)
(u32)((u64)(unsigned long)ds >> 32));
}
-static inline void fini_debug_store_on_cpu(int cpu)
+static void fini_debug_store_on_cpu(int cpu)
{
if (!per_cpu(cpu_hw_events, cpu).ds)
return;
@@ -859,42 +439,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
-static void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= X86_DEBUGCTL_TR;
- debugctlmsr |= X86_DEBUGCTL_BTS;
- debugctlmsr |= X86_DEBUGCTL_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
- X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
/*
* Setup the hardware configuration for a given attr_type
*/
@@ -932,6 +476,10 @@ static int __hw_perf_event_init(struct perf_event *event)
*/
hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+ hwc->idx = -1;
+ hwc->last_cpu = -1;
+ hwc->last_tag = ~0ULL;
+
/*
* Count user and OS events unless requested not to.
*/
@@ -960,6 +508,9 @@ static int __hw_perf_event_init(struct perf_event *event)
*/
if (attr->type == PERF_TYPE_RAW) {
hwc->config |= x86_pmu.raw_event(attr->config);
+ if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
+ perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
return 0;
}
@@ -999,216 +550,314 @@ static int __hw_perf_event_init(struct perf_event *event)
return 0;
}
-static void p6_pmu_disable_all(void)
+static void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val;
-
- if (!cpuc->enabled)
- return;
+ int idx;
- cpuc->enabled = 0;
- barrier();
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ u64 val;
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(x86_pmu.eventsel + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu.eventsel + idx, val);
+ }
}
-static void intel_pmu_disable_all(void)
+void hw_perf_disable(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ if (!x86_pmu_initialized())
+ return;
+
if (!cpuc->enabled)
return;
+ cpuc->n_added = 0;
cpuc->enabled = 0;
barrier();
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
- intel_pmu_disable_bts();
+ x86_pmu.disable_all();
}
-static void amd_pmu_disable_all(void)
+static void x86_pmu_enable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- if (!cpuc->enabled)
- return;
-
- cpuc->enabled = 0;
- /*
- * ensure we write the disable before we start disabling the
- * events proper, so that amd_pmu_enable_event() does the
- * right thing.
- */
- barrier();
-
for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ struct perf_event *event = cpuc->events[idx];
u64 val;
if (!test_bit(idx, cpuc->active_mask))
continue;
- rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
- if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
- continue;
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+
+ val = event->hw.config;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu.eventsel + idx, val);
}
}
-void hw_perf_disable(void)
+static const struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
{
- if (!x86_pmu_initialized())
- return;
- return x86_pmu.disable_all();
+ return event->pmu == &pmu;
}
-static void p6_pmu_enable_all(void)
+static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long val;
+ struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int i, j, w, wmax, num = 0;
+ struct hw_perf_event *hwc;
- if (cpuc->enabled)
- return;
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- cpuc->enabled = 1;
- barrier();
+ for (i = 0; i < n; i++) {
+ c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ constraints[i] = c;
+ }
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ hwc = &cpuc->event_list[i]->hw;
+ c = constraints[i];
-static void intel_pmu_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
- if (cpuc->enabled)
- return;
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
- cpuc->enabled = 1;
- barrier();
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+ if (i == n)
+ goto done;
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- struct perf_event *event =
- cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ /*
+ * begin slow path
+ */
- if (WARN_ON_ONCE(!event))
- return;
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- intel_pmu_enable_bts(event->hw.config);
- }
-}
+ /*
+ * weight = number of possible counters
+ *
+ * 1 = most constrained, only works on one counter
+ * wmax = least constrained, works on any counter
+ *
+ * assign events to counters starting with most
+ * constrained events.
+ */
+ wmax = x86_pmu.num_events;
-static void amd_pmu_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
+ /*
+ * when fixed event counters are present,
+ * wmax is incremented by 1 to account
+ * for one more choice
+ */
+ if (x86_pmu.num_events_fixed)
+ wmax++;
- if (cpuc->enabled)
- return;
+ for (w = 1, num = n; num && w <= wmax; w++) {
+ /* for each event */
+ for (i = 0; num && i < n; i++) {
+ c = constraints[i];
+ hwc = &cpuc->event_list[i]->hw;
- cpuc->enabled = 1;
- barrier();
+ if (c->weight != w)
+ continue;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- struct perf_event *event = cpuc->events[idx];
- u64 val;
+ for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (!test_bit(j, used_mask))
+ break;
+ }
- if (!test_bit(idx, cpuc->active_mask))
- continue;
+ if (j == X86_PMC_IDX_MAX)
+ break;
- val = event->hw.config;
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ __set_bit(j, used_mask);
+
+ if (assign)
+ assign[i] = j;
+ num--;
+ }
}
+done:
+ /*
+ * scheduling failed or is just a simulation,
+ * free resources if necessary
+ */
+ if (!assign || num) {
+ for (i = 0; i < n; i++) {
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
+ }
+ }
+ return num ? -ENOSPC : 0;
}
-void hw_perf_enable(void)
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
- if (!x86_pmu_initialized())
- return;
- x86_pmu.enable_all();
-}
+ struct perf_event *event;
+ int n, max_count;
-static inline u64 intel_pmu_get_status(void)
-{
- u64 status;
+ max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ /* current number of events already accepted */
+ n = cpuc->n_events;
- return status;
-}
+ if (is_x86_event(leader)) {
+ if (n >= max_count)
+ return -ENOSPC;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
-static inline void intel_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (!is_x86_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
-static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx,
- hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
+ if (n >= max_count)
+ return -ENOSPC;
-static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
-{
- (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
}
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct cpu_hw_events *cpuc, int i)
{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, mask;
+ struct hw_perf_event *hwc = &event->hw;
- mask = 0xfULL << (idx * 4);
+ hwc->idx = cpuc->assign[i];
+ hwc->last_cpu = smp_processor_id();
+ hwc->last_tag = ++cpuc->tags[i];
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+ if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that event_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->event_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ } else {
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->event_base = x86_pmu.perfctr;
+ }
}
-static inline void
-p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+ struct cpu_hw_events *cpuc,
+ int i)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val = P6_NOP_EVENT;
-
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
+ return hwc->idx == cpuc->assign[i] &&
+ hwc->last_cpu == smp_processor_id() &&
+ hwc->last_tag == cpuc->tags[i];
}
-static inline void
-intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static int x86_pmu_start(struct perf_event *event);
+static void x86_pmu_stop(struct perf_event *event);
+
+void hw_perf_enable(void)
{
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- intel_pmu_disable_bts();
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i;
+
+ if (!x86_pmu_initialized())
return;
- }
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_disable_fixed(hwc, idx);
+ if (cpuc->enabled)
return;
+
+ if (cpuc->n_added) {
+ int n_running = cpuc->n_events - cpuc->n_added;
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < n_running; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ /*
+ * we can avoid reprogramming counter if:
+ * - assigned same counter as last time
+ * - running on same CPU as last time
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+ match_prev_assignment(hwc, cpuc, i))
+ continue;
+
+ x86_pmu_stop(event);
+ }
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (!match_prev_assignment(hwc, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
+
+ x86_pmu_start(event);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
}
- x86_pmu_disable_event(hwc, idx);
+ cpuc->enabled = 1;
+ barrier();
+
+ x86_pmu.enable_all();
+}
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+{
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
}
-static inline void
-amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static inline void x86_pmu_disable_event(struct perf_event *event)
{
- x86_pmu_disable_event(hwc, idx);
+ struct hw_perf_event *hwc = &event->hw;
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1218,18 +867,18 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
* To be called with the event disabled in hw:
*/
static int
-x86_perf_event_set_period(struct perf_event *event,
- struct hw_perf_event *hwc, int idx)
+x86_perf_event_set_period(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
s64 left = atomic64_read(&hwc->period_left);
s64 period = hwc->sample_period;
- int err, ret = 0;
+ int err, ret = 0, idx = hwc->idx;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
/*
- * If we are way outside a reasoable range then just skip forward:
+ * If we are way outside a reasonable range then just skip forward:
*/
if (unlikely(left <= -period)) {
left = period;
@@ -1269,157 +918,63 @@ x86_perf_event_set_period(struct perf_event *event,
return ret;
}
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
-{
- int idx = __idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
- int err;
-
- /*
- * Enable IRQ generation (0x8),
- * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
- * if requested:
- */
- bits = 0x8ULL;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
- bits |= 0x2;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
- bits |= 0x1;
- bits <<= (idx * 4);
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void x86_pmu_enable_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- u64 val;
-
- val = hwc->config;
if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + idx, val);
+ __x86_pmu_enable_event(&event->hw);
}
-
-static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
-{
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- if (!__get_cpu_var(cpu_hw_events).enabled)
- return;
-
- intel_pmu_enable_bts(hwc->config);
- return;
- }
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc, idx);
- return;
- }
-
- x86_pmu_enable_event(hwc, idx);
-}
-
-static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+/*
+ * activate a single event
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ *
+ * Called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+static int x86_pmu_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;
- if (cpuc->enabled)
- x86_pmu_enable_event(hwc, idx);
-}
-
-static int
-fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
-{
- unsigned int hw_event;
-
- hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+ hwc = &event->hw;
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (hwc->sample_period == 1)))
- return X86_PMC_IDX_FIXED_BTS;
+ n0 = cpuc->n_events;
+ n = collect_events(cpuc, event, false);
+ if (n < 0)
+ return n;
- if (!x86_pmu.num_events_fixed)
- return -1;
+ ret = x86_schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
- return X86_PMC_IDX_FIXED_INSTRUCTIONS;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
- return X86_PMC_IDX_FIXED_CPU_CYCLES;
- if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
- return X86_PMC_IDX_FIXED_BUS_CYCLES;
+ cpuc->n_events = n;
+ cpuc->n_added += n - n0;
- return -1;
+ return 0;
}
-/*
- * Find a PMC slot for the freshly enabled / scheduled in event:
- */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_start(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- int idx;
-
- idx = fixed_mode_idx(event, hwc);
- if (idx == X86_PMC_IDX_FIXED_BTS) {
- /* BTS is already occupied. */
- if (test_and_set_bit(idx, cpuc->used_mask))
- return -EAGAIN;
-
- hwc->config_base = 0;
- hwc->event_base = 0;
- hwc->idx = idx;
- } else if (idx >= 0) {
- /*
- * Try to get the fixed event, if that is already taken
- * then try to get a generic event:
- */
- if (test_and_set_bit(idx, cpuc->used_mask))
- goto try_generic;
-
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- hwc->idx = idx;
- } else {
- idx = hwc->idx;
- /* Try to get the previous generic event again */
- if (test_and_set_bit(idx, cpuc->used_mask)) {
-try_generic:
- idx = find_first_zero_bit(cpuc->used_mask,
- x86_pmu.num_events);
- if (idx == x86_pmu.num_events)
- return -EAGAIN;
-
- set_bit(idx, cpuc->used_mask);
- hwc->idx = idx;
- }
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
+ int idx = event->hw.idx;
- perf_events_lapic_init();
-
- x86_pmu.disable(hwc, idx);
+ if (idx == -1)
+ return -EAGAIN;
+ x86_perf_event_set_period(event);
cpuc->events[idx] = event;
- set_bit(idx, cpuc->active_mask);
-
- x86_perf_event_set_period(event, hwc, idx);
- x86_pmu.enable(hwc, idx);
-
+ __set_bit(idx, cpuc->active_mask);
+ x86_pmu.enable(event);
perf_event_update_userpage(event);
return 0;
@@ -1427,14 +982,8 @@ try_generic:
static void x86_pmu_unthrottle(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
- cpuc->events[hwc->idx] != event))
- return;
-
- x86_pmu.enable(hwc, hwc->idx);
+ int ret = x86_pmu_start(event);
+ WARN_ON_ONCE(ret);
}
void perf_event_print_debug(void)
@@ -1464,7 +1013,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
}
- pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
for (idx = 0; idx < x86_pmu.num_events; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1488,254 +1037,50 @@ void perf_event_print_debug(void)
local_irq_restore(flags);
}
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
-{
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return;
-
- if (!ds)
- return;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return;
-
- ds->bts_index = ds->bts_buffer_base;
-
-
- data.period = event->hw.last_period;
- data.addr = 0;
- regs.ip = 0;
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event,
- header.size * (top - at), 1, 1))
- return;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
-}
-
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
- /*
- * Must be done before we disable, otherwise the nmi handler
- * could reenable again:
- */
- clear_bit(idx, cpuc->active_mask);
- x86_pmu.disable(hwc, idx);
+ if (!__test_and_clear_bit(idx, cpuc->active_mask))
+ return;
- /*
- * Make sure the cleared pointer becomes visible before we
- * (potentially) free the event:
- */
- barrier();
+ x86_pmu.disable(event);
/*
* Drain the remaining delta count out of a event
* that we are disabling:
*/
- x86_perf_event_update(event, hwc, idx);
-
- /* Drain the remaining BTS records. */
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
- intel_pmu_drain_bts_buffer(cpuc);
+ x86_perf_event_update(event);
cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
-
- perf_event_update_userpage(event);
}
-/*
- * Save and restart an expired event. Called by NMI contexts,
- * so it has to be careful about preempting normal event ops:
- */
-static int intel_pmu_save_and_restart(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
- int ret;
-
- x86_perf_event_update(event, hwc, idx);
- ret = x86_perf_event_set_period(event, hwc, idx);
-
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- intel_pmu_enable_event(hwc, idx);
-
- return ret;
-}
-
-static void intel_pmu_reset(void)
-{
- struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_events)
- return;
-
- local_irq_save(flags);
-
- printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
- checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
- checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
- }
- if (ds)
- ds->bts_index = ds->bts_buffer_base;
-
- local_irq_restore(flags);
-}
-
-static int p6_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int idx, handled = 0;
- u64 val;
-
- data.addr = 0;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- event = cpuc->events[idx];
- hwc = &event->hw;
-
- val = x86_perf_event_update(event, hwc, idx);
- if (val & (1ULL << (x86_pmu.event_bits - 1)))
- continue;
-
- /*
- * event overflow
- */
- handled = 1;
- data.period = event->hw.last_period;
-
- if (!x86_perf_event_set_period(event, hwc, idx))
- continue;
-
- if (perf_event_overflow(event, 1, &data, regs))
- p6_pmu_disable_event(hwc, idx);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
+static void x86_pmu_disable(struct perf_event *event)
{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int bit, loops;
- u64 ack, status;
-
- data.addr = 0;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- perf_disable();
- intel_pmu_drain_bts_buffer(cpuc);
- status = intel_pmu_get_status();
- if (!status) {
- perf_enable();
- return 0;
- }
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int i;
- loops = 0;
-again:
- if (++loops > 100) {
- WARN_ONCE(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- intel_pmu_reset();
- perf_enable();
- return 1;
- }
+ x86_pmu_stop(event);
- inc_irq_stat(apic_perf_irqs);
- ack = status;
- for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_event *event = cpuc->events[bit];
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i]) {
- clear_bit(bit, (unsigned long *) &status);
- if (!test_bit(bit, cpuc->active_mask))
- continue;
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
- if (!intel_pmu_save_and_restart(event))
- continue;
-
- data.period = event->hw.last_period;
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
- if (perf_event_overflow(event, 1, &data, regs))
- intel_pmu_disable_event(&event->hw, bit);
+ --cpuc->n_events;
+ break;
+ }
}
-
- intel_pmu_ack_status(ack);
-
- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
- perf_enable();
-
- return 1;
+ perf_event_update_userpage(event);
}
-static int amd_pmu_handle_irq(struct pt_regs *regs)
+static int x86_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
@@ -1744,7 +1089,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- data.addr = 0;
+ perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
@@ -1755,7 +1100,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
event = cpuc->events[idx];
hwc = &event->hw;
- val = x86_perf_event_update(event, hwc, idx);
+ val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.event_bits - 1)))
continue;
@@ -1765,11 +1110,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
handled = 1;
data.period = event->hw.last_period;
- if (!x86_perf_event_set_period(event, hwc, idx))
+ if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, 1, &data, regs))
- amd_pmu_disable_event(hwc, idx);
+ x86_pmu_stop(event);
}
if (handled)
@@ -1852,196 +1197,184 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.priority = 1
};
-static struct x86_pmu p6_pmu = {
- .name = "p6",
- .handle_irq = p6_pmu_handle_irq,
- .disable_all = p6_pmu_disable_all,
- .enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_event,
- .disable = p6_pmu_disable_event,
- .eventsel = MSR_P6_EVNTSEL0,
- .perfctr = MSR_P6_PERFCTR0,
- .event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
- .max_events = ARRAY_SIZE(p6_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 31) - 1,
- .version = 0,
- .num_events = 2,
- /*
- * Events have 40 bits implemented. However they are designed such
- * that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of a event for P6-like PMU is 32 bits only.
- *
- * See IA-32 Intel Architecture Software developer manual Vol 3B
- */
- .event_bits = 32,
- .event_mask = (1ULL << 32) - 1,
-};
+static struct event_constraint unconstrained;
+static struct event_constraint emptyconstraint;
-static struct x86_pmu intel_pmu = {
- .name = "Intel",
- .handle_irq = intel_pmu_handle_irq,
- .disable_all = intel_pmu_disable_all,
- .enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_event,
- .disable = intel_pmu_disable_event,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .enable_bts = intel_pmu_enable_bts,
- .disable_bts = intel_pmu_disable_bts,
-};
+static struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct event_constraint *c;
-static struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = amd_pmu_handle_irq,
- .disable_all = amd_pmu_disable_all,
- .enable_all = amd_pmu_enable_all,
- .enable = amd_pmu_enable_event,
- .disable = amd_pmu_disable_event,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .event_map = amd_pmu_event_map,
- .raw_event = amd_pmu_raw_event,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_events = 4,
- .event_bits = 48,
- .event_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
-};
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
-static int p6_pmu_init(void)
+ return &unconstrained;
+}
+
+static int x86_event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx)
{
- switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- break;
- case 9:
- case 13:
- /* Pentium M */
- break;
- default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
+ int ret = 0;
- x86_pmu = p6_pmu;
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = smp_processor_id();
+ event->tstamp_running += event->ctx->time - event->tstamp_stopped;
- if (!cpu_has_apic) {
- pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
- pr_info("no hardware sampling interrupt available.\n");
- x86_pmu.apic = 0;
- }
+ if (!is_x86_event(event))
+ ret = event->pmu->enable(event);
- return 0;
+ if (!ret && !is_software_event(event))
+ cpuctx->active_oncpu++;
+
+ if (!ret && event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return ret;
}
-static int intel_pmu_init(void)
+static void x86_event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx)
{
- union cpuid10_edx edx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int ebx;
- int version;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- /* check for P6 processor family */
- if (boot_cpu_data.x86 == 6) {
- return p6_pmu_init();
- } else {
- return -ENODEV;
- }
- }
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
- /*
- * Check whether the Architectural PerfMon supports
- * Branch Misses Retired hw_event or not.
- */
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
- return -ENODEV;
+ if (!is_x86_event(event))
+ event->pmu->disable(event);
- version = eax.split.version_id;
- if (version < 2)
- return -ENODEV;
+ event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
- x86_pmu = intel_pmu;
- x86_pmu.version = version;
- x86_pmu.num_events = eax.split.num_events;
- x86_pmu.event_bits = eax.split.bit_width;
- x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+/*
+ * Called to enable a whole group of events.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ *
+ * called with PMU disabled. If successful and return value 1,
+ * then guaranteed to call perf_enable() and hw_perf_enable()
+ */
+int hw_perf_group_sched_in(struct perf_event *leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *sub;
+ int assign[X86_PMC_IDX_MAX];
+ int n0, n1, ret;
+
+ /* n0 = total number of events */
+ n0 = collect_events(cpuc, leader, true);
+ if (n0 < 0)
+ return n0;
+
+ ret = x86_schedule_events(cpuc, n0, assign);
+ if (ret)
+ return ret;
+
+ ret = x86_event_sched_in(leader, cpuctx);
+ if (ret)
+ return ret;
+
+ n1 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state > PERF_EVENT_STATE_OFF) {
+ ret = x86_event_sched_in(sub, cpuctx);
+ if (ret)
+ goto undo;
+ ++n1;
+ }
+ }
/*
- * Quirk: v2 perfmon does not report fixed-purpose events, so
- * assume at least 3 events:
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
*/
- x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+ memcpy(cpuc->assign, assign, n0*sizeof(int));
+
+ cpuc->n_events = n0;
+ cpuc->n_added += n1;
+ ctx->nr_active += n1;
/*
- * Install the hw-cache-events table:
+ * 1 means successful and events are active
+ * This is not quite true because we defer
+ * actual activation until hw_perf_enable() but
+ * this way we* ensure caller won't try to enable
+ * individual events
*/
- switch (boot_cpu_data.x86_model) {
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
- memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Core2 events, ");
+ return 1;
+undo:
+ x86_event_sched_out(leader, cpuctx);
+ n0 = 1;
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ x86_event_sched_out(sub, cpuctx);
+ if (++n0 == n1)
+ break;
+ }
+ }
+ return ret;
+}
+
+#include "perf_event_amd.c"
+#include "perf_event_p6.c"
+#include "perf_event_intel.c"
+
+static int __cpuinit
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ if (x86_pmu.cpu_prepare)
+ x86_pmu.cpu_prepare(cpu);
+ break;
+
+ case CPU_STARTING:
+ if (x86_pmu.cpu_starting)
+ x86_pmu.cpu_starting(cpu);
+ break;
+
+ case CPU_DYING:
+ if (x86_pmu.cpu_dying)
+ x86_pmu.cpu_dying(cpu);
break;
- default:
- case 26:
- memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- pr_cont("Nehalem/Corei7 events, ");
+ case CPU_DEAD:
+ if (x86_pmu.cpu_dead)
+ x86_pmu.cpu_dead(cpu);
break;
- case 28:
- memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- pr_cont("Atom events, ");
+ default:
break;
}
- return 0;
+
+ return NOTIFY_OK;
}
-static int amd_pmu_init(void)
+static void __init pmu_check_apic(void)
{
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
-
- x86_pmu = amd_pmu;
-
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
+ if (cpu_has_apic)
+ return;
- return 0;
+ x86_pmu.apic = 0;
+ pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+ pr_info("no hardware sampling interrupt available.\n");
}
void __init init_hw_perf_events(void)
{
+ struct event_constraint *c;
int err;
pr_info("Performance Events: ");
@@ -2061,6 +1394,8 @@ void __init init_hw_perf_events(void)
return;
}
+ pmu_check_apic();
+
pr_cont("%s PMU driver.\n", x86_pmu.name);
if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2084,6 +1419,20 @@ void __init init_hw_perf_events(void)
perf_events_lapic_init();
register_die_notifier(&perf_event_nmi_notifier);
+ unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
+ 0, x86_pmu.num_events);
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (c->cmask != INTEL_ARCH_FIXED_MASK)
+ continue;
+
+ c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
+ c->weight += x86_pmu.num_events;
+ }
+ }
+
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.event_bits);
pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2091,25 +1440,92 @@ void __init init_hw_perf_events(void)
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
pr_info("... event mask: %016Lx\n", perf_event_mask);
+
+ perf_cpu_notifier(x86_pmu_notifier);
}
static inline void x86_pmu_read(struct perf_event *event)
{
- x86_perf_event_update(event, &event->hw, event->hw.idx);
+ x86_perf_event_update(event);
}
static const struct pmu pmu = {
.enable = x86_pmu_enable,
.disable = x86_pmu_disable,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
.read = x86_pmu_read,
.unthrottle = x86_pmu_unthrottle,
};
+/*
+ * validate a single event group
+ *
+ * validation include:
+ * - check events are compatible which each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events *fake_cpuc;
+ int ret, n;
+
+ ret = -ENOMEM;
+ fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+ if (!fake_cpuc)
+ goto out;
+
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ ret = -ENOSPC;
+ n = collect_events(fake_cpuc, leader, true);
+ if (n < 0)
+ goto out_free;
+
+ fake_cpuc->n_events = n;
+ n = collect_events(fake_cpuc, event, false);
+ if (n < 0)
+ goto out_free;
+
+ fake_cpuc->n_events = n;
+
+ ret = x86_schedule_events(fake_cpuc, n, NULL);
+
+out_free:
+ kfree(fake_cpuc);
+out:
+ return ret;
+}
+
const struct pmu *hw_perf_event_init(struct perf_event *event)
{
+ const struct pmu *tmp;
int err;
err = __hw_perf_event_init(event);
+ if (!err) {
+ /*
+ * we temporarily connect event to its pmu
+ * such that validate_group() can classify
+ * it as an x86 event using is_x86_event()
+ */
+ tmp = event->pmu;
+ event->pmu = &pmu;
+
+ if (event->group_leader != event)
+ err = validate_group(event);
+
+ event->pmu = tmp;
+ }
if (err) {
if (event->destroy)
event->destroy(event);
@@ -2132,7 +1548,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
static void
@@ -2148,9 +1563,6 @@ static void backtrace_warning(void *data, char *msg)
static int backtrace_stack(void *data, char *name)
{
- per_cpu(in_nmi_frame, smp_processor_id()) =
- x86_is_stack_id(NMI_STACK, name);
-
return 0;
}
@@ -2158,9 +1570,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- if (per_cpu(in_nmi_frame, smp_processor_id()))
- return;
-
if (reliable)
callchain_store(entry, addr);
}
@@ -2170,6 +1579,7 @@ static const struct stacktrace_ops backtrace_ops = {
.warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
.address = backtrace_address,
+ .walk_stack = print_context_stack_bp,
};
#include "../dumpstack.h"
@@ -2180,7 +1590,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
callchain_store(entry, PERF_CONTEXT_KERNEL);
callchain_store(entry, regs->ip);
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+ dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
/*
@@ -2266,9 +1676,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
is_user = user_mode(regs);
- if (!current || current->pid == 0)
- return;
-
if (is_user && current->state != TASK_RUNNING)
return;
@@ -2295,7 +1702,16 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
-void hw_perf_event_setup_online(int cpu)
+#ifdef CONFIG_EVENT_TRACING
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
{
- init_debug_store_on_cpu(cpu);
+ regs->ip = ip;
+ /*
+ * perf_arch_fetch_caller_regs adds another call, we need to increment
+ * the skip level
+ */
+ regs->bp = rewind_frame_pointer(skip + 1);
+ regs->cs = __KERNEL_CS;
+ local_save_flags(regs->flags);
}
+#endif
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..573458f1caf2
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,406 @@
+#ifdef CONFIG_CPU_SUP_AMD
+
+static DEFINE_RAW_SPINLOCK(amd_nb_lock);
+
+static __initconst u64 amd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+ return amd_perfmon_event_map[hw_event];
+}
+
+static u64 amd_pmu_raw_event(u64 hw_event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_REG_MASK)
+
+ return hw_event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+ return (hwc->config & 0xe0) == 0xe0;
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ int i;
+
+ /*
+ * only care about NB events
+ */
+ if (!(nb && amd_is_nb_event(hwc)))
+ return;
+
+ /*
+ * need to scan whole list because event may not have
+ * been assigned during scheduling
+ *
+ * no race condition possible because event can only
+ * be removed on one CPU at a time AND PMU is disabled
+ * when we come here
+ */
+ for (i = 0; i < x86_pmu.num_events; i++) {
+ if (nb->owners[i] == event) {
+ cmpxchg(nb->owners+i, event, NULL);
+ break;
+ }
+ }
+}
+
+ /*
+ * AMD64 NorthBridge events need special treatment because
+ * counter access needs to be synchronized across all cores
+ * of a package. Refer to BKDG section 3.12
+ *
+ * NB events are events measuring L3 cache, Hypertransport
+ * traffic. They are identified by an event code >= 0xe00.
+ * They measure events on the NorthBride which is shared
+ * by all cores on a package. NB events are counted on a
+ * shared set of counters. When a NB event is programmed
+ * in a counter, the data actually comes from a shared
+ * counter. Thus, access to those counters needs to be
+ * synchronized.
+ *
+ * We implement the synchronization such that no two cores
+ * can be measuring NB events using the same counters. Thus,
+ * we maintain a per-NB allocation table. The available slot
+ * is propagated using the event_constraint structure.
+ *
+ * We provide only one choice for each NB event based on
+ * the fact that only NB events have restrictions. Consequently,
+ * if a counter is available, there is a guarantee the NB event
+ * will be assigned to it. If no slot is available, an empty
+ * constraint is returned and scheduling will eventually fail
+ * for this event.
+ *
+ * Note that all cores attached the same NB compete for the same
+ * counters to host NB events, this is why we use atomic ops. Some
+ * multi-chip CPUs may have more than one NB.
+ *
+ * Given that resources are allocated (cmpxchg), they must be
+ * eventually freed for others to use. This is accomplished by
+ * calling amd_put_event_constraints().
+ *
+ * Non NB events are not impacted by this restriction.
+ */
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ struct perf_event *old = NULL;
+ int max = x86_pmu.num_events;
+ int i, j, k = -1;
+
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(nb && amd_is_nb_event(hwc)))
+ return &unconstrained;
+
+ /*
+ * detect if already present, if so reuse
+ *
+ * cannot merge with actual allocation
+ * because of possible holes
+ *
+ * event can already be present yet not assigned (in hwc->idx)
+ * because of successive calls to x86_schedule_events() from
+ * hw_perf_group_sched_in() without hw_perf_enable()
+ */
+ for (i = 0; i < max; i++) {
+ /*
+ * keep track of first free slot
+ */
+ if (k == -1 && !nb->owners[i])
+ k = i;
+
+ /* already present, reuse */
+ if (nb->owners[i] == event)
+ goto done;
+ }
+ /*
+ * not present, so grab a new slot
+ * starting either at:
+ */
+ if (hwc->idx != -1) {
+ /* previous assignment */
+ i = hwc->idx;
+ } else if (k != -1) {
+ /* start from free slot found */
+ i = k;
+ } else {
+ /*
+ * event not found, no slot found in
+ * first pass, try again from the
+ * beginning
+ */
+ i = 0;
+ }
+ j = i;
+ do {
+ old = cmpxchg(nb->owners+i, NULL, event);
+ if (!old)
+ break;
+ if (++i == max)
+ i = 0;
+ } while (i != j);
+done:
+ if (!old)
+ return &nb->event_constraints[i];
+
+ return &emptyconstraint;
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+{
+ struct amd_nb *nb;
+ int i;
+
+ nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
+ if (!nb)
+ return NULL;
+
+ memset(nb, 0, sizeof(*nb));
+ nb->nb_id = nb_id;
+
+ /*
+ * initialize all possible NB constraints
+ */
+ for (i = 0; i < x86_pmu.num_events; i++) {
+ __set_bit(i, nb->event_constraints[i].idxmsk);
+ nb->event_constraints[i].weight = 1;
+ }
+ return nb;
+}
+
+static void amd_pmu_cpu_online(int cpu)
+{
+ struct cpu_hw_events *cpu1, *cpu2;
+ struct amd_nb *nb = NULL;
+ int i, nb_id;
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return;
+
+ /*
+ * function may be called too early in the
+ * boot process, in which case nb_id is bogus
+ */
+ nb_id = amd_get_nb_id(cpu);
+ if (nb_id == BAD_APICID)
+ return;
+
+ cpu1 = &per_cpu(cpu_hw_events, cpu);
+ cpu1->amd_nb = NULL;
+
+ raw_spin_lock(&amd_nb_lock);
+
+ for_each_online_cpu(i) {
+ cpu2 = &per_cpu(cpu_hw_events, i);
+ nb = cpu2->amd_nb;
+ if (!nb)
+ continue;
+ if (nb->nb_id == nb_id)
+ goto found;
+ }
+
+ nb = amd_alloc_nb(cpu, nb_id);
+ if (!nb) {
+ pr_err("perf_events: failed NB allocation for CPU%d\n", cpu);
+ raw_spin_unlock(&amd_nb_lock);
+ return;
+ }
+found:
+ nb->refcnt++;
+ cpu1->amd_nb = nb;
+
+ raw_spin_unlock(&amd_nb_lock);
+}
+
+static void amd_pmu_cpu_offline(int cpu)
+{
+ struct cpu_hw_events *cpuhw;
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return;
+
+ cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+ raw_spin_lock(&amd_nb_lock);
+
+ if (--cpuhw->amd_nb->refcnt == 0)
+ kfree(cpuhw->amd_nb);
+
+ cpuhw->amd_nb = NULL;
+
+ raw_spin_unlock(&amd_nb_lock);
+}
+
+static __initconst struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_events = 4,
+ .event_bits = 48,
+ .event_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = amd_get_event_constraints,
+ .put_event_constraints = amd_put_event_constraints,
+
+ .cpu_prepare = amd_pmu_cpu_online,
+ .cpu_dead = amd_pmu_cpu_offline,
+};
+
+static __init int amd_pmu_init(void)
+{
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ /* Events are common for all AMDs */
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static int amd_pmu_init(void)
+{
+ return 0;
+}
+
+#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..84bfde64a337
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,979 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static struct event_constraint intel_core_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /*
+ * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
+ * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
+ * ratio between these counters.
+ */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+ return intel_perfmon_event_map[hw_event];
+}
+
+static __initconst u64 westmere_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 hw_event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (INTEL_ARCH_EVTSEL_MASK | \
+ INTEL_ARCH_UNIT_MASK | \
+ INTEL_ARCH_EDGE_MASK | \
+ INTEL_ARCH_INV_MASK | \
+ INTEL_ARCH_CNT_MASK)
+
+ return hw_event & CORE_EVNTSEL_MASK;
+}
+
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= X86_DEBUGCTL_TR;
+ debugctlmsr |= X86_DEBUGCTL_BTS;
+ debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+ X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
+}
+
+static void intel_pmu_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_event *event =
+ cpuc->events[X86_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ intel_pmu_enable_bts(event->hw.config);
+ }
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ (void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!event)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= at)
+ return;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0);
+
+ data.period = event->hw.last_period;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event,
+ header.size * (top - at), 1, 1))
+ return;
+
+ for (; at < top; at++) {
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+}
+
+static inline void
+intel_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ intel_pmu_drain_bts_buffer();
+ return;
+ }
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc);
+ return;
+ }
+
+ x86_pmu_disable_event(event);
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= 0x4;
+
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (!__get_cpu_var(cpu_hw_events).enabled)
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc);
+ return;
+ }
+
+ __x86_pmu_enable_event(hwc);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_event *event)
+{
+ x86_perf_event_update(event);
+ return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+ struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_events)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int bit, loops;
+ u64 ack, status;
+
+ perf_sample_data_init(&data, 0);
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ intel_pmu_disable_all();
+ intel_pmu_drain_bts_buffer();
+ status = intel_pmu_get_status();
+ if (!status) {
+ intel_pmu_enable_all();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ intel_pmu_reset();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ data.period = event->hw.last_period;
+
+ if (perf_event_overflow(event, 1, &data, regs))
+ x86_pmu_stop(event);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ intel_pmu_enable_all();
+ return 1;
+}
+
+static struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static struct event_constraint *
+intel_special_constraints(struct perf_event *event)
+{
+ unsigned int hw_event;
+
+ hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+
+ if (unlikely((hw_event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (event->hw.sample_period == 1))) {
+
+ return &bts_constraint;
+ }
+ return NULL;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_special_constraints(event);
+ if (c)
+ return c;
+
+ return x86_get_event_constraints(cpuc, event);
+}
+
+static __initconst struct x86_pmu core_pmu = {
+ .name = "core",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .event_constraints = intel_core_event_constraints,
+};
+
+static __initconst struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_event,
+ .disable = intel_pmu_disable_event,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .enable_bts = intel_pmu_enable_bts,
+ .disable_bts = intel_pmu_disable_bts,
+ .get_event_constraints = intel_get_event_constraints,
+
+ .cpu_starting = init_debug_store_on_cpu,
+ .cpu_dying = fini_debug_store_on_cpu,
+};
+
+static __init int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ /* check for P6 processor family */
+ if (boot_cpu_data.x86 == 6) {
+ return p6_pmu_init();
+ } else {
+ return -ENODEV;
+ }
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired hw_event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ x86_pmu = core_pmu;
+ else
+ x86_pmu = intel_pmu;
+
+ x86_pmu.version = version;
+ x86_pmu.num_events = eax.split.num_events;
+ x86_pmu.event_bits = eax.split.bit_width;
+ x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose events, so
+ * assume at least 3 events:
+ */
+ if (version > 1)
+ x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 14: /* 65 nm core solo/duo, "Yonah" */
+ pr_cont("Core events, ");
+ break;
+
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_core2_event_constraints;
+ pr_cont("Core2 events, ");
+ break;
+
+ case 26: /* 45 nm nehalem, "Bloomfield" */
+ case 30: /* 45 nm nehalem, "Lynnfield" */
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28: /* Atom */
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("Atom events, ");
+ break;
+
+ case 37: /* 32 nm nehalem, "Clarkdale" */
+ case 44: /* 32 nm nehalem, "Gulftown" */
+ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ x86_pmu.event_constraints = intel_westmere_event_constraints;
+ pr_cont("Westmere events, ");
+ break;
+
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ }
+ return 0;
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int intel_pmu_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a330485d14da
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+ return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT 0x0000002EULL
+
+static u64 p6_pmu_raw_event(u64 hw_event)
+{
+#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define P6_EVNTSEL_INV_MASK 0x00800000ULL
+#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
+
+#define P6_EVNTSEL_MASK \
+ (P6_EVNTSEL_EVENT_MASK | \
+ P6_EVNTSEL_UNIT_MASK | \
+ P6_EVNTSEL_EDGE_MASK | \
+ P6_EVNTSEL_INV_MASK | \
+ P6_EVNTSEL_REG_MASK)
+
+ return hw_event & P6_EVNTSEL_MASK;
+}
+
+static struct event_constraint p6_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+ u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(void)
+{
+ unsigned long val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val = P6_NOP_EVENT;
+
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+}
+
+static __initconst struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .raw_event = p6_pmu_raw_event,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_events = 2,
+ /*
+ * Events have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of a event for P6-like PMU is 32 bits only.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B
+ */
+ .event_bits = 32,
+ .event_mask = (1ULL << 32) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = p6_event_constraints,
+};
+
+static __init int p6_pmu_init(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case 1:
+ case 3: /* Pentium Pro */
+ case 5:
+ case 6: /* Pentium II */
+ case 7:
+ case 8:
+ case 11: /* Pentium III */
+ case 9:
+ case 13:
+ /* Pentium M */
+ break;
+ default:
+ pr_cont("unsupported p6 CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ x86_pmu = p6_pmu;
+
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
return !test_bit(counter, perfctr_nmi_owner);
}
-
-/* checks the an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
- unsigned int counter;
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
cpu_nmi_set_wd_enabled();
apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
@@ -712,7 +701,7 @@ static void probe_nmi_watchdog(void)
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16)
+ boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
return;
wd_ops = &k7_wd_ops;
break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
early_init_transmeta(c);
- display_cacheinfo(c);
+ cpu_detect_cache_sizes(c);
/* Print CMS and CPU revision */
max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..83e5e628de73 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
{
unsigned int cpu;
struct cpuinfo_x86 *c;
- int ret = 0;
-
- lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
- if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- ret = -ENXIO; /* No such CPU */
- goto out;
- }
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+
c = &cpu_data(cpu);
if (c->cpuid_level < 0)
- ret = -EIO; /* CPUID not supported */
-out:
- unlock_kernel();
- return ret;
+ return -EIO; /* CPUID not supported */
+
+ return 0;
}
/*
@@ -192,7 +187,8 @@ static int __init cpuid_init(void)
int i, err = 0;
i = 0;
- if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
+ if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
+ "cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
err = -EBUSY;
@@ -221,7 +217,7 @@ out_class:
}
class_destroy(cpuid_class);
out_chrdev:
- unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
out:
return err;
}
@@ -233,7 +229,7 @@ static void __exit cpuid_exit(void)
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
- unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
-#include <asm/iommu.h>
-
+#include <asm/x86_init.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..cd97ce18c29d 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
/* Stores the physical address of elf header of crash image. */
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+static inline bool is_crashed_pfn_valid(unsigned long pfn)
+{
+#ifndef CONFIG_X86_PAE
+ /*
+ * non-PAE kdump kernel executed from a PAE one will crop high pte
+ * bits and poke unwanted space counting again from address 0, we
+ * don't want that. pte must fit into unsigned long. In fact the
+ * test checks high 12 bits for being zero (pfn will be shifted left
+ * by PAGE_SHIFT).
+ */
+ return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
+#else
+ return true;
+#endif
+}
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;
+ if (!is_crashed_pfn_valid(pfn))
+ return -EFAULT;
+
vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
if (!userbuf) {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
int cpu;
};
-static DEFINE_PER_CPU(struct ds_context *, cpu_context);
+static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
+ (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
struct ds_context *context = NULL;
struct ds_context *new_context = NULL;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo,
}
return bp;
}
+EXPORT_SYMBOL_GPL(print_context_stack);
+
+unsigned long
+print_context_stack_bp(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+ unsigned long *ret_addr = &frame->return_address;
+
+ while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
+ unsigned long addr = *ret_addr;
+
+ if (!__kernel_text_address(addr))
+ break;
+
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ ret_addr = &frame->return_address;
+ print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ }
+
+ return (unsigned long)frame;
+}
+EXPORT_SYMBOL_GPL(print_context_stack_bp);
static void
@@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+ .walk_stack = print_context_stack,
};
void
@@ -188,7 +215,7 @@ void dump_stack(void)
}
EXPORT_SYMBOL(dump_stack);
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
@@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void)
/* racy, but better than risking deadlock. */
raw_local_irq_save(flags);
cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
+ if (!arch_spin_trylock(&die_lock)) {
if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
- __raw_spin_lock(&die_lock);
+ arch_spin_lock(&die_lock);
}
die_nest_count++;
die_owner = cpu;
@@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
die_nest_count--;
if (!die_nest_count)
/* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
+ arch_spin_unlock(&die_lock);
raw_local_irq_restore(flags);
oops_exit();
@@ -268,11 +295,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
show_registers(regs);
#ifdef CONFIG_X86_32
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
+ if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
+ } else {
+ sp = kernel_stack_pointer(regs);
+ savesegment(ss, ss);
}
printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..29e5f7c845b2 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,12 +14,6 @@
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
-extern unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -35,4 +29,19 @@ struct stack_frame {
struct stack_frame *next_frame;
unsigned long return_address;
};
+
+static inline unsigned long rewind_frame_pointer(int n)
+{
+ struct stack_frame *frame;
+
+ get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+ while (n--)
+ frame = frame->next_frame;
#endif
+
+ return (unsigned long)frame;
+}
+
+#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,19 +10,14 @@
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
+#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>
-#include <linux/sysfs.h>
#include <asm/stacktrace.h>
#include "dumpstack.h"
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
-{
- return 0;
-}
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
@@ -35,6 +30,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!stack) {
unsigned long dummy;
+
stack = &dummy;
if (task && task != current)
stack = (unsigned long *)task->thread.sp;
@@ -57,8 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops,
- data, NULL, &graph);
+ bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
stack = (unsigned long *)context->previous_esp;
if (!stack)
@@ -72,7 +67,7 @@ EXPORT_SYMBOL(dump_trace);
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
@@ -156,4 +151,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..272c9f1f05f3 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,34 +10,31 @@
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
+#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>
-#include <linux/sysfs.h>
#include <asm/stacktrace.h>
#include "dumpstack.h"
+#define N_EXCEPTION_STACKS_END \
+ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
static char x86_stack_ids[][8] = {
- [DEBUG_STACK - 1] = "#DB",
- [NMI_STACK - 1] = "NMI",
- [DOUBLEFAULT_STACK - 1] = "#DF",
- [STACKFAULT_STACK - 1] = "#SS",
- [MCE_STACK - 1] = "#MC",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ STACKFAULT_STACK-1 ] = "#SS",
+ [ MCE_STACK-1 ] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+ [ N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS_END ] = "#DB[?]"
#endif
- };
-
-int x86_is_stack_id(int id, char *name)
-{
- return x86_stack_ids[id - 1] == name;
-}
+};
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
+ unsigned *usedp, char **idp)
{
unsigned k;
@@ -101,6 +98,41 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
return NULL;
}
+static inline int
+in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
+ unsigned long *irq_stack_end)
+{
+ return (stack >= irq_stack && stack < irq_stack_end);
+}
+
+/*
+ * We are returning from the irq stack and go to the previous one.
+ * If the previous stack is also in the irq stack, then bp in the first
+ * frame of the irq stack points to the previous, interrupted one.
+ * Otherwise we have another level of indirection: We first save
+ * the bp of the previous stack, then we switch the stack to the irq one
+ * and save a new bp that links to the previous one.
+ * (See save_args())
+ */
+static inline unsigned long
+fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
+ unsigned long *irq_stack, unsigned long *irq_stack_end)
+{
+#ifdef CONFIG_FRAME_POINTER
+ struct stack_frame *frame = (struct stack_frame *)bp;
+ unsigned long next;
+
+ if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
+ if (!probe_kernel_address(&frame->next_frame, next))
+ return next;
+ else
+ WARN_ONCE(1, "Perf: bad frame pointer = %p in "
+ "callchain\n", &frame->next_frame);
+ }
+#endif
+ return bp;
+}
+
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -157,8 +189,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (ops->stack(data, id) < 0)
break;
- bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end, &graph);
+ bp = ops->walk_stack(tinfo, stack, bp, ops,
+ data, estack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
@@ -173,10 +205,10 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
irq_stack = irq_stack_end -
(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
- if (stack >= irq_stack && stack < irq_stack_end) {
+ if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
if (ops->stack(data, "IRQ") < 0)
break;
- bp = print_context_stack(tinfo, stack, bp,
+ bp = ops->walk_stack(tinfo, stack, bp,
ops, data, irq_stack_end, &graph);
/*
* We link to the next stack (which would be
@@ -184,6 +216,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* pointer (index -1 to end) in the IRQ stack:
*/
stack = (unsigned long *) (irq_stack_end[-1]);
+ bp = fixup_bp_irq_link(bp, stack, irq_stack,
+ irq_stack_end);
irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
@@ -195,28 +229,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
/*
* This handles the process stack:
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
+ bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
put_cpu();
}
EXPORT_SYMBOL(dump_trace);
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
+ unsigned long *irq_stack_end;
+ unsigned long *irq_stack;
unsigned long *stack;
+ int cpu;
int i;
- const int cpu = smp_processor_id();
- unsigned long *irq_stack_end =
- (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- unsigned long *irq_stack =
- (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+ irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
/*
- * debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu.
+ * Debugging aid: "show_stack(NULL, NULL);" prints the
+ * back trace for this cpu:
*/
-
if (sp == NULL) {
if (task)
sp = (unsigned long *)task->thread.sp;
@@ -240,6 +277,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
+ preempt_enable();
+
printk("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -253,6 +292,7 @@ void show_registers(struct pt_regs *regs)
sp = regs->sp;
printk("CPU %d ", cpu);
+ print_modules();
__show_regs(regs, 1);
printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
cur->comm, cur->pid, task_thread_info(cur), cur);
@@ -303,4 +343,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 85419bb7d4ab..740b440fbd73 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/trampoline.h>
/*
* The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,11 +509,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
int checktype)
{
int i;
+ u64 end;
u64 real_removed_size = 0;
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
+ end = start + size;
+ printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
+ (unsigned long long) start,
+ (unsigned long long) end);
+ e820_print_type(old_type);
+ printk(KERN_CONT "\n");
+
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
u64 final_start, final_end;
@@ -722,310 +722,44 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- u64 start, end;
- char name[16];
- char overlap_ok;
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
- {}
-};
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
- int j;
-
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
+ * Find a free area with specified alignment in a specific range.
*/
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
{
int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[16];
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
+ if (ei->type != E820_RAM)
continue;
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
-
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
-
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
-
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
-
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name?name:"", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
- if (start >= end)
- return;
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
- int i;
-
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
-
- drop_range(i);
-}
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
-
- count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
- count++;
-
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
- continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
+ if (addr != -1ULL)
+ return addr;
}
+ return -1ULL;
}
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- return changed;
+ return find_e820_area(start, end, size, align);
}
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+u64 __init get_max_mapped(void)
{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
+ u64 end = max_pfn_mapped;
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ end <<= PAGE_SHIFT;
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1ULL;
+ return end;
}
-
/*
* Find next free range after *start
*/
@@ -1035,25 +769,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
+ ei_start = ei->addr;
+ addr = find_early_area_size(ei_start, ei_last, start,
+ sizep, align);
+
+ if (addr != -1ULL)
+ return addr;
}
return -1ULL;
@@ -1378,8 +1106,8 @@ static unsigned long ram_alignment(resource_size_t pos)
if (mb < 16)
return 1024*1024;
- /* To 32MB for anything above that */
- return 32*1024*1024;
+ /* To 64MB for anything above that */
+ return 64*1024*1024;
}
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
@@ -1412,6 +1140,8 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
+ printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 41fd965c80c6..b9c830c12b4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -206,8 +206,11 @@ static int __init setup_early_printk(char *buf)
while (*buf != '\0') {
if (!strncmp(buf, "serial", 6)) {
- early_serial_init(buf + 6);
+ buf += 6;
+ early_serial_init(buf);
early_console_register(&early_serial_console, keep);
+ if (!strncmp(buf, ",ttyS", 5))
+ buf += 5;
}
if (!strncmp(buf, "ttyS", 4)) {
early_serial_init(buf + 4);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index ad5bd988fb79..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
early_iounmap(tmp, 2);
- printk(KERN_INFO "EFI v%u.%.02u by %s \n",
+ printk(KERN_INFO "EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
@@ -454,8 +454,10 @@ void __init efi_init(void)
if (add_efi_memmap)
do_add_efi_memmap();
+#ifdef CONFIG_X86_32
x86_platform.get_wallclock = efi_get_time;
x86_platform.set_wallclock = efi_set_rtc_mmss;
+#endif
/* Setup for EFI runtime service */
reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
END(ret_from_fork)
/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+ .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
jmp resume_userspace
END(syscall_badsys)
CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+ .popsection
/*
* System calls that need a pt_regs pointer.
*/
-#define PTREGSCALL(name) \
+#define PTREGSCALL0(name) \
ALIGN; \
ptregs_##name: \
leal 4(%esp),%eax; \
jmp sys_##name;
-PTREGSCALL(iopl)
-PTREGSCALL(fork)
-PTREGSCALL(clone)
-PTREGSCALL(vfork)
-PTREGSCALL(execve)
-PTREGSCALL(sigaltstack)
-PTREGSCALL(sigreturn)
-PTREGSCALL(rt_sigreturn)
-PTREGSCALL(vm86)
-PTREGSCALL(vm86old)
+#define PTREGSCALL1(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%edx; \
+ movl (PT_EBX+4)(%esp),%eax; \
+ jmp sys_##name;
+
+#define PTREGSCALL2(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%ecx; \
+ movl (PT_ECX+4)(%esp),%edx; \
+ movl (PT_EBX+4)(%esp),%eax; \
+ jmp sys_##name;
+
+#define PTREGSCALL3(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ pushl %eax; \
+ movl PT_EDX(%eax),%ecx; \
+ movl PT_ECX(%eax),%edx; \
+ movl PT_EBX(%eax),%eax; \
+ call sys_##name; \
+ addl $4,%esp; \
+ ret
+
+PTREGSCALL1(iopl)
+PTREGSCALL0(fork)
+PTREGSCALL0(vfork)
+PTREGSCALL3(execve)
+PTREGSCALL2(sigaltstack)
+PTREGSCALL0(sigreturn)
+PTREGSCALL0(rt_sigreturn)
+PTREGSCALL2(vm86)
+PTREGSCALL1(vm86old)
+
+/* Clone is an oddball. The 4th arg is in %edi */
+ ALIGN;
+ptregs_clone:
+ leal 4(%esp),%eax
+ pushl %eax
+ pushl PT_EDI(%eax)
+ movl PT_EDX(%eax),%ecx
+ movl PT_ECX(%eax),%edx
+ movl PT_EBX(%eax),%eax
+ call sys_clone
+ addl $8,%esp
+ ret
.macro FIXUP_ESPFIX_STACK
/*
@@ -814,6 +869,10 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
+/*
+ * Irq entries should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+ .popsection
ENTRY(kernel_thread_helper)
pushl $0 # fake return address for unwinder
CFI_STARTPROC
- movl %edx,%eax
- push %edx
- CFI_ADJUST_CFA_OFFSET 4
- call *%ebx
- push %eax
- CFI_ADJUST_CFA_OFFSET 4
+ movl %edi,%eax
+ call *%esi
call do_exit
ud2 # padding for call trace
CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
- pushl $0
pushl %eax
- pushl %ecx
pushl %edx
movl %ebp, %eax
call ftrace_return_to_handler
- movl %eax, 0xc(%esp)
+ movl %eax, %ecx
popl %edx
- popl %ecx
popl %eax
- ret
+ jmp *%ecx
#endif
.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..0697ff139837 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
call ftrace_return_to_handler
- movq %rax, 16(%rsp)
+ movq %rax, %rdi
movq 8(%rsp), %rdx
movq (%rsp), %rax
- addq $16, %rsp
- retq
+ addq $24, %rsp
+ jmp *%rdi
#endif
@@ -803,6 +803,10 @@ END(interrupt)
call \func
.endm
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
CFI_ENDPROC
END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+ .popsection
/*
* APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt GENERIC_INTERRUPT_VECTOR \
- generic_interrupt smp_generic_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR \
+ x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1068,10 +1076,10 @@ ENTRY(\sym)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %rbp)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ PER_CPU(init_tss, %r12)
+ subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
+ addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
@@ -1158,63 +1166,20 @@ bad_gs:
jmp 2b
.previous
-/*
- * Create a kernel thread.
- *
- * C extern interface:
- * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
- *
- * asm input arguments:
- * rdi: fn, rsi: arg, rdx: flags
- */
-ENTRY(kernel_thread)
- CFI_STARTPROC
- FAKE_STACK_FRAME $child_rip
- SAVE_ALL
-
- # rdi: flags, rsi: usp, rdx: will be &pt_regs
- movq %rdx,%rdi
- orq kernel_thread_flags(%rip),%rdi
- movq $-1, %rsi
- movq %rsp, %rdx
-
- xorl %r8d,%r8d
- xorl %r9d,%r9d
-
- # clone now
- call do_fork
- movq %rax,RAX(%rsp)
- xorl %edi,%edi
-
- /*
- * It isn't worth to check for reschedule here,
- * so internally to the x86_64 port you can rely on kernel_thread()
- * not to reschedule the child before returning, this avoids the need
- * of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
- */
- RESTORE_ALL
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_thread)
-
-ENTRY(child_rip)
+ENTRY(kernel_thread_helper)
pushq $0 # fake return address
CFI_STARTPROC
/*
* Here we are in the child and the registers are set as they were
* at kernel_thread() invocation in the parent.
*/
- movq %rdi, %rax
- movq %rsi, %rdi
- call *%rax
+ call *%rsi
# exit
mov %eax, %edi
call do_exit
ud2 # padding for call trace
CFI_ENDPROC
-END(child_rip)
+END(kernel_thread_helper)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1491,12 +1456,17 @@ error_kernelspace:
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
+ movl %ecx,%eax /* zero extend */
+ cmpq %rax,RIP+8(%rsp)
+ je bstep_iret
cmpq $gs_change,RIP+8(%rsp)
je error_swapgs
jmp error_sti
+
+bstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx,RIP+8(%rsp)
+ jmp error_swapgs
END(error_entry)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
* the dangers of modifying code on the run.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
@@ -28,14 +30,32 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * modifying_code is set to notify NMIs that they need to use
+ * memory barriers when entering or exiting. But we don't want
+ * to burden NMIs with unnecessary memory barriers when code
+ * modification is not being done (which is most of the time).
+ *
+ * A mutex is already held when ftrace_arch_code_modify_prepare
+ * and post_process are called. No locks need to be taken here.
+ *
+ * Stop machine will make sure currently running NMIs are done
+ * and new NMIs will see the updated variable before we need
+ * to worry about NMIs doing memory barriers.
+ */
+static int modifying_code __read_mostly;
+static DEFINE_PER_CPU(int, save_modifying_code);
+
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
+ modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
+ modifying_code = 0;
set_kernel_text_ro();
return 0;
}
@@ -147,6 +167,11 @@ static void ftrace_mod_code(void)
void ftrace_nmi_enter(void)
{
+ __get_cpu_var(save_modifying_code) = modifying_code;
+
+ if (!__get_cpu_var(save_modifying_code))
+ return;
+
if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
smp_rmb();
ftrace_mod_code();
@@ -158,6 +183,9 @@ void ftrace_nmi_enter(void)
void ftrace_nmi_exit(void)
{
+ if (!__get_cpu_var(save_modifying_code))
+ return;
+
/* Finish all executions before clearing nmi_running */
smp_mb();
atomic_dec(&nmi_running);
@@ -187,9 +215,26 @@ static void wait_for_nmi(void)
nmi_wait_count++;
}
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
static int
do_ftrace_mod_code(unsigned long ip, void *new_code)
{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only with
+ * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+ * of the kernel text mapping to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa(ip));
+
mod_code_ip = (void *)ip;
mod_code_newcode = new_code;
@@ -336,15 +381,15 @@ int __init ftrace_dyn_arch_init(void *data)
switch (faulted) {
case 0:
- pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+ pr_info("converting mcount calls to 0f 1f 44 00 00\n");
memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
break;
case 1:
- pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+ pr_info("converting mcount calls to 66 66 66 66 90\n");
memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
break;
case 2:
- pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+ pr_info("converting mcount calls to jmp . + 5\n");
memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
break;
}
@@ -465,85 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
}
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#ifdef CONFIG_FTRACE_SYSCALLS
-
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
-extern unsigned long *sys_call_table;
-
-static struct syscall_metadata **syscalls_metadata;
-
-static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
-{
- struct syscall_metadata *start;
- struct syscall_metadata *stop;
- char str[KSYM_SYMBOL_LEN];
-
-
- start = (struct syscall_metadata *)__start_syscalls_metadata;
- stop = (struct syscall_metadata *)__stop_syscalls_metadata;
- kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
-
- for ( ; start < stop; start++) {
- if (start->name && !strcmp(start->name, str))
- return start;
- }
- return NULL;
-}
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
- if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
- return NULL;
-
- return syscalls_metadata[nr];
-}
-
-int syscall_name_to_nr(char *name)
-{
- int i;
-
- if (!syscalls_metadata)
- return -1;
-
- for (i = 0; i < NR_syscalls; i++) {
- if (syscalls_metadata[i]) {
- if (!strcmp(syscalls_metadata[i]->name, name))
- return i;
- }
- }
- return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
- syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
- syscalls_metadata[num]->exit_id = id;
-}
-
-static int __init arch_init_ftrace_syscalls(void)
-{
- int i;
- struct syscall_metadata *meta;
- unsigned long **psys_syscall_table = &sys_call_table;
-
- syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
- NR_syscalls, GFP_KERNEL);
- if (!syscalls_metadata) {
- WARN_ON(1);
- return -ENOMEM;
- }
-
- for (i = 0; i < NR_syscalls; i++) {
- meta = find_syscall_meta(psys_syscall_table[i]);
- syscalls_metadata[i] = meta;
- }
- return 0;
-}
-arch_initcall(arch_init_ftrace_syscalls);
-#endif
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * AMD Geode southbridge support code
- * Copyright (C) 2006, Advanced Micro Devices, Inc.
- * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-#include <linux/io.h>
-#include <asm/msr.h>
-#include <asm/geode.h>
-
-static struct {
- char *name;
- u32 msr;
- int size;
- u32 base;
-} lbars[] = {
- { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
- { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
- { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
- { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
-};
-
-static void __init init_lbars(void)
-{
- u32 lo, hi;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(lbars); i++) {
- rdmsr(lbars[i].msr, lo, hi);
- if (hi & 0x01)
- lbars[i].base = lo & 0x0000ffff;
-
- if (lbars[i].base == 0)
- printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
- lbars[i].name);
- }
-}
-
-int geode_get_dev_base(unsigned int dev)
-{
- BUG_ON(dev >= ARRAY_SIZE(lbars));
- return lbars[dev].base;
-}
-EXPORT_SYMBOL_GPL(geode_get_dev_base);
-
-/* === GPIO API === */
-
-void geode_gpio_set(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
- if (!base)
- return;
-
- /* low bank register */
- if (gpio & 0xFFFF)
- outl(gpio & 0xFFFF, base + reg);
- /* high bank register */
- gpio >>= 16;
- if (gpio)
- outl(gpio, base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set);
-
-void geode_gpio_clear(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
-
- if (!base)
- return;
-
- /* low bank register */
- if (gpio & 0xFFFF)
- outl((gpio & 0xFFFF) << 16, base + reg);
- /* high bank register */
- gpio &= (0xFFFF << 16);
- if (gpio)
- outl(gpio, base + 0x80 + reg);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_clear);
-
-int geode_gpio_isset(u32 gpio, unsigned int reg)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
- u32 val;
-
- if (!base)
- return 0;
-
- /* low bank register */
- if (gpio & 0xFFFF) {
- val = inl(base + reg) & (gpio & 0xFFFF);
- if ((gpio & 0xFFFF) == val)
- return 1;
- }
- /* high bank register */
- gpio >>= 16;
- if (gpio) {
- val = inl(base + 0x80 + reg) & gpio;
- if (gpio == val)
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(geode_gpio_isset);
-
-void geode_gpio_set_irq(unsigned int group, unsigned int irq)
-{
- u32 lo, hi;
-
- if (group > 7 || irq > 15)
- return;
-
- rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-
- lo &= ~(0xF << (group * 4));
- lo |= (irq & 0xF) << (group * 4);
-
- wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
-
-void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
-{
- u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
- u32 offset, shift, val;
-
- if (gpio >= 24)
- offset = GPIO_MAP_W;
- else if (gpio >= 16)
- offset = GPIO_MAP_Z;
- else if (gpio >= 8)
- offset = GPIO_MAP_Y;
- else
- offset = GPIO_MAP_X;
-
- shift = (gpio % 8) * 4;
-
- val = inl(base + offset);
-
- /* Clear whatever was there before */
- val &= ~(0xF << shift);
-
- /* And set the new value */
-
- val |= ((pair & 7) << shift);
-
- /* Set the PME bit if this is a PME event */
-
- if (pme)
- val |= (1 << (shift + 3));
-
- outl(val, base + offset);
-}
-EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
-
-int geode_has_vsa2(void)
-{
- static int has_vsa2 = -1;
-
- if (has_vsa2 == -1) {
- u16 val;
-
- /*
- * The VSA has virtual registers that we can query for a
- * signature.
- */
- outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
- outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
-
- val = inw(VSA_VRC_DATA);
- has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
- }
-
- return has_vsa2;
-}
-EXPORT_SYMBOL_GPL(geode_has_vsa2);
-
-static int __init geode_southbridge_init(void)
-{
- if (!is_geode())
- return -ENODEV;
-
- init_lbars();
- (void) mfgpt_timer_setup();
- return 0;
-}
-
-postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..adedeef1dedc 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,7 +29,15 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
- reserve_trampoline_memory();
+#ifdef CONFIG_X86_TRAMPOLINE
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
+ "EX TRAMPOLINE");
+#endif
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..b5a9896ca1e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
- reserve_trampoline_memory();
-
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeature.h>
#include <asm/percpu.h>
/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
orl %edx,%eax
movl %eax,%cr4
- btl $5, %eax # check if PAE is enabled
- jnc 6f
+ testb $X86_CR4_PAE, %al # check if PAE is enabled
+ jz 6f
/* Check if extended functions are implemented */
movl $0x80000000, %eax
cpuid
- cmpl $0x80000000, %eax
- jbe 6f
+ /* Value must be in the range 0x80000001 to 0x8000ffff */
+ subl $0x80000001, %eax
+ cmpl $(0x8000ffff-0x80000001), %eax
+ ja 6f
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
- btl $20, %edx
+ btl $(X86_FEATURE_NX & 31), %edx
jnc 6f
/* Setup EFER (Extended Feature Enable Register) */
- movl $0xc0000080, %ecx
+ movl $MSR_EFER, %ecx
rdmsr
- btsl $11, %eax
+ btsl $_EFER_NX, %eax
/* Make changes effective */
wrmsr
@@ -438,8 +442,8 @@ is386: movl $2,%ecx # set MP
*/
cmpb $0,ready
jne 1f
- movl $per_cpu__gdt_page,%eax
- movl $per_cpu__stack_canary,%ecx
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
shrl $16, %ecx
movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -702,7 +706,7 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
+ .long gdt_page /* Overwritten for secondary CPUs */
/*
* The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
#define GET_CR2_INTO_RCX movq %cr2, %rcx
#endif
-/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*
*/
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
*/
lgdt early_gdt_descr(%rip)
- /* set up data segments. actually 0 would do too */
- movl $__KERNEL_DS,%eax
+ /* set up data segments */
+ xorl %eax,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
.quad x86_64_start_kernel
ENTRY(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
- __FINITDATA
ENTRY(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
+ __FINITDATA
bad_address:
jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
i = i + 1 ; \
.endr
+ .data
/*
* This default setting generates an ident mapping at address 0x100000
* and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ee4fa1bfcb33 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,9 @@
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
+u8 hpet_blockid; /* OS timer block num */
+u8 hpet_msi_disable;
+
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
#endif
@@ -47,12 +50,12 @@ struct hpet_dev {
char name[10];
};
-unsigned long hpet_readl(unsigned long a)
+inline unsigned int hpet_readl(unsigned int a)
{
return readl(hpet_virt_address + a);
}
-static inline void hpet_writel(unsigned long d, unsigned long a)
+static inline void hpet_writel(unsigned int d, unsigned int a)
{
writel(d, hpet_virt_address + a);
}
@@ -167,7 +170,7 @@ do { \
static void hpet_reserve_msi_timers(struct hpet_data *hd);
-static void hpet_reserve_platform_timers(unsigned long id)
+static void hpet_reserve_platform_timers(unsigned int id)
{
struct hpet __iomem *hpet = hpet_virt_address;
struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +208,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
}
#else
-static void hpet_reserve_platform_timers(unsigned long id) { }
+static void hpet_reserve_platform_timers(unsigned int id) { }
#endif
/*
@@ -246,7 +249,7 @@ static void hpet_reset_counter(void)
static void hpet_start_counter(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG);
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -263,7 +266,7 @@ static void hpet_resume_device(void)
force_hpet_resume();
}
-static void hpet_resume_counter(void)
+static void hpet_resume_counter(struct clocksource *cs)
{
hpet_resume_device();
hpet_restart_counter();
@@ -271,7 +274,7 @@ static void hpet_resume_counter(void)
static void hpet_enable_legacy_int(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG);
cfg |= HPET_CFG_LEGACY;
hpet_writel(cfg, HPET_CFG);
@@ -314,7 +317,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
static void hpet_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt, int timer)
{
- unsigned long cfg, cmp, now;
+ unsigned int cfg, cmp, now;
uint64_t delta;
switch (mode) {
@@ -323,7 +326,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
delta >>= evt->shift;
now = hpet_readl(HPET_COUNTER);
- cmp = now + (unsigned long) delta;
+ cmp = now + (unsigned int) delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
/* Make sure we use edge triggered interrupts */
cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +342,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
* (See AMD-8111 HyperTransport I/O Hub Data Sheet,
* Publication # 24674)
*/
- hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
+ hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
hpet_start_counter();
hpet_print_config();
break;
@@ -383,13 +386,24 @@ static int hpet_next_event(unsigned long delta,
hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
- * We need to read back the CMP register to make sure that
- * what we wrote hit the chip before we compare it to the
- * counter.
+ * We need to read back the CMP register on certain HPET
+ * implementations (ATI chipsets) which seem to delay the
+ * transfer of the compare register into the internal compare
+ * logic. With small deltas this might actually be too late as
+ * the counter could already be higher than the compare value
+ * at that point and we would wait for the next hpet interrupt
+ * forever. We found out that reading the CMP register back
+ * forces the transfer so we can rely on the comparison with
+ * the counter register below. If the read back from the
+ * compare register does not match the value we programmed
+ * then we might have a real hardware problem. We can not do
+ * much about it here, but at least alert the user/admin with
+ * a prominent warning.
*/
- WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt);
+ WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
+ KERN_WARNING "hpet: compare register read back failed.\n");
- return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+ return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +429,7 @@ static struct hpet_dev *hpet_devs;
void hpet_msi_unmask(unsigned int irq)
{
struct hpet_dev *hdev = get_irq_data(irq);
- unsigned long cfg;
+ unsigned int cfg;
/* unmask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +439,7 @@ void hpet_msi_unmask(unsigned int irq)
void hpet_msi_mask(unsigned int irq)
{
- unsigned long cfg;
+ unsigned int cfg;
struct hpet_dev *hdev = get_irq_data(irq);
/* mask it */
@@ -467,7 +481,7 @@ static int hpet_msi_next_event(unsigned long delta,
static int hpet_setup_msi_irq(unsigned int irq)
{
- if (arch_setup_hpet_msi(irq)) {
+ if (arch_setup_hpet_msi(irq, hpet_blockid)) {
destroy_irq(irq);
return -EINVAL;
}
@@ -584,6 +598,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
unsigned int num_timers_used = 0;
int i;
+ if (hpet_msi_disable)
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_ARAT))
+ return;
id = hpet_readl(HPET_ID);
num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +617,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
struct hpet_dev *hdev = &hpet_devs[num_timers_used];
- unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+ unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
/* Only consider HPET timer with MSI support */
if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +832,7 @@ static int hpet_clocksource_register(void)
*/
int __init hpet_enable(void)
{
- unsigned long id;
+ unsigned int id;
int i;
if (!is_hpet_capable())
@@ -872,10 +891,8 @@ int __init hpet_enable(void)
if (id & HPET_ID_LEGSUP) {
hpet_legacy_clockevent_register();
- hpet_msi_capability_lookup(2);
return 1;
}
- hpet_msi_capability_lookup(0);
return 0;
out_nohpet:
@@ -908,9 +925,20 @@ static __init int hpet_late_init(void)
if (!hpet_virt_address)
return -ENODEV;
+ if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
+ hpet_msi_capability_lookup(2);
+ else
+ hpet_msi_capability_lookup(0);
+
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
hpet_print_config();
+ if (hpet_msi_disable)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARAT))
+ return 0;
+
for_each_online_cpu(cpu) {
hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
}
@@ -925,7 +953,7 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
if (is_hpet_capable()) {
- unsigned long cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG);
if (hpet_legacy_int_enabled) {
cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +993,8 @@ static int hpet_prev_update_sec;
static struct rtc_time hpet_alarm_time;
static unsigned long hpet_pie_count;
static u32 hpet_t1_cmp;
-static unsigned long hpet_default_delta;
-static unsigned long hpet_pie_delta;
+static u32 hpet_default_delta;
+static u32 hpet_pie_delta;
static unsigned long hpet_pie_limit;
static rtc_irq_handler irq_handler;
@@ -1017,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
*/
int hpet_rtc_timer_init(void)
{
- unsigned long cfg, cnt, delta, flags;
+ unsigned int cfg, cnt, delta;
+ unsigned long flags;
if (!is_hpet_enabled())
return 0;
@@ -1027,7 +1056,7 @@ int hpet_rtc_timer_init(void)
clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
- hpet_default_delta = (unsigned long) clc;
+ hpet_default_delta = clc;
}
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1142,7 @@ int hpet_set_periodic_freq(unsigned long freq)
clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
do_div(clc, freq);
clc >>= hpet_clockevent.shift;
- hpet_pie_delta = (unsigned long) clc;
+ hpet_pie_delta = clc;
}
return 1;
}
@@ -1127,7 +1156,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
static void hpet_rtc_timer_reinit(void)
{
- unsigned long cfg, delta;
+ unsigned int cfg, delta;
int lost_ints = -1;
if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d6cc065f519f
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,530 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ unsigned long bp_info;
+
+ bp_info = (len | type) & 0xf;
+ bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+ bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+ return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7. Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+ int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+ *len = (bp_info & 0xc) | 0x40;
+ *type = (bp_info & 0x3) | 0x80;
+
+ return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+ if (!*slot) {
+ *slot = bp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return -EBUSY;
+
+ set_debugreg(info->address, i);
+ __get_cpu_var(cpu_debugreg[i]) = info->address;
+
+ dr7 = &__get_cpu_var(cpu_dr7);
+ *dr7 |= encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+
+ return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+ if (*slot == bp) {
+ *slot = NULL;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return;
+
+ dr7 = &__get_cpu_var(cpu_dr7);
+ *dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+ unsigned int len_in_bytes = 0;
+
+ switch (hbp_len) {
+ case X86_BREAKPOINT_LEN_1:
+ len_in_bytes = 1;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ len_in_bytes = 2;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ len_in_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ len_in_bytes = 8;
+ break;
+#endif
+ }
+ return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+ unsigned int len;
+
+ len = get_hbp_len(hbp_len);
+
+ return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type)
+{
+ /* Len */
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ *gen_len = HW_BREAKPOINT_LEN_1;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ *gen_len = HW_BREAKPOINT_LEN_2;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ *gen_len = HW_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ *gen_len = HW_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ *gen_type = HW_BREAKPOINT_X;
+ break;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+ info->address = bp->attr.bp_addr;
+
+ /* Len */
+ switch (bp->attr.bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ info->len = X86_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ info->len = X86_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ info->len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ info->len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Type */
+ switch (bp->attr.bp_type) {
+ case HW_BREAKPOINT_W:
+ info->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ info->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ info->type = X86_BREAKPOINT_EXECUTE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+ struct task_struct *tsk)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned int align;
+ int ret;
+
+
+ ret = arch_build_bp_info(bp);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+
+ if (info->type == X86_BREAKPOINT_EXECUTE)
+ /*
+ * Ptrace-refactoring code
+ * For now, we'll allow instruction breakpoint only for user-space
+ * addresses
+ */
+ if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+ info->len != X86_BREAKPOINT_EXECUTE)
+ return ret;
+
+ switch (info->len) {
+ case X86_BREAKPOINT_LEN_1:
+ align = 0;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ align = 1;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ align = 3;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ align = 7;
+ break;
+#endif
+ default:
+ return ret;
+ }
+
+ /*
+ * Check that the low-order bits of the address are appropriate
+ * for the alignment implied by len.
+ */
+ if (info->address & align)
+ return -EINVAL;
+
+ /* Check that the virtual address is in the proper range */
+ if (tsk) {
+ if (!arch_check_va_in_userspace(info->address, info->len))
+ return -EFAULT;
+ } else {
+ if (!arch_check_va_in_kernelspace(info->address, info->len))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+ int i;
+ int dr7 = 0;
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ struct thread_struct *thread = &current->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ bp = thread->ptrace_bps[i];
+
+ if (bp && !bp->attr.disabled) {
+ dump->u_debugreg[i] = bp->attr.bp_addr;
+ info = counter_arch_bp(bp);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ } else {
+ dump->u_debugreg[i] = 0;
+ }
+ }
+
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+
+ dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ unregister_hw_breakpoint(t->ptrace_bps[i]);
+ t->ptrace_bps[i] = NULL;
+ }
+}
+
+void hw_breakpoint_restore(void)
+{
+ set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+ set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+ set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+ set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ /*
+ * The counter may be concurrently released but that can only
+ * occur from a call_rcu() path. We can then safely fetch
+ * the breakpoint, use its callback, touch its counter
+ * while we are in an rcu_read_lock() path.
+ */
+ rcu_read_lock();
+
+ bp = per_cpu(bp_per_reg[i], cpu);
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to concurrent perf counter removing.
+ */
+ if (!bp) {
+ rcu_read_unlock();
+ break;
+ }
+
+ perf_bp_event(bp, args->regs);
+
+ rcu_read_unlock();
+ }
+ /*
+ * Further processing in do_debug() is needed for a) user-space
+ * breakpoints (to generate signals) and b) when the system has
+ * taken exception due to multiple causes
+ */
+ if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
+ (dr6 & (~DR_TRAP_BITS)))
+ rc = NOTIFY_DONE;
+
+ set_debugreg(dr7, 7);
+ put_cpu();
+
+ return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+ struct notifier_block *unused, unsigned long val, void *data)
+{
+ if (val != DIE_DEBUG)
+ return NOTIFY_DONE;
+
+ return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+ /* TODO */
+}
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 43cec6bdda63..9c3bd4a2050e 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -10,6 +10,16 @@
EXPORT_SYMBOL(mcount);
#endif
+/*
+ * Note, this is a prototype to get at the symbol for
+ * the export, but dont use it from C code, it is used
+ * by assembly code and is not using C calling convention!
+ */
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+EXPORT_SYMBOL(cmpxchg8b_emu);
+#endif
+
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..c01a2b846d47 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk)
return 0;
}
+/*
+ * The xstateregs_active() routine is the same as the fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilites supported by the xsave.
+ */
int fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- set_stopped_child_used_math(target);
-
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.xstate->fxsave, 0, -1);
@@ -224,6 +227,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
return ret;
}
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ int ret;
+
+ if (!cpu_has_xsave)
+ return -ENODEV;
+
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
+ /*
+ * Copy the 48bytes defined by the software first into the xstate
+ * memory layout in the thread struct, so that we can copy the entire
+ * xstateregs to the user using one user_regset_copyout().
+ */
+ memcpy(&target->thread.xstate->fxsave.sw_reserved,
+ xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+
+ /*
+ * Copy the xstate memory layout.
+ */
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &target->thread.xstate->xsave, 0, -1);
+ return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int ret;
+ struct xsave_hdr_struct *xsave_hdr;
+
+ if (!cpu_has_xsave)
+ return -ENODEV;
+
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.xstate->xsave, 0, -1);
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+
+ xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+
+ xsave_hdr->xstate_bv &= pcntxt_mask;
+ /*
+ * These bits must be zero.
+ */
+ xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+ return ret;
+}
+
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
/*
@@ -404,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- set_stopped_child_used_math(target);
-
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..fb725ee15f55 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,8 +32,14 @@
*/
static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
+static void mask_8259A(void);
+static void unmask_8259A(void);
+static void disable_8259A_irq(unsigned int irq);
+static void enable_8259A_irq(unsigned int irq);
+static void init_8259A(int auto_eoi);
+static int i8259A_irq_pending(unsigned int irq);
struct irq_chip i8259A_chip = {
.name = "XT-PIC",
@@ -63,51 +69,51 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-void disable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void enable_8259A_irq(unsigned int irq)
+static void enable_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask &= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-int i8259A_irq_pending(unsigned int irq)
+static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
unsigned long flags;
int ret;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
if (irq < 8)
ret = inb(PIC_MASTER_CMD) & mask;
else
ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return ret;
}
-void make_8259A_irq(unsigned int irq)
+static void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
@@ -150,7 +156,7 @@ static void mask_and_ack_8259A(unsigned int irq)
unsigned int irqmask = 1 << irq;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +189,7 @@ handle_real_irq:
outb(cached_master_mask, PIC_MASTER_IMR);
outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
@@ -281,37 +287,37 @@ static int __init i8259A_init_sysfs(void)
device_initcall(i8259A_init_sysfs);
-void mask_8259A(void)
+static void mask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void unmask_8259A(void)
+static void unmask_8259A(void)
{
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-void init_8259A(int auto_eoi)
+static void init_8259A(int auto_eoi)
{
unsigned long flags;
i8259A_auto_eoi = auto_eoi;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +362,49 @@ void init_8259A(int auto_eoi)
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+
+/*
+ * make i8259 a driver so that we can select pic functions at run time. the goal
+ * is to make x86 binary compatible among pc compatible and non-pc compatible
+ * platforms, such as x86 MID.
+ */
+
+static void legacy_pic_noop(void) { };
+static void legacy_pic_uint_noop(unsigned int unused) { };
+static void legacy_pic_int_noop(int unused) { };
+
+static struct irq_chip dummy_pic_chip = {
+ .name = "dummy pic",
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
+ .disable = legacy_pic_uint_noop,
+ .mask_ack = legacy_pic_uint_noop,
+};
+static int legacy_pic_irq_pending_noop(unsigned int irq)
+{
+ return 0;
+}
+
+struct legacy_pic null_legacy_pic = {
+ .nr_legacy_irqs = 0,
+ .chip = &dummy_pic_chip,
+ .mask_all = legacy_pic_noop,
+ .restore_mask = legacy_pic_noop,
+ .init = legacy_pic_int_noop,
+ .irq_pending = legacy_pic_irq_pending_noop,
+ .make_irq = legacy_pic_uint_noop,
+};
+
+struct legacy_pic default_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .chip = &i8259A_chip,
+ .mask_all = mask_8259A,
+ .restore_mask = unmask_8259A,
+ .init = init_8259A,
+ .irq_pending = i8259A_irq_pending,
+ .make_irq = make_8259A_irq,
+};
+
+struct legacy_pic *legacy_pic = &default_legacy_pic;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* on system-call entry - see also fork() and the signal handling
* code.
*/
-static int do_iopl(unsigned int level, struct pt_regs *regs)
+long sys_iopl(unsigned int level, struct pt_regs *regs)
{
unsigned int old = (regs->flags >> 12) & 3;
+ struct thread_struct *t = &current->thread;
if (level > 3)
return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
return -EPERM;
}
regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
-
- return 0;
-}
-
-#ifdef CONFIG_X86_32
-long sys_iopl(struct pt_regs *regs)
-{
- unsigned int level = regs->bx;
- struct thread_struct *t = &current->thread;
- int rc;
-
- rc = do_iopl(level, regs);
- if (rc < 0)
- goto out;
-
t->iopl = level << 12;
set_iopl_mask(t->iopl);
-out:
- return rc;
-}
-#else
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
-{
- return do_iopl(level, regs);
+
+ return 0;
}
-#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 74656d1d4e30..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
atomic_t irq_err_count;
/* Function pointer for generic interrupt vector handling */
-void (*generic_interrupt_extension)(void) = NULL;
+void (*x86_platform_ipi_callback)(void) = NULL;
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
@@ -63,19 +63,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "%*s: ", prec, "CNT");
+ seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
- seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, " Performance monitoring interrupts\n");
seq_printf(p, "%*s: ", prec, "PND");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
seq_printf(p, " Performance pending work\n");
#endif
- if (generic_interrupt_extension) {
+ if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+ seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
seq_printf(p, " Platform interrupts\n");
}
#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
-# endif
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (!desc)
return 0;
- spin_lock_irqsave(&desc->lock, flags);
+ raw_spin_lock_irqsave(&desc->lock, flags);
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
out:
- spin_unlock_irqrestore(&desc->lock, flags);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_pending_irqs;
#endif
- if (generic_interrupt_extension)
- sum += irq_stats(cpu)->generic_irqs;
+ if (x86_platform_ipi_callback)
+ sum += irq_stats(cpu)->x86_platform_ipis;
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
sum += irq_stats(cpu)->irq_tlb_count;
#endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
-# endif
#endif
#ifdef CONFIG_X86_MCE
sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
}
/*
- * Handler for GENERIC_INTERRUPT_VECTOR.
+ * Handler for X86_PLATFORM_IPI_VECTOR.
*/
-void smp_generic_interrupt(struct pt_regs *regs)
+void smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
irq_enter();
- inc_irq_stat(generic_irqs);
+ inc_irq_stat(x86_platform_ipis);
- if (generic_interrupt_extension)
- generic_interrupt_extension();
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
irq_exit();
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
}
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+ unsigned int irq, vector;
+ static int warned;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(irq, desc) {
+ int break_affinity = 0;
+ int set_affinity = 1;
+ const struct cpumask *affinity;
+
+ if (!desc)
+ continue;
+ if (irq == 2)
+ continue;
+
+ /* interrupt's are disabled at this point */
+ raw_spin_lock(&desc->lock);
+
+ affinity = desc->affinity;
+ if (!irq_has_action(irq) ||
+ cpumask_equal(affinity, cpu_online_mask)) {
+ raw_spin_unlock(&desc->lock);
+ continue;
+ }
+
+ /*
+ * Complete the irq move. This cpu is going down and for
+ * non intr-remapping case, we can't wait till this interrupt
+ * arrives at this cpu before completing the irq move.
+ */
+ irq_force_complete_move(irq);
+
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ break_affinity = 1;
+ affinity = cpu_all_mask;
+ }
+
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
+ desc->chip->mask(irq);
+
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, affinity);
+ else if (!(warned++))
+ set_affinity = 0;
+
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
+ desc->chip->unmask(irq);
+
+ raw_spin_unlock(&desc->lock);
+
+ if (break_affinity && set_affinity)
+ printk("Broke affinity for irq %i\n", irq);
+ else if (!set_affinity)
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /*
+ * We can remove mdelay() and then send spuriuous interrupts to
+ * new cpu targets for all the irqs that were handled previously by
+ * this cpu. While it works, I have seen spurious interrupt messages
+ * (nothing wrong but still...).
+ *
+ * So for now, retain mdelay(1) and check the IRR and then send those
+ * interrupts to new targets as this cpu is already offlined...
+ */
+ mdelay(1);
+
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irr;
+
+ if (__get_cpu_var(vector_irq)[vector] < 0)
+ continue;
+
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1 << (vector % 32))) {
+ irq = __get_cpu_var(vector_irq)[vector];
+
+ desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
+ if (desc->chip->retrigger)
+ desc->chip->retrigger(irq);
+ raw_spin_unlock(&desc->lock);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
return true;
}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
-{
- unsigned int irq;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- affinity = desc->affinity;
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- printk("Breaking affinity for irq %i\n", irq);
- affinity = cpu_all_mask;
- }
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (desc->action)
- printk_once("Cannot set affinity for irq %i\n", irq);
- }
-
-#if 0
- barrier();
- /* Ingo Molnar says: "after the IO-APIC masks have been redirected
- [note the nop - the interrupt-enable boundary on x86 is two
- instructions from sti] - to flush out pending hardirqs and
- IPIs. After this point nothing is supposed to reach this CPU." */
- __asm__ __volatile__("sti; nop; cli");
- barrier();
-#else
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-#endif
-}
-#endif
-
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
return true;
}
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
-{
- unsigned int irq;
- static int warned;
- struct irq_desc *desc;
-
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
-
- if (!desc)
- continue;
- if (irq == 2)
- continue;
-
- /* interrupt's are disabled at this point */
- spin_lock(&desc->lock);
-
- affinity = desc->affinity;
- if (!irq_has_action(irq) ||
- cpumask_equal(affinity, cpu_online_mask)) {
- spin_unlock(&desc->lock);
- continue;
- }
-
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_all_mask;
- }
-
- if (desc->chip->mask)
- desc->chip->mask(irq);
-
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (!(warned++))
- set_affinity = 0;
-
- if (desc->chip->unmask)
- desc->chip->unmask(irq);
-
- spin_unlock(&desc->lock);
-
- if (break_affinity && set_affinity)
- printk("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- printk("Cannot set affinity for irq %i\n", irq);
- }
-
- /* That doesn't seem sufficient. Give it 1ms. */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-}
-#endif
extern void call_softirq(void);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..ef257fc2921b 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -84,24 +84,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+ [0 ... NR_VECTORS - 1] = -1,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +106,12 @@ void __init init_ISA_irqs(void)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
init_bsp_APIC();
#endif
- init_8259A(0);
+ legacy_pic->init(0);
/*
* 16 old-style INTA-cycle interrupts:
*/
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
@@ -142,6 +125,19 @@ void __init init_ISA_irqs(void)
void __init init_IRQ(void)
{
+ int i;
+
+ /*
+ * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+ * If these IRQ's are handled by legacy interrupt-controllers like PIC,
+ * then this configuration will likely be static after the boot. If
+ * these IRQ's are handled by more mordern controllers like IO-APIC,
+ * then this vector space can be freed and re-used dynamically as the
+ * irq's migrate etc.
+ */
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+
x86_init.irqs.intr_init();
}
@@ -200,8 +196,8 @@ static void __init apic_intr_init(void)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
- /* generic IPI for platform specific use */
- alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+ /* IPI for X86 platform specific use */
+ alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b2..9b895464dd03 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
}
EXPORT_SYMBOL_GPL(k8_flush_garts);
+static __init int init_k8_nbs(void)
+{
+ int err = 0;
+
+ err = cache_k8_northbridges();
+
+ if (err < 0)
+ printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+
+ return err;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..bfba6019d762 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,7 +42,9 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
+#include <asm/debugreg.h>
#include <asm/apicdef.h>
#include <asm/system.h>
@@ -85,10 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs[GDB_DS] = regs->ds;
gdb_regs[GDB_ES] = regs->es;
gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
- gdb_regs[GDB_SP] = (int)&regs->sp;
+ if (user_mode_vm(regs)) {
+ gdb_regs[GDB_SS] = regs->ss;
+ gdb_regs[GDB_SP] = regs->sp;
+ } else {
+ gdb_regs[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ }
#else
gdb_regs[GDB_R8] = regs->r8;
gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +108,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
gdb_regs32[GDB_PS] = regs->flags;
gdb_regs32[GDB_CS] = regs->cs;
gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
+ gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
#endif
}
@@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
static struct hw_breakpoint {
unsigned enabled;
- unsigned type;
- unsigned len;
unsigned long addr;
+ int len;
+ int type;
+ struct perf_event **pev;
} breakinfo[4];
static void kgdb_correct_hw_break(void)
{
- unsigned long dr7;
- int correctit = 0;
- int breakbit;
int breakno;
- get_debugreg(dr7, 7);
for (breakno = 0; breakno < 4; breakno++) {
- breakbit = 2 << (breakno << 1);
- if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 |= breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- dr7 |= ((breakinfo[breakno].len << 2) |
- breakinfo[breakno].type) <<
- ((breakno << 2) + 16);
- if (breakno >= 0 && breakno <= 3)
- set_debugreg(breakinfo[breakno].addr, breakno);
-
- } else {
- if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
- correctit = 1;
- dr7 &= ~breakbit;
- dr7 &= ~(0xf0000 << (breakno << 2));
- }
- }
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ int val;
+ int cpu = raw_smp_processor_id();
+ if (!breakinfo[breakno].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ info = counter_arch_bp(bp);
+ if (bp->attr.disabled != 1)
+ continue;
+ bp->attr.bp_addr = breakinfo[breakno].addr;
+ bp->attr.bp_len = breakinfo[breakno].len;
+ bp->attr.bp_type = breakinfo[breakno].type;
+ info->address = breakinfo[breakno].addr;
+ info->len = breakinfo[breakno].len;
+ info->type = breakinfo[breakno].type;
+ val = arch_install_hw_breakpoint(bp);
+ if (!val)
+ bp->attr.disabled = 0;
}
- if (correctit)
- set_debugreg(dr7, 7);
+ hw_breakpoint_restore();
+}
+
+static int hw_break_reserve_slot(int breakno)
+{
+ int cpu;
+ int cnt = 0;
+ struct perf_event **pevent;
+
+ for_each_online_cpu(cpu) {
+ cnt++;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_reserve_bp_slot(*pevent))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ for_each_online_cpu(cpu) {
+ cnt--;
+ if (!cnt)
+ break;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ dbg_release_bp_slot(*pevent);
+ }
+ return -1;
+}
+
+static int hw_break_release_slot(int breakno)
+{
+ struct perf_event **pevent;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_release_bp_slot(*pevent))
+ /*
+ * The debugger is responisble for handing the retry on
+ * remove failure.
+ */
+ return -1;
+ }
+ return 0;
}
static int
@@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
if (i == 4)
return -1;
+ if (hw_break_release_slot(i)) {
+ printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
+ return -1;
+ }
breakinfo[i].enabled = 0;
return 0;
@@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
static void kgdb_remove_all_hw_break(void)
{
int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
- for (i = 0; i < 4; i++)
- memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
+ for (i = 0; i < 4; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
}
static int
kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
- unsigned type;
int i;
for (i = 0; i < 4; i++)
@@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
switch (bptype) {
case BP_HARDWARE_BREAKPOINT:
- type = 0;
- len = 1;
+ len = 1;
+ breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
break;
case BP_WRITE_WATCHPOINT:
- type = 1;
+ breakinfo[i].type = X86_BREAKPOINT_WRITE;
break;
case BP_ACCESS_WATCHPOINT:
- type = 3;
+ breakinfo[i].type = X86_BREAKPOINT_RW;
break;
default:
return -1;
}
-
- if (len == 1 || len == 2 || len == 4)
- breakinfo[i].len = len - 1;
- else
+ switch (len) {
+ case 1:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_1;
+ break;
+ case 2:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_2;
+ break;
+ case 4:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case 8:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
return -1;
-
- breakinfo[i].enabled = 1;
+ }
breakinfo[i].addr = addr;
- breakinfo[i].type = type;
+ if (hw_break_reserve_slot(i)) {
+ breakinfo[i].addr = 0;
+ return -1;
+ }
+ breakinfo[i].enabled = 1;
return 0;
}
@@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
*/
void kgdb_disable_hw_debug(struct pt_regs *regs)
{
+ int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
+
/* Disable hardware debugging while we are in kgdb: */
set_debugreg(0UL, 7);
+ for (i = 0; i < 4; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
}
/**
@@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
struct pt_regs *linux_regs)
{
unsigned long addr;
- unsigned long dr6;
char *ptr;
int newPC;
@@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
/* set the trace bit if we're stepping */
if (remcomInBuffer[0] == 's') {
linux_regs->flags |= X86_EFLAGS_TF;
- kgdb_single_step = 1;
atomic_set(&kgdb_cpu_doing_single_step,
raw_smp_processor_id());
}
- get_debugreg(dr6, 6);
- if (!(dr6 & 0x4000)) {
- int breakno;
-
- for (breakno = 0; breakno < 4; breakno++) {
- if (dr6 & (1 << breakno) &&
- breakinfo[breakno].type == 0) {
- /* Set restore flag: */
- linux_regs->flags |= X86_EFLAGS_RF;
- break;
- }
- }
- }
- set_debugreg(0UL, 6);
kgdb_correct_hw_break();
return 0;
@@ -434,6 +505,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
"resuming...\n");
kgdb_arch_handle_exception(args->trapnr, args->signr,
args->err, "c", "", regs);
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
return NOTIFY_STOP;
}
@@ -476,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
break;
case DIE_DEBUG:
- if (atomic_read(&kgdb_cpu_doing_single_step) ==
- raw_smp_processor_id()) {
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
if (user_mode(regs))
return single_step_cont(regs, args);
break;
@@ -530,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
*/
int kgdb_arch_init(void)
{
- return register_die_notifier(&kgdb_notifier);
+ int i, cpu;
+ int ret;
+ struct perf_event_attr attr;
+ struct perf_event **pevent;
+
+ ret = register_die_notifier(&kgdb_notifier);
+ if (ret != 0)
+ return ret;
+ /*
+ * Pre-allocate the hw breakpoint structions in the non-atomic
+ * portion of kgdb because this operation requires mutexs to
+ * complete.
+ */
+ attr.bp_addr = (unsigned long)kgdb_arch_init;
+ attr.type = PERF_TYPE_BREAKPOINT;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+ for (i = 0; i < 4; i++) {
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+ if (IS_ERR(breakinfo[i].pev)) {
+ printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
+ breakinfo[i].pev = NULL;
+ kgdb_arch_exit();
+ return -1;
+ }
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+ pevent[0]->hw.sample_period = 1;
+ if (pevent[0]->destroy != NULL) {
+ pevent[0]->destroy = NULL;
+ release_bp_slot(*pevent);
+ }
+ }
+ }
+ return ret;
}
/**
@@ -541,6 +651,13 @@ int kgdb_arch_init(void)
*/
void kgdb_arch_exit(void)
{
+ int i;
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].pev) {
+ unregister_wide_hw_breakpoint(breakinfo[i].pev);
+ breakinfo[i].pev = NULL;
+ }
+ }
unregister_die_notifier(&kgdb_notifier);
}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,23 @@
#include <linux/preempt.h>
#include <linux/module.h>
#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
void jprobe_return_end(void);
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-#ifdef CONFIG_X86_64
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-#else
-/*
- * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
- * don't save the ss and esp registers if the CPU is already in kernel
- * mode when it traps. So for kprobes, regs->sp and regs->ss are not
- * the [nonexistent] saved stack pointer and ss register, but rather
- * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
- * point to the top of the pre-int3 stack.
- */
-#define stack_addr(regs) ((unsigned long *)&regs->sp)
-#endif
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +98,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
/* ----------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
-static const u32 onebyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
- W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
- W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
- W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
- W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ----------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
- W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
- W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
- W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
- W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
- W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
#undef W
struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -159,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
};
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes set_jmp_op(void *from, void *to)
+static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
{
- struct __arch_jmp_op {
- char op;
+ struct __arch_relative_insn {
+ u8 op;
s32 raddr;
- } __attribute__((packed)) * jop;
- jop = (struct __arch_jmp_op *)from;
- jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
- jop->op = RELATIVEJUMP_INSTRUCTION;
+ } __attribute__((packed)) *insn;
+
+ insn = (struct __arch_relative_insn *)from;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes synthesize_reljump(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
/*
@@ -244,6 +198,75 @@ retry:
}
}
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ kp = get_kprobe((void *)addr);
+ if (!kp)
+ return -EINVAL;
+
+ /*
+ * Basically, kp->ainsn.insn has an original instruction.
+ * However, RIP-relative instruction can not do single-stepping
+ * at different place, __copy_instruction() tweaks the displacement of
+ * that instruction. In that case, we can't recover the instruction
+ * from the kp->ainsn.insn.
+ *
+ * On the other hand, kp->opcode has a copy of the first byte of
+ * the probed instruction, which is overwritten by int3. And
+ * the instruction at kp->addr is not modified by kprobes except
+ * for the first byte, we can recover the original instruction
+ * from it and kp->opcode.
+ */
+ memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ buf[0] = kp->opcode;
+ return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint by the
+ * original instruction in our buffer.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ /*
+ * Another debugging subsystem might insert
+ * this breakpoint. In that case, we can't
+ * recover it.
+ */
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ addr += insn.length;
+ }
+
+ return (addr == paddr);
+}
+
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
@@ -268,86 +291,67 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
}
/*
- * Adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
* If it does, Return the address of the 32-bit displacement word.
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static void __kprobes fix_riprel(struct kprobe *p)
+static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
{
-#ifdef CONFIG_X86_64
- u8 *insn = p->ainsn.insn;
- s64 disp;
- int need_modrm;
-
- /* Skip legacy instruction prefixes. */
- while (1) {
- switch (*insn) {
- case 0x66:
- case 0x67:
- case 0x2e:
- case 0x3e:
- case 0x26:
- case 0x64:
- case 0x65:
- case 0x36:
- case 0xf0:
- case 0xf3:
- case 0xf2:
- ++insn;
- continue;
+ struct insn insn;
+ int ret;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ kernel_insn_init(&insn, src);
+ if (recover) {
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf,
+ (unsigned long)src);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
}
- break;
}
+ insn_get_length(&insn);
+ memcpy(dest, insn.kaddr, insn.length);
- /* Skip REX instruction prefix. */
- if (is_REX_prefix(insn))
- ++insn;
-
- if (*insn == 0x0f) {
- /* Two-byte opcode. */
- ++insn;
- need_modrm = test_bit(*insn,
- (unsigned long *)twobyte_has_modrm);
- } else
- /* One-byte opcode. */
- need_modrm = test_bit(*insn,
- (unsigned long *)onebyte_has_modrm);
-
- if (need_modrm) {
- u8 modrm = *++insn;
- if ((modrm & 0xc7) == 0x05) {
- /* %rip+disp32 addressing mode */
- /* Displacement follows ModRM byte. */
- ++insn;
- /*
- * The copied instruction uses the %rip-relative
- * addressing mode. Adjust the displacement for the
- * difference between the original location of this
- * instruction and the location of the copy that will
- * actually be run. The tricky bit here is making sure
- * that the sign extension happens correctly in this
- * calculation, since we need a signed 32-bit result to
- * be sign-extended to 64 bits when it's added to the
- * %rip value and yield the same 64-bit result that the
- * sign-extension of the original signed 32-bit
- * displacement would have given.
- */
- disp = (u8 *) p->addr + *((s32 *) insn) -
- (u8 *) p->ainsn.insn;
- BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
- *(s32 *)insn = (s32) disp;
- }
+#ifdef CONFIG_X86_64
+ if (insn_rip_relative(&insn)) {
+ s64 newdisp;
+ u8 *disp;
+ kernel_insn_init(&insn, dest);
+ insn_get_displacement(&insn);
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) src + (s64) insn.displacement.value -
+ (u8 *) dest;
+ BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
+ disp = (u8 *) dest + insn_offset_displacement(&insn);
+ *(s32 *) disp = (s32) newdisp;
}
#endif
+ return insn.length;
}
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
- memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-
- fix_riprel(p);
+ /*
+ * Copy an instruction without recovering int3, because it will be
+ * put by another subsystem.
+ */
+ __copy_instruction(p->ainsn.insn, p->addr, 0);
if (can_boost(p->addr))
p->ainsn.boostable = 0;
@@ -359,6 +363,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
+ if (alternatives_text_reserved(p->addr, p->addr))
+ return -EINVAL;
+
+ if (!can_probe((unsigned long)p->addr))
+ return -EILSEQ;
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -423,18 +432,6 @@ static void __kprobes restore_btf(void)
update_debugctlmsr(current->thread.debugctlmsr);
}
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -446,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
*sara = (unsigned long) &kretprobe_trampoline;
}
+#ifdef CONFIG_OPTPROBES
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter);
+#else
+#define setup_detour_execution(p, regs, reenter) (0)
+#endif
+
static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+ struct kprobe_ctlblk *kcb, int reenter)
{
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPT)
if (p->ainsn.boostable == 1 && !p->post_handler) {
/* Boost up -- we can execute copied instructions directly */
- reset_current_kprobe();
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
regs->ip = (unsigned long)p->ainsn.insn;
preempt_enable_no_resched();
return;
}
#endif
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_HIT_SS;
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
}
/*
@@ -472,37 +499,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
- /* TODO: Provide re-entrancy from post_kprobes_handler() and
- * avoid exception stack corruption while single-stepping on
- * the instruction of the new probe.
- */
- arch_disarm_kprobe(p);
- regs->ip = (unsigned long)p->addr;
- reset_current_kprobe();
- preempt_enable_no_resched();
- break;
-#endif
case KPROBE_HIT_ACTIVE:
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
- prepare_singlestep(p, regs);
- kcb->kprobe_status = KPROBE_REENTER;
+ setup_singlestep(p, regs, kcb, 1);
break;
case KPROBE_HIT_SS:
- if (p == kprobe_running()) {
- regs->flags &= ~X86_EFLAGS_TF;
- regs->flags |= kcb->kprobe_saved_flags;
- return 0;
- } else {
- /* A probe has been hit in the codepath leading up
- * to, or just after, single-stepping of a probed
- * instruction. This entire codepath should strictly
- * reside in .kprobes.text section. Raise a warning
- * to highlight this peculiar case.
- */
- }
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in .kprobes.text section.
+ * Raise a BUG or we'll continue in an endless reentering loop
+ * and eventually a stack overflow.
+ */
+ printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+ p->addr);
+ dump_kprobe(p);
+ BUG();
default:
/* impossible cases */
WARN_ON(1);
@@ -514,7 +525,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
+ * remain disabled throughout this function.
*/
static int __kprobes kprobe_handler(struct pt_regs *regs)
{
@@ -565,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
* more here.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} else if (kprobe_running()) {
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb);
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
@@ -580,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %ds\n" \
+ " pushl %es\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -593,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
/* We don't bother saving the ss register */
" pushq %rsp\n"
" pushfq\n"
- /*
- * Skip cs, ip, orig_ax.
- * trampoline_handler() will plug in these values
- */
- " subq $24, %rsp\n"
- " pushq %rdi\n"
- " pushq %rsi\n"
- " pushq %rdx\n"
- " pushq %rcx\n"
- " pushq %rax\n"
- " pushq %r8\n"
- " pushq %r9\n"
- " pushq %r10\n"
- " pushq %r11\n"
- " pushq %rbx\n"
- " pushq %rbp\n"
- " pushq %r12\n"
- " pushq %r13\n"
- " pushq %r14\n"
- " pushq %r15\n"
+ SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
- " popq %r15\n"
- " popq %r14\n"
- " popq %r13\n"
- " popq %r12\n"
- " popq %rbp\n"
- " popq %rbx\n"
- " popq %r11\n"
- " popq %r10\n"
- " popq %r9\n"
- " popq %r8\n"
- " popq %rax\n"
- " popq %rcx\n"
- " popq %rdx\n"
- " popq %rsi\n"
- " popq %rdi\n"
- /* Skip orig_ax, ip, cs */
- " addq $24, %rsp\n"
+ RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
- /*
- * Skip cs, ip, orig_ax and gs.
- * trampoline_handler() will plug in these values
- */
- " subl $16, %esp\n"
- " pushl %fs\n"
- " pushl %es\n"
- " pushl %ds\n"
- " pushl %eax\n"
- " pushl %ebp\n"
- " pushl %edi\n"
- " pushl %esi\n"
- " pushl %edx\n"
- " pushl %ecx\n"
- " pushl %ebx\n"
+ SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
@@ -659,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
- " popl %ebx\n"
- " popl %ecx\n"
- " popl %edx\n"
- " popl %esi\n"
- " popl %edi\n"
- " popl %ebp\n"
- " popl %eax\n"
- /* Skip ds, es, fs, gs, orig_ax and ip */
- " addl $24, %esp\n"
+ RESTORE_REGS_STRING
" popf\n"
#endif
" ret\n");
@@ -835,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
* These instructions can be executed directly if it
* jumps back to correct address.
*/
- set_jmp_op((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
+ synthesize_reljump((void *)regs->ip,
+ (void *)orig_ip + (regs->ip - copy_ip));
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = -1;
@@ -851,7 +868,7 @@ no_change:
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
+ * remain disabled throughout this function.
*/
static int __kprobes post_kprobe_handler(struct pt_regs *regs)
{
@@ -967,8 +984,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
ret = NOTIFY_STOP;
break;
case DIE_DEBUG:
- if (post_kprobe_handler(args->regs))
+ if (post_kprobe_handler(args->regs)) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
ret = NOTIFY_STOP;
+ }
break;
case DIE_GPF:
/*
@@ -1057,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+
+#ifdef CONFIG_OPTPROBES
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+static void __kprobes synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
+ unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+void __kprobes kprobes_optinsn_template_holder(void)
+{
+ asm volatile (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry: \n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val: \n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call: \n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end: \n");
+}
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void __kprobes optimized_callback(struct optimized_kprobe *op,
+ struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __get_cpu_var(current_kprobe) = &op->kp;
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __get_cpu_var(current_kprobe) = NULL;
+ }
+ preempt_enable_no_resched();
+}
+
+static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, 1);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __kprobes insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int __kprobes can_optimize(unsigned long paddr)
+{
+ int ret;
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ /* Dummy buffers for lookup_symbol_attrs */
+ static char __dummy_buf[KSYM_NAME_LEN];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)addr);
+ insn_get_opcode(&insn);
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+ ret = recover_probed_instruction(buf, addr);
+ if (ret)
+ return 0;
+ kernel_insn_init(&insn, buf);
+ }
+ insn_get_length(&insn);
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static __kprobes
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ */
+int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff)
+ return -ERANGE;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a breakpoint (int3) with a relative jump. */
+int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+{
+ unsigned char jmp_code[RELATIVEJUMP_SIZE];
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ jmp_code[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&jmp_code[1]) = rel;
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
+}
+
+static int __kprobes setup_detour_execution(struct kprobe *p,
+ struct pt_regs *regs,
+ int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
#include <asm/desc.h>
#include <asm/system.h>
#include <asm/cacheflush.h>
+#include <asm/debugreg.h>
static void set_idt(void *newidt, __u16 limit)
{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
{
int error;
- if (nx_enabled)
- set_pages_x(image->control_code_page, 1);
+ set_pages_x(image->control_code_page, 1);
error = machine_kexec_alloc_page_tables(image);
if (error)
return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
- if (nx_enabled)
- set_pages_nx(image->control_code_page, 1);
+ set_pages_nx(image->control_code_page, 1);
machine_kexec_free_page_tables(image);
}
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/debugreg.h>
static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ hw_breakpoint_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
- *
- * Copyright (C) 2006, Advanced Micro Devices, Inc.
- * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
- */
-
-/*
- * We are using the 32.768kHz input clock - it's the only one that has the
- * ranges we find desirable. The following table lists the suitable
- * divisors and the associated Hz, minimum interval and the maximum interval:
- *
- * Divisor Hz Min Delta (s) Max Delta (s)
- * 1 32768 .00048828125 2.000
- * 2 16384 .0009765625 4.000
- * 4 8192 .001953125 8.000
- * 8 4096 .00390625 16.000
- * 16 2048 .0078125 32.000
- * 32 1024 .015625 64.000
- * 64 512 .03125 128.000
- * 128 256 .0625 256.000
- * 256 128 .125 512.000
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <asm/geode.h>
-
-#define MFGPT_DEFAULT_IRQ 7
-
-static struct mfgpt_timer_t {
- unsigned int avail:1;
-} mfgpt_timers[MFGPT_MAX_TIMERS];
-
-/* Selected from the table above */
-
-#define MFGPT_DIVISOR 16
-#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
-#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
-#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
-
-/* Allow for disabling of MFGPTs */
-static int disable;
-static int __init mfgpt_disable(char *s)
-{
- disable = 1;
- return 1;
-}
-__setup("nomfgpt", mfgpt_disable);
-
-/* Reset the MFGPT timers. This is required by some broken BIOSes which already
- * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
- * affected at least (0.99 is OK with MFGPT workaround left to off).
- */
-static int __init mfgpt_fix(char *s)
-{
- u32 val, dummy;
-
- /* The following udocumented bit resets the MFGPT timers */
- val = 0xFF; dummy = 0;
- wrmsr(MSR_MFGPT_SETUP, val, dummy);
- return 1;
-}
-__setup("mfgptfix", mfgpt_fix);
-
-/*
- * Check whether any MFGPTs are available for the kernel to use. In most
- * cases, firmware that uses AMD's VSA code will claim all timers during
- * bootup; we certainly don't want to take them if they're already in use.
- * In other cases (such as with VSAless OpenFirmware), the system firmware
- * leaves timers available for us to use.
- */
-
-
-static int timers = -1;
-
-static void geode_mfgpt_detect(void)
-{
- int i;
- u16 val;
-
- timers = 0;
-
- if (disable) {
- printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
- goto done;
- }
-
- if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
- printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
- goto done;
- }
-
- for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
- val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
- if (!(val & MFGPT_SETUP_SETUP)) {
- mfgpt_timers[i].avail = 1;
- timers++;
- }
- }
-
-done:
- printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
-}
-
-int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
-{
- u32 msr, mask, value, dummy;
- int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
-
- if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
- return -EIO;
-
- /*
- * The register maps for these are described in sections 6.17.1.x of
- * the AMD Geode CS5536 Companion Device Data Book.
- */
- switch (event) {
- case MFGPT_EVENT_RESET:
- /*
- * XXX: According to the docs, we cannot reset timers above
- * 6; that is, resets for 7 and 8 will be ignored. Is this
- * a problem? -dilinger
- */
- msr = MSR_MFGPT_NR;
- mask = 1 << (timer + 24);
- break;
-
- case MFGPT_EVENT_NMI:
- msr = MSR_MFGPT_NR;
- mask = 1 << (timer + shift);
- break;
-
- case MFGPT_EVENT_IRQ:
- msr = MSR_MFGPT_IRQ;
- mask = 1 << (timer + shift);
- break;
-
- default:
- return -EIO;
- }
-
- rdmsr(msr, value, dummy);
-
- if (enable)
- value |= mask;
- else
- value &= ~mask;
-
- wrmsr(msr, value, dummy);
- return 0;
-}
-EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
-
-int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
-{
- u32 zsel, lpc, dummy;
- int shift;
-
- if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
- return -EIO;
-
- /*
- * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
- * is using the same CMP of the timer's Siamese twin, the IRQ is set to
- * 2, and we mustn't use nor change it.
- * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
- * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
- * with *irq==0 is safe. Currently there _are_ no 2 drivers.
- */
- rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
- shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
- if (((zsel >> shift) & 0xF) == 2)
- return -EIO;
-
- /* Choose IRQ: if none supplied, keep IRQ already set or use default */
- if (!*irq)
- *irq = (zsel >> shift) & 0xF;
- if (!*irq)
- *irq = MFGPT_DEFAULT_IRQ;
-
- /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
- if (*irq < 1 || *irq == 2 || *irq > 15)
- return -EIO;
- rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
- if (lpc & (1 << *irq))
- return -EIO;
-
- /* All chosen and checked - go for it */
- if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
- return -EIO;
- if (enable) {
- zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
- wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
- }
-
- return 0;
-}
-
-static int mfgpt_get(int timer)
-{
- mfgpt_timers[timer].avail = 0;
- printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
- return timer;
-}
-
-int geode_mfgpt_alloc_timer(int timer, int domain)
-{
- int i;
-
- if (timers == -1) {
- /* timers haven't been detected yet */
- geode_mfgpt_detect();
- }
-
- if (!timers)
- return -1;
-
- if (timer >= MFGPT_MAX_TIMERS)
- return -1;
-
- if (timer < 0) {
- /* Try to find an available timer */
- for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
- if (mfgpt_timers[i].avail)
- return mfgpt_get(i);
-
- if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
- break;
- }
- } else {
- /* If they requested a specific timer, try to honor that */
- if (mfgpt_timers[timer].avail)
- return mfgpt_get(timer);
- }
-
- /* No timers available - too bad */
- return -1;
-}
-EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
-
-
-#ifdef CONFIG_GEODE_MFGPT_TIMER
-
-/*
- * The MFPGT timers on the CS5536 provide us with suitable timers to use
- * as clock event sources - not as good as a HPET or APIC, but certainly
- * better than the PIT. This isn't a general purpose MFGPT driver, but
- * a simplified one designed specifically to act as a clock event source.
- * For full details about the MFGPT, please consult the CS5536 data sheet.
- */
-
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
-static u16 mfgpt_event_clock;
-
-static int irq;
-static int __init mfgpt_setup(char *str)
-{
- get_option(&str, &irq);
- return 1;
-}
-__setup("mfgpt_irq=", mfgpt_setup);
-
-static void mfgpt_disable_timer(u16 clock)
-{
- /* avoid races by clearing CMP1 and CMP2 unconditionally */
- geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
- MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
-}
-
-static int mfgpt_next_event(unsigned long, struct clock_event_device *);
-static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
-
-static struct clock_event_device mfgpt_clockevent = {
- .name = "mfgpt-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = mfgpt_set_mode,
- .set_next_event = mfgpt_next_event,
- .rating = 250,
- .cpumask = cpu_all_mask,
- .shift = 32
-};
-
-static void mfgpt_start_timer(u16 delta)
-{
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
-
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
- MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
-}
-
-static void mfgpt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- mfgpt_disable_timer(mfgpt_event_clock);
-
- if (mode == CLOCK_EVT_MODE_PERIODIC)
- mfgpt_start_timer(MFGPT_PERIODIC);
-
- mfgpt_tick_mode = mode;
-}
-
-static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- mfgpt_start_timer(delta);
- return 0;
-}
-
-static irqreturn_t mfgpt_tick(int irq, void *dev_id)
-{
- u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
-
- /* See if the interrupt was for us */
- if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
- return IRQ_NONE;
-
- /* Turn off the clock (and clear the event) */
- mfgpt_disable_timer(mfgpt_event_clock);
-
- if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
- return IRQ_HANDLED;
-
- /* Clear the counter */
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
-
- /* Restart the clock in periodic mode */
-
- if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
- MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
- }
-
- mfgpt_clockevent.event_handler(&mfgpt_clockevent);
- return IRQ_HANDLED;
-}
-
-static struct irqaction mfgptirq = {
- .handler = mfgpt_tick,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
- .name = "mfgpt-timer"
-};
-
-int __init mfgpt_timer_setup(void)
-{
- int timer, ret;
- u16 val;
-
- timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
- if (timer < 0) {
- printk(KERN_ERR
- "mfgpt-timer: Could not allocate a MFPGT timer\n");
- return -ENODEV;
- }
-
- mfgpt_event_clock = timer;
-
- /* Set up the IRQ on the MFGPT side */
- if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
- printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
- return -EIO;
- }
-
- /* And register it with the kernel */
- ret = setup_irq(irq, &mfgptirq);
-
- if (ret) {
- printk(KERN_ERR
- "mfgpt-timer: Unable to set up the interrupt.\n");
- goto err;
- }
-
- /* Set the clock scale and enable the event mode for CMP2 */
- val = MFGPT_SCALE | (3 << 8);
-
- geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
-
- /* Set up the clock event */
- mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
- mfgpt_clockevent.shift);
- mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
- &mfgpt_clockevent);
- mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
- &mfgpt_clockevent);
-
- printk(KERN_INFO
- "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
- timer, irq);
- clockevents_register_device(&mfgpt_clockevent);
-
- return 0;
-
-err:
- geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
- printk(KERN_ERR
- "mfgpt-timer: Unable to set up the MFGPT clock source\n");
- return -EIO;
-}
-
-#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/firmware.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
@@ -76,12 +79,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
memset(csig, 0, sizeof(*csig));
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
- "supported\n", cpu, c->x86);
+ pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
+ "supported\n", cpu, c->x86);
return -1;
}
rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
- printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
+ pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
return 0;
}
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
i++;
}
- if (!equiv_cpu_id) {
- printk(KERN_WARNING "microcode: CPU%d: cpu revision "
- "not listed in equivalent cpu table\n", cpu);
+ if (!equiv_cpu_id)
return 0;
- }
- if (mc_header->processor_rev_id != equiv_cpu_id) {
- printk(KERN_ERR "microcode: CPU%d: patch mismatch "
- "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
- cpu, mc_header->processor_rev_id, equiv_cpu_id);
+ if (mc_header->processor_rev_id != equiv_cpu_id)
return 0;
- }
/* ucode might be chipset specific -- currently we don't support this */
if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
- printk(KERN_ERR "microcode: CPU%d: loading of chipset "
- "specific code not yet supported\n", cpu);
+ pr_err("CPU%d: loading of chipset specific code not yet supported\n",
+ cpu);
return 0;
}
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
- printk(KERN_ERR "microcode: CPU%d: update failed "
- "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
+ pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
+ cpu, mc_amd->hdr.patch_id);
return -1;
}
- printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
- cpu, rev);
-
+ pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
uci->cpu_sig.rev = rev;
return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
return NULL;
if (section_hdr[0] != UCODE_UCODE_TYPE) {
- printk(KERN_ERR "microcode: error: invalid type field in "
- "container file section header\n");
+ pr_err("error: invalid type field in container file section header\n");
return NULL;
}
total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
- printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
- size, total_size);
-
if (total_size > size || total_size > UCODE_MAX_SIZE) {
- printk(KERN_ERR "microcode: error: size mismatch\n");
+ pr_err("error: size mismatch\n");
return NULL;
}
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
size = buf_pos[2];
if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- printk(KERN_ERR "microcode: error: invalid type field in "
- "container file section header\n");
+ pr_err("error: invalid type field in container file section header\n");
return 0;
}
equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
if (!equiv_cpu_table) {
- printk(KERN_ERR "microcode: failed to allocate "
- "equivalent CPU table\n");
+ pr_err("failed to allocate equivalent CPU table\n");
return 0;
}
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
- printk(KERN_ERR "microcode: failed to create "
- "equivalent cpu table\n");
+ pr_err("failed to create equivalent cpu table\n");
return UCODE_ERROR;
}
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
if (!leftover) {
vfree(uci->mc);
uci->mc = new_mc;
- pr_debug("microcode: CPU%d found a matching microcode "
- "update with version 0x%x (current=0x%x)\n",
+ pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
} else {
vfree(new_mc);
@@ -317,6 +303,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
return UCODE_NFOUND;
}
+ if (*(u32 *)firmware->data != UCODE_MAGIC) {
+ pr_err("invalid UCODE_MAGIC (0x%08x)\n",
+ *(u32 *)firmware->data);
+ return UCODE_ERROR;
+ }
+
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
release_firmware(firmware);
@@ -327,8 +319,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
static enum ucode_state
request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- printk(KERN_INFO "microcode: AMD microcode update via "
- "/dev/cpu/microcode not supported\n");
+ pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
return UCODE_ERROR;
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/platform_device.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
-#include <linux/smp_lock.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
static int microcode_open(struct inode *unused1, struct file *unused2)
{
- cycle_kernel_lock();
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > totalram_pages) {
- pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
+ pr_err("too much data (max %ld pages)\n", totalram_pages);
return ret;
}
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
error = misc_register(&microcode_dev);
if (error) {
- pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
+ pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
if (!uci->mc)
return UCODE_NFOUND;
- pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+ pr_debug("CPU%d updated upon resume\n", cpu);
apply_microcode_on_target(cpu);
return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
if (ustate == UCODE_OK) {
- pr_debug("microcode: CPU%d updated upon init\n", cpu);
+ pr_debug("CPU%d updated upon init\n", cpu);
apply_microcode_on_target(cpu);
}
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
if (!cpu_online(cpu))
return 0;
- pr_debug("microcode: CPU%d added\n", cpu);
+ pr_debug("CPU%d added\n", cpu);
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
if (!cpu_online(cpu))
return 0;
- pr_debug("microcode: CPU%d removed\n", cpu);
+ pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
microcode_update_cpu(cpu);
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- pr_debug("microcode: CPU%d added\n", cpu);
+ pr_debug("CPU%d added\n", cpu);
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- pr_err("microcode: Failed to create group for CPU%d\n", cpu);
+ pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- pr_debug("microcode: CPU%d removed\n", cpu);
+ pr_debug("CPU%d removed\n", cpu);
break;
case CPU_DEAD:
case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
microcode_ops = init_amd_microcode();
if (!microcode_ops) {
- pr_err("microcode: no support for this CPU vendor\n");
+ pr_err("no support for this CPU vendor\n");
return -ENODEV;
}
@@ -540,8 +541,7 @@ static int __init microcode_init(void)
register_hotcpu_notifier(&mc_cpu_notifier);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>,"
- " Peter Oruba\n");
+ " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
return 0;
}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel "
- "processor\n", cpu_num);
+ pr_err("CPU%d not a capable Intel processor\n", cpu_num);
return -1;
}
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
- printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
- cpu_num, csig->sig, csig->pf, csig->rev);
+ pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+ cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
data_size = get_datasize(mc_header);
if (data_size + MC_HEADER_SIZE > total_size) {
- printk(KERN_ERR "microcode: error! "
- "Bad data size in microcode data file\n");
+ pr_err("error! Bad data size in microcode data file\n");
return -EINVAL;
}
if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- printk(KERN_ERR "microcode: error! "
- "Unknown microcode update format\n");
+ pr_err("error! Unknown microcode update format\n");
return -EINVAL;
}
ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
if (ext_table_size) {
if ((ext_table_size < EXT_HEADER_SIZE)
|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! "
- "Small exttable size in microcode data file\n");
+ pr_err("error! Small exttable size in microcode data file\n");
return -EINVAL;
}
ext_header = mc + MC_HEADER_SIZE + data_size;
if (ext_table_size != exttable_size(ext_header)) {
- printk(KERN_ERR "microcode: error! "
- "Bad exttable size in microcode data file\n");
+ pr_err("error! Bad exttable size in microcode data file\n");
return -EFAULT;
}
ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
while (i--)
ext_table_sum += ext_tablep[i];
if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, "
- "bad extended signature table checksum\n");
+ pr_warning("aborting, bad extended signature table checksum\n");
return -EINVAL;
}
}
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
while (i--)
orig_sum += ((int *)mc)[i];
if (orig_sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ pr_err("aborting, bad checksum\n");
return -EINVAL;
}
if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
- (mc_header->sig + mc_header->pf + mc_header->cksum)
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
if (sum) {
- printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ pr_err("aborting, bad checksum\n");
return -EINVAL;
}
}
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc_intel->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update "
- "to revision 0x%x failed\n",
- cpu_num, mc_intel->hdr.rev);
+ pr_err("CPU%d update to revision 0x%x failed\n",
+ cpu_num, mc_intel->hdr.rev);
return -1;
}
- printk(KERN_INFO "microcode: CPU%d updated to revision "
- "0x%x, date = %04x-%02x-%02x \n",
+ pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
mc_size = get_totalsize(&mc_header);
if (!mc_size || mc_size > leftover) {
- printk(KERN_ERR "microcode: error!"
- "Bad data in microcode data file\n");
+ pr_err("error! Bad data in microcode data file\n");
break;
}
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
- pr_debug("microcode: CPU%d found a matching microcode update with"
- " version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
+ pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
out:
return state;
}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
c->x86, c->x86_model, c->x86_mask);
if (request_firmware(&firmware, name, device)) {
- pr_debug("microcode: data file %s load failed\n", name);
+ pr_debug("data file %s load failed\n", name);
return UCODE_NFOUND;
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
static int __cpuinit cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..a2c1edd2d3ac 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
x86_init.mpparse.mpc_record(1);
}
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
@@ -667,36 +660,18 @@ void __init default_get_smp_config(unsigned int early)
*/
}
-static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
-#ifdef CONFIG_X86_32
- /*
- * We cannot access to MPC table to compute table size yet,
- * as only few megabytes from the bottom is mapped now.
- * PC-9800's MPC table places on the very last of physical
- * memory; so that simply reserving PAGE_SIZE from mpf->physptr
- * yields BUG() in reserve_bootmem.
- * also need to make sure physptr is below than max_low_pfn
- * we don't need reserve the area above max_low_pfn
- */
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->physptr < end) {
- if (mpf->physptr + size > end)
- size = end - mpf->physptr;
- reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
- }
-#else
- reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-#endif
+ reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
}
-static int __init smp_scan_config(unsigned long base, unsigned long length,
- unsigned reserve)
+static int __init smp_scan_config(unsigned long base, unsigned long length)
{
unsigned int *bp = phys_to_virt(base);
struct mpf_intel *mpf;
+ unsigned long mem;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
@@ -717,12 +692,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
mpf, (u64)virt_to_phys(mpf));
- if (!reserve)
- return 1;
- reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
- BOOTMEM_DEFAULT);
+ mem = virt_to_phys(mpf);
+ reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
if (mpf->physptr)
- smp_reserve_bootmem(mpf);
+ smp_reserve_memory(mpf);
return 1;
}
@@ -732,7 +705,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
return 0;
}
-void __init default_find_smp_config(unsigned int reserve)
+void __init default_find_smp_config(void)
{
unsigned int address;
@@ -744,9 +717,9 @@ void __init default_find_smp_config(unsigned int reserve)
* 2) Scan the top 1K of base RAM
* 3) Scan the 64K of bios
*/
- if (smp_scan_config(0x0, 0x400, reserve) ||
- smp_scan_config(639 * 0x400, 0x400, reserve) ||
- smp_scan_config(0xF0000, 0x10000, reserve))
+ if (smp_scan_config(0x0, 0x400) ||
+ smp_scan_config(639 * 0x400, 0x400) ||
+ smp_scan_config(0xF0000, 0x10000))
return;
/*
* If it is an SMP machine we should know now, unless the
@@ -767,7 +740,7 @@ void __init default_find_smp_config(unsigned int reserve)
address = get_bios_ebda();
if (address)
- smp_scan_config(address, 0x400, reserve);
+ smp_scan_config(address, 0x400);
}
#ifdef CONFIG_X86_IO_APIC
@@ -965,9 +938,6 @@ void __init early_reserve_e820_mpc_new(void)
{
if (enable_update_mptable && alloc_mptable) {
u64 startt = 0;
-#ifdef CONFIG_X86_TRAMPOLINE
- startt = TRAMPOLINE_BASE;
-#endif
mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
}
}
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
* of the License.
*/
#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sfi.h>
+#include <linux/irq.h>
+#include <linux/module.h>
#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/mrst.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/apb_timer.h>
+
+static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
+static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+int sfi_mtimer_num;
+
+struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
+EXPORT_SYMBOL_GPL(sfi_mrtc_array);
+int sfi_mrtc_num;
+
+static inline void assign_to_mp_irq(struct mpc_intsrc *m,
+ struct mpc_intsrc *mp_irq)
+{
+ memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
+}
+
+static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
+ struct mpc_intsrc *m)
+{
+ return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
+}
+
+static void save_mp_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+/* parse all the mtimer info to a static mtimer array */
+static int __init sfi_parse_mtmr(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_timer_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mtimer_num) {
+ sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_timer_table_entry);
+ pentry = (struct sfi_timer_table_entry *) sb->pentry;
+ totallen = sfi_mtimer_num * sizeof(*pentry);
+ memcpy(sfi_mtimer_array, pentry, totallen);
+ }
+
+ printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
+ pentry = sfi_mtimer_array;
+ for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
+ printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
+ " irq = %d\n", totallen, (u32)pentry->phys_addr,
+ pentry->freq_hz, pentry->irq);
+ if (!pentry->irq)
+ continue;
+ mp_irq.type = MP_IOAPIC;
+ mp_irq.irqtype = mp_INT;
+/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+ mp_irq.irqflag = 5;
+ mp_irq.srcbus = 0;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ save_mp_irq(&mp_irq);
+ }
+
+ return 0;
+}
+
+struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
+{
+ int i;
+ if (hint < sfi_mtimer_num) {
+ if (!sfi_mtimer_usage[hint]) {
+ pr_debug("hint taken for timer %d irq %d\n",\
+ hint, sfi_mtimer_array[hint].irq);
+ sfi_mtimer_usage[hint] = 1;
+ return &sfi_mtimer_array[hint];
+ }
+ }
+ /* take the first timer available */
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (!sfi_mtimer_usage[i]) {
+ sfi_mtimer_usage[i] = 1;
+ return &sfi_mtimer_array[i];
+ }
+ i++;
+ }
+ return NULL;
+}
+
+void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
+{
+ int i;
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (mtmr->irq == sfi_mtimer_array[i].irq) {
+ sfi_mtimer_usage[i] = 0;
+ return;
+ }
+ i++;
+ }
+}
+
+/* parse all the mrtc info to a global mrtc array */
+int __init sfi_parse_mrtc(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_rtc_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mrtc_num) {
+ sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_rtc_table_entry);
+ pentry = (struct sfi_rtc_table_entry *)sb->pentry;
+ totallen = sfi_mrtc_num * sizeof(*pentry);
+ memcpy(sfi_mrtc_array, pentry, totallen);
+ }
+
+ printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
+ pentry = sfi_mrtc_array;
+ for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
+ printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
+ totallen, (u32)pentry->phys_addr, pentry->irq);
+ mp_irq.type = MP_IOAPIC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = 0;
+ mp_irq.srcbus = 0;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ save_mp_irq(&mp_irq);
+ }
+ return 0;
+}
+
+/*
+ * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
+ * APBT but cmdline option can also override it.
+ */
+static void __cpuinit mrst_setup_secondary_clock(void)
+{
+ /* restore default lapic clock if disabled by cmdline */
+ if (disable_apbt_percpu)
+ return setup_secondary_APIC_clock();
+ apbt_setup_secondary_clock();
+}
+
+static unsigned long __init mrst_calibrate_tsc(void)
+{
+ unsigned long flags, fast_calibrate;
+
+ local_irq_save(flags);
+ fast_calibrate = apbt_quick_calibrate();
+ local_irq_restore(flags);
+
+ if (fast_calibrate)
+ return fast_calibrate;
+
+ return 0;
+}
+
+void __init mrst_time_init(void)
+{
+ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+ pre_init_apic_IRQ0();
+ apbt_time_init();
+}
+
+void __init mrst_rtc_init(void)
+{
+ sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+}
+
+/*
+ * if we use per cpu apb timer, the bootclock already setup. if we use lapic
+ * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
+ */
+static void __init mrst_setup_boot_clock(void)
+{
+ pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
+ if (disable_apbt_percpu)
+ setup_boot_APIC_clock();
+};
/*
* Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
{
x86_init.resources.probe_roms = x86_init_noop;
x86_init.resources.reserve_resources = x86_init_noop;
+
+ x86_init.timers.timer_init = mrst_time_init;
+ x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+
+ x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+
+ x86_platform.calibrate_tsc = mrst_calibrate_tsc;
+ x86_init.pci.init = pci_mrst_init;
+ x86_init.pci.fixup_irqs = x86_init_noop;
+
+ legacy_pic = &null_legacy_pic;
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..206735ac8cbd 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -172,23 +172,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
static int msr_open(struct inode *inode, struct file *file)
{
- unsigned int cpu = iminor(file->f_path.dentry->d_inode);
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- int ret = 0;
+ unsigned int cpu;
+ struct cpuinfo_x86 *c;
- lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
- if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- ret = -ENXIO; /* No such CPU */
- goto out;
- }
c = &cpu_data(cpu);
if (!cpu_has(c, X86_FEATURE_MSR))
- ret = -EIO; /* MSR not supported */
-out:
- unlock_kernel();
- return ret;
+ return -EIO; /* MSR not supported */
+
+ return 0;
}
/*
@@ -251,7 +246,7 @@ static int __init msr_init(void)
int i, err = 0;
i = 0;
- if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
+ if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
printk(KERN_ERR "msr: unable to get major %d for msr\n",
MSR_MAJOR);
err = -EBUSY;
@@ -279,7 +274,7 @@ out_class:
msr_device_destroy(i);
class_destroy(msr_class);
out_chrdev:
- unregister_chrdev(MSR_MAJOR, "cpu/msr");
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
out:
return err;
}
@@ -290,7 +285,7 @@ static void __exit msr_exit(void)
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
- unregister_chrdev(MSR_MAJOR, "cpu/msr");
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
unregister_hotcpu_notifier(&msr_class_cpu_notifier);
}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
#include <linux/spinlock.h>
#include <linux/io.h>
#include <linux/string.h>
+
#include <asm/geode.h>
+#include <asm/setup.h>
#include <asm/olpc.h>
#ifdef CONFIG_OPEN_FIRMWARE
@@ -212,7 +214,7 @@ static int __init olpc_init(void)
unsigned char *romsig;
/* The ioremap check is dangerous; limit what we run it on */
- if (!is_geode() || geode_has_vsa2())
+ if (!is_geode() || cs5535_has_vsa2())
return 0;
spin_lock_init(&ec_lock);
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
(unsigned char *) &olpc_platform_info.ecver, 1);
- /* check to see if the VSA exists */
- if (geode_has_vsa2())
- olpc_platform_info.flags |= OLPC_F_VSA;
+#ifdef CONFIG_PCI_OLPC
+ /* If the VSA exists let it emulate PCI, if not emulate in kernel */
+ if (!cs5535_has_vsa2())
+ x86_init.pci.arch_init = pci_olpc_init;
+#endif
printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
#include <asm/paravirt.h>
static inline void
-default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
{
- __raw_spin_lock(lock);
+ arch_spin_lock(lock);
}
struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = kmap_atomic,
-#endif
-
#if PAGETABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
#include <linux/string.h>
#include <linux/crash_dump.h>
#include <linux/dma-mapping.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/delay.h>
@@ -46,6 +46,7 @@
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
spin_lock_irqsave(&tbl->it_lock, flags);
- iommu_area_reserve(tbl->it_map, index, npages);
+ bitmap_set(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
- return bad_dma_address;
+ return DMA_ERROR_CODE;
}
}
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
void *vaddr, unsigned int npages, int direction)
{
unsigned long entry;
- dma_addr_t ret = bad_dma_address;
+ dma_addr_t ret;
entry = iommu_range_alloc(dev, tbl, npages);
- if (unlikely(entry == bad_dma_address))
- goto error;
+ if (unlikely(entry == DMA_ERROR_CODE)) {
+ printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+ "iommu %p\n", npages, tbl);
+ return DMA_ERROR_CODE;
+ }
/* set the return dma address */
ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
/* put the TCEs in the HW table */
tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
direction);
-
return ret;
-
-error:
- printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
- "iommu %p\n", npages, tbl);
- return bad_dma_address;
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned long flags;
/* were we called with bad_dma_address? */
- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
+ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
spin_lock_irqsave(&tbl->it_lock, flags);
- iommu_area_free(tbl->it_map, entry, npages);
+ bitmap_clear(tbl->it_map, entry, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
pdev = to_pci_dev(dev);
+ /* search up the device tree for an iommu */
pbus = pdev->bus;
-
- /* is the device behind a bridge? Look for the root bus */
- while (pbus->parent)
+ do {
+ tbl = pci_iommu(pbus);
+ if (tbl && tbl->it_busno == pbus->number)
+ break;
+ tbl = NULL;
pbus = pbus->parent;
-
- tbl = pci_iommu(pbus);
+ } while (pbus);
BUG_ON(tbl && (tbl->it_busno != pbus->number));
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == bad_dma_address) {
+ if (entry == DMA_ERROR_CODE) {
/* makes sure unmap knows to stop */
s->dma_length = 0;
goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
error:
calgary_unmap_sg(dev, sg, nelems, dir, NULL);
for_each_sg(sg, s, nelems, i) {
- sg->dma_address = bad_dma_address;
+ sg->dma_address = DMA_ERROR_CODE;
sg->dma_length = 0;
}
return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
/* set up tces to cover the allocated range */
mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
+ if (mapping == DMA_ERROR_CODE)
goto free;
*dma_handle = mapping;
return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
+ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
/* for CalIOC2 - avoid the entire first MB */
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
/*
* get_tce_space_from_tar():
* Function for kdump case. Get the tce tables from first kernel
- * by reading the contents of the base adress register of calgary iommu
+ * by reading the contents of the base address register of calgary iommu
*/
static void __init get_tce_space_from_tar(void)
{
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
return;
}
+static int __init calgary_iommu_init(void)
+{
+ int ret;
+
+ /* ok, we're trying to use Calgary - let's roll */
+ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+ ret = calgary_init();
+ if (ret) {
+ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+ "falling back to no_iommu\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
void __init detect_calgary(void)
{
int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
* if the user specified iommu=off or iommu=soft or we found
* another HW IOMMU already, bail out.
*/
- if (swiotlb || no_iommu || iommu_detected)
+ if (no_iommu || iommu_detected)
return;
if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
specified_table_size);
- /* swiotlb for devices that aren't behind the Calgary. */
- if (max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
+ x86_init.iommu.iommu_init = calgary_iommu_init;
}
return;
@@ -1457,35 +1472,6 @@ cleanup:
}
}
-int __init calgary_iommu_init(void)
-{
- int ret;
-
- if (no_iommu || (swiotlb && !calgary_detected))
- return -ENODEV;
-
- if (!calgary_detected)
- return -ENODEV;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- force_iommu = 1;
- bad_dma_address = 0x0;
- /* dma_ops is set to swiotlb or nommu */
- if (!dma_ops)
- dma_ops = &nommu_dma_ops;
-
- return 0;
-}
-
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 64b838eac18c..a4ac764a6880 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
+#include <asm/x86_init.h>
static int forbid_dac __read_mostly;
-struct dma_map_ops *dma_ops;
+struct dma_map_ops *dma_ops = &nommu_dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0;
/*
* This variable becomes 1 if iommu=pt is passed on the kernel command line.
- * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * If this variable is 1, IOMMU implementations do no DMA translation for
* devices and allow every device to access to whole physical memory. This is
- * useful if a user want to use an IOMMU only for KVM device assignment to
+ * useful if a user wants to use an IOMMU only for KVM device assignment to
* guests and not for driver dma translation.
*/
int iommu_pass_through __read_mostly;
-dma_addr_t bad_dma_address __read_mostly = 0;
-EXPORT_SYMBOL(bad_dma_address);
-
-/* Dummy device used for NULL arguments (normally ISA). Better would
- be probably a smaller DMA mask, but this is bug-to-bug compatible
- to older i386. */
+/* Dummy device used for NULL arguments (normally ISA). */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
- .coherent_dma_mask = DMA_BIT_MASK(32),
+ .coherent_dma_mask = ISA_DMA_BIT_MASK,
.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
};
EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -69,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
static __initdata void *dma32_bootmem_ptr;
static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
@@ -120,27 +116,33 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#else
+void __init dma32_reserve_bootmem(void)
+{
+}
+static void __init dma32_free_bootmem(void)
+{
+}
+
#endif
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
-#endif
- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
+ if (pci_swiotlb_detect())
+ goto out;
+
gart_iommu_hole_init();
detect_calgary();
detect_intel_iommu();
+ /* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
-
+out:
pci_swiotlb_init();
}
@@ -216,7 +218,7 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "allowdac", 8))
forbid_dac = 0;
if (!strncmp(p, "nodac", 5))
- forbid_dac = -1;
+ forbid_dac = 1;
if (!strncmp(p, "usedac", 6)) {
forbid_dac = -1;
return 1;
@@ -291,27 +293,19 @@ static int __init pci_iommu_init(void)
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
#endif
+ x86_init.iommu.iommu_init();
- calgary_iommu_init();
-
- intel_iommu_init();
-
- amd_iommu_init();
+ if (swiotlb) {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else
+ swiotlb_free();
- gart_iommu_init();
-
- no_iommu_init();
return 0;
}
-
-void pci_iommu_shutdown(void)
-{
- gart_iommu_shutdown();
-
- amd_iommu_shutdown();
-}
/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..f3af115a573a 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,13 +16,14 @@
#include <linux/agp_backend.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/sched.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/kdebug.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
@@ -38,6 +39,7 @@
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
static u32 *iommu_gatt_base; /* Remapping table */
+static dma_addr_t bad_dma_addr;
+
/*
* If this is disabled the IOMMU will use an optimized flushing strategy
* of only flushing when an mapping is reused. With it true the GART is
@@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -122,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
- iommu_area_free(iommu_gart_bitmap, offset, size);
+ bitmap_clear(iommu_gart_bitmap, offset, size);
if (offset >= next_bit)
next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
- return bad_dma_address;
+ return bad_dma_addr;
}
for (i = 0; i < npages; i++) {
@@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int i;
#ifdef CONFIG_IOMMU_DEBUG
- printk(KERN_DEBUG "dma_map_sg overflow\n");
+ pr_debug("dma_map_sg overflow\n");
#endif
for_each_sg(sg, s, nents, i) {
@@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
- if (addr == bad_dma_address) {
+ if (addr == bad_dma_addr) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir, NULL);
nents = 0;
@@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!dev)
dev = &x86_dma_fallback_dev;
- out = 0;
- start = 0;
- start_sg = sgmap = sg;
- seg_size = 0;
- max_seg_size = dma_get_max_seg_size(dev);
- ps = NULL; /* shut up gcc */
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
+
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = sg_phys(s);
@@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
sgmap, pages, need) < 0)
goto error;
out++;
- seg_size = 0;
- sgmap = sg_next(sgmap);
- pages = 0;
- start = i;
- start_sg = s;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
}
}
@@ -454,7 +461,7 @@ error:
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
- s->dma_address = bad_dma_address;
+ s->dma_address = bad_dma_addr;
return 0;
}
@@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
DMA_BIDIRECTIONAL, align_mask);
flush_gart();
- if (paddr != bad_dma_address) {
+ if (paddr != bad_dma_addr) {
*dma_addr = paddr;
return page_address(page);
}
@@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
+
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- printk(KERN_WARNING
+ pr_warning(
"PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
iommu_size >> 20);
@@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
aperture_alloc = aper_alloc;
}
-static int gart_resume(struct sys_device *dev)
+static void gart_fixup_northbridges(struct sys_device *dev)
{
- printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+ int i;
- if (fix_up_north_bridges) {
- int i;
+ if (!fix_up_north_bridges)
+ return;
- printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+ pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
- /*
- * Don't enable translations just yet. That is the next
- * step. Restore the pre-suspend aperture settings.
- */
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
- aperture_order << 1);
- pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
- aperture_alloc >> 25);
- }
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
+}
+
+static int gart_resume(struct sys_device *dev)
+{
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges(dev);
enable_gart_translations();
@@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
}
static struct sysdev_class gart_sysdev_class = {
- .name = "gart",
- .suspend = gart_suspend,
- .resume = gart_resume,
+ .name = "gart",
+ .suspend = gart_suspend,
+ .resume = gart_resume,
};
static struct sys_device device_gart = {
- .id = 0,
- .cls = &gart_sysdev_class,
+ .cls = &gart_sysdev_class,
};
/*
@@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
void *gatt;
int i, error;
- printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+ pr_info("PCI-DMA: Disabling AGP.\n");
+
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
for (i = 0; i < num_k8_northbridges; i++) {
@@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
}
if (!aper_base)
goto nommu;
+
info->aper_base = aper_base;
info->aper_size = aper_size >> 20;
@@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
flush_gart();
- printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
aper_base, aper_size>>10);
return 0;
nommu:
/* Should not happen anymore */
- printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
"falling back to iommu=soft.\n");
return -1;
}
@@ -685,14 +702,16 @@ static struct dma_map_ops gart_dma_ops = {
.unmap_page = gart_unmap_page,
.alloc_coherent = gart_alloc_coherent,
.free_coherent = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
};
-void gart_iommu_shutdown(void)
+static void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
- if (no_agp && (dma_ops != &gart_dma_ops))
+ /* don't shutdown it if there is AGP installed */
+ if (!no_agp)
return;
for (i = 0; i < num_k8_northbridges; i++) {
@@ -707,7 +726,7 @@ void gart_iommu_shutdown(void)
}
}
-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
@@ -716,8 +735,8 @@ void __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
- return;
+ if (num_k8_northbridges == 0)
+ return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
@@ -729,35 +748,28 @@ void __init gart_iommu_init(void)
(agp_copy_info(agp_bridge, &info) < 0);
#endif
- if (swiotlb)
- return;
-
- /* Did we detect a different HW IOMMU? */
- if (iommu_detected && !gart_iommu_aperture)
- return;
-
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
(no_agp && init_k8_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- printk(KERN_WARNING "More than 4GB of memory "
- "but GART IOMMU not available.\n");
- printk(KERN_WARNING "falling back to iommu=soft.\n");
+ pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warning("falling back to iommu=soft.\n");
}
- return;
+ return 0;
}
/* need to map that range */
- aper_size = info.aper_size << 20;
- aper_base = info.aper_base;
- end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
if (end_pfn > max_low_pfn_mapped) {
start_pfn = (aper_base>>PAGE_SHIFT);
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
}
- printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+ pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
@@ -772,8 +784,7 @@ void __init gart_iommu_init(void)
ret = dma_debug_resize_entries(iommu_pages);
if (ret)
- printk(KERN_DEBUG
- "PCI-DMA: Cannot trace all the entries\n");
+ pr_debug("PCI-DMA: Cannot trace all the entries\n");
}
#endif
@@ -781,17 +792,16 @@ void __init gart_iommu_init(void)
* Out of IOMMU space handling.
* Reserve some invalid pages at the beginning of the GART.
*/
- iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+ bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
- agp_memory_reserved = iommu_size;
- printk(KERN_INFO
- "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size >> 20);
- iommu_start = aper_size - iommu_size;
- iommu_bus_base = info.aper_base + iommu_start;
- bad_dma_address = iommu_bus_base;
- iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
* Unmap the IOMMU part of the GART. The alias of the page is
@@ -813,7 +823,7 @@ void __init gart_iommu_init(void)
* the pages as Not-Present:
*/
wbinvd();
-
+
/*
* Now all caches are flushed and we can safely enable
* GART hardware. Doing it early leaves the possibility
@@ -837,6 +847,10 @@ void __init gart_iommu_init(void)
flush_gart();
dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
}
void __init gart_parse_options(char *p)
@@ -855,7 +869,7 @@ void __init gart_parse_options(char *p)
#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
- if (!strncmp(p, "fullflush", 8))
+ if (!strncmp(p, "fullflush", 9))
iommu_fullflush = 1;
if (!strncmp(p, "nofullflush", 11))
iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
if (!check_addr("map_single", dev, bus, size))
- return bad_dma_address;
+ return DMA_ERROR_CODE;
flush_write_buffers();
return bus;
}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
};
-
-void __init no_iommu_init(void)
-{
- if (dma_ops)
- return;
-
- force_iommu = 0; /* no HW IOMMU */
- dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
.dma_supported = NULL,
};
-void __init pci_swiotlb_init(void)
+/*
+ * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_detect(void)
{
+ int use_swiotlb = swiotlb | swiotlb_force;
+
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
if (swiotlb_force)
swiotlb = 1;
+
+ return use_swiotlb;
+}
+
+void __init pci_swiotlb_init(void)
+{
if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
}
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..ad9540676fcc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
+#include <linux/user-return-notifier.h>
+#include <linux/dmi.h>
+#include <linux/utsname.h>
#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -87,30 +92,37 @@ void exit_thread(void)
}
}
-void flush_thread(void)
+void show_regs(struct pt_regs *regs)
{
- struct task_struct *tsk = current;
+ show_registers(regs);
+ show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
+ regs->bp);
+}
-#ifdef CONFIG_X86_64
- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
- if (test_tsk_thread_flag(tsk, TIF_IA32)) {
- clear_tsk_thread_flag(tsk, TIF_IA32);
- } else {
- set_tsk_thread_flag(tsk, TIF_IA32);
- current_thread_info()->status |= TS_COMPAT;
- }
- }
-#endif
+void show_regs_common(void)
+{
+ const char *board, *product;
+
+ board = dmi_get_system_info(DMI_BOARD_NAME);
+ if (!board)
+ board = "";
+ product = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!product)
+ product = "";
- clear_tsk_thread_flag(tsk, TIF_DEBUG);
+ printk(KERN_CONT "\n");
+ printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version, board, product);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
- tsk->thread.debugreg0 = 0;
- tsk->thread.debugreg1 = 0;
- tsk->thread.debugreg2 = 0;
- tsk->thread.debugreg3 = 0;
- tsk->thread.debugreg6 = 0;
- tsk->thread.debugreg7 = 0;
+ flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
/*
* Forget coprocessor state..
@@ -192,16 +204,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
- set_debugreg(next->debugreg0, 0);
- set_debugreg(next->debugreg1, 1);
- set_debugreg(next->debugreg2, 2);
- set_debugreg(next->debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg6, 6);
- set_debugreg(next->debugreg7, 7);
- }
-
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -224,6 +226,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+ propagate_user_return_notify(prev_p, next_p);
}
int sys_fork(struct pt_regs *regs)
@@ -247,6 +250,78 @@ int sys_vfork(struct pt_regs *regs)
NULL, NULL);
}
+long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+ if (!newsp)
+ newsp = regs->sp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This gets run with %si containing the
+ * function to call, and %di containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+
+ regs.si = (unsigned long) fn;
+ regs.di = (unsigned long) arg;
+
+#ifdef CONFIG_X86_32
+ regs.ds = __USER_DS;
+ regs.es = __USER_DS;
+ regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
+#else
+ regs.ss = __KERNEL_DS;
+#endif
+
+ regs.orig_ax = -1;
+ regs.ip = (unsigned long) kernel_thread_helper;
+ regs.cs = __KERNEL_CS | get_kernel_rpl();
+ regs.flags = X86_EFLAGS_IF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
+
+/*
+ * sys_execve() executes a new program.
+ */
+long sys_execve(char __user *name, char __user * __user *argv,
+ char __user * __user *envp, struct pt_regs *regs)
+{
+ long error;
+ char *filename;
+
+ filename = getname(name);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ return error;
+ error = do_execve(filename, argv, envp, regs);
+
+#ifdef CONFIG_X86_32
+ if (error == 0) {
+ /* Make sure we don't return using sysenter.. */
+ set_thread_flag(TIF_IRET);
+ }
+#endif
+
+ putname(filename);
+ return error;
+}
/*
* Idle related variables and functions
@@ -532,7 +607,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/interrupt.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
@@ -35,7 +34,6 @@
#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
-#include <linux/dmi.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/io.h>
@@ -58,6 +56,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
- const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
gs = get_user_gs(regs);
} else {
- sp = (unsigned long) (&regs->sp);
+ sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
savesegment(gs, gs);
}
- printk("\n");
+ show_regs_common();
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!board)
- board = "";
- printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
- task_pid_nr(current), current->comm,
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
-
- printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+ printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->ip);
- printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
- printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+ printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
if (!all)
@@ -169,61 +158,22 @@ void __show_regs(struct pt_regs *regs, int all)
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4_safe();
- printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
get_debugreg(d3, 3);
- printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
d0, d1, d2, d3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR6: %08lx DR7: %08lx\n",
+ printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
d6, d7);
}
-void show_regs(struct pt_regs *regs)
-{
- __show_regs(regs, 1);
- show_trace(NULL, regs, &regs->sp, regs->bp);
-}
-
-/*
- * This gets run with %bx containing the
- * function to call, and %dx containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
- struct pt_regs regs;
-
- memset(&regs, 0, sizeof(regs));
-
- regs.bx = (unsigned long) fn;
- regs.dx = (unsigned long) arg;
-
- regs.ds = __USER_DS;
- regs.es = __USER_DS;
- regs.fs = __KERNEL_PERCPU;
- regs.gs = __KERNEL_STACK_CANARY;
- regs.orig_ax = -1;
- regs.ip = (unsigned long) kernel_thread_helper;
- regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
-
- /* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
void release_thread(struct task_struct *dead_task)
{
BUG_ON(dead_task->mm);
@@ -259,7 +209,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
task_user_gs(p) = get_user_gs(regs);
+ p->thread.io_bitmap_ptr = NULL;
tsk = current;
+ err = -ENOMEM;
+
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +385,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
return prev_p;
}
-int sys_clone(struct pt_regs *regs)
-{
- unsigned long clone_flags;
- unsigned long newsp;
- int __user *parent_tidptr, *child_tidptr;
-
- clone_flags = regs->bx;
- newsp = regs->cx;
- parent_tidptr = (int __user *)regs->dx;
- child_tidptr = (int __user *)regs->di;
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
-}
-
-/*
- * sys_execve() executes a new program.
- */
-int sys_execve(struct pt_regs *regs)
-{
- int error;
- char *filename;
-
- filename = getname((char __user *) regs->bx);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- goto out;
- error = do_execve(filename,
- (char __user * __user *) regs->cx,
- (char __user * __user *) regs->dx,
- regs);
- if (error == 0) {
- /* Make sure we don't return using sysenter.. */
- set_thread_flag(TIF_IRET);
- }
- putname(filename);
-out:
- return error;
-}
-
#define top_esp (THREAD_SIZE - sizeof(unsigned long))
#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..dc9690b4c4cc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
-#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
-#include <linux/dmi.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -52,14 +50,13 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
+#include <asm/debugreg.h>
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
-unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- const char *board;
-
- printk("\n");
- print_modules();
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!board)
- board = "";
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
- printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+
+ show_regs_common();
+ printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
regs->sp, regs->flags);
- printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
regs->r10, regs->r11, regs->r12);
- printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
asm("movl %%ds,%0" : "=r" (ds));
@@ -207,28 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
cr3 = read_cr3();
cr4 = read_cr4();
- printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
es, cr0);
- printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
-}
-
-void show_regs(struct pt_regs *regs)
-{
- printk(KERN_INFO "CPU %d:", smp_processor_id());
- __show_regs(regs, 1);
- show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void release_thread(struct task_struct *dead_task)
@@ -285,8 +265,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
*childregs = *regs;
childregs->ax = 0;
- childregs->sp = sp;
- if (sp == ~0UL)
+ if (user_mode(regs))
+ childregs->sp = sp;
+ else
childregs->sp = (unsigned long)childregs;
p->thread.sp = (unsigned long) childregs;
@@ -297,12 +278,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
+ p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
savesegment(fs, p->thread.fsindex);
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ err = -ENOMEM;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +326,46 @@ out:
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+
return err;
}
-void
-start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp,
+ unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
loadsegment(fs, 0);
- loadsegment(es, 0);
- loadsegment(ds, 0);
+ loadsegment(es, _ds);
+ loadsegment(ds, _ds);
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
percpu_write(old_rsp, new_sp);
- regs->cs = __USER_CS;
- regs->ss = __USER_DS;
- regs->flags = 0x200;
+ regs->cs = _cs;
+ regs->ss = _ss;
+ regs->flags = X86_EFLAGS_IF;
set_fs(USER_DS);
/*
* Free the old FP and other extended state
*/
free_thread_xstate(current);
}
-EXPORT_SYMBOL_GPL(start_thread);
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ __USER_CS, __USER_DS, 0);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ __USER32_CS, __USER32_DS, __USER32_DS);
+}
+#endif
/*
* switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +497,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (preload_fpu)
__math_state_restore();
- return prev_p;
-}
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage
-long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs *regs)
-{
- long error;
- char *filename;
-
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = do_execve(filename, argv, envp, regs);
- putname(filename);
- return error;
+ return prev_p;
}
void set_personality_64bit(void)
@@ -531,13 +515,16 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-asmlinkage long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
- void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+void set_personality_ia32(void)
{
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+ /* inherit personality from parent */
+
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_IA32);
+ current->personality |= force_personality32;
+
+ /* Prepare the first "return" to user space */
+ current_thread_info()->status |= TS_COMPAT;
}
unsigned long get_wchan(struct task_struct *p)
@@ -664,3 +651,8 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
+unsigned long KSTK_ESP(struct task_struct *task)
+{
+ return (test_tsk_thread_flag(task, TIF_IA32)) ?
+ (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..a503b1fd04e5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
#include "tls.h"
@@ -45,10 +48,99 @@ enum x86_regset {
REGSET_FP,
REGSET_XFP,
REGSET_IOPERM64 = REGSET_XFP,
+ REGSET_XSTATE,
REGSET_TLS,
REGSET_IOPERM32,
};
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+ [0] = offsetof(struct pt_regs, ax),
+ [1] = offsetof(struct pt_regs, dx),
+ [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+ [0] = offsetof(struct pt_regs, di),
+ [1] = offsetof(struct pt_regs, si),
+ [2] = offsetof(struct pt_regs, dx),
+ [3] = offsetof(struct pt_regs, cx),
+ [4] = offsetof(struct pt_regs, r8),
+ [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
@@ -137,11 +229,6 @@ static int set_segment_reg(struct task_struct *task,
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
- return TASK_SIZE - 3;
-}
-
#else /* CONFIG_X86_64 */
#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +353,6 @@ static int set_segment_reg(struct task_struct *task,
return 0;
}
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- return IA32_PAGE_OFFSET - 3;
-#endif
- return TASK_SIZE_MAX - 7;
-}
-
#endif /* CONFIG_X86_32 */
static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +486,14 @@ static int genregs_get(struct task_struct *target,
{
if (kbuf) {
unsigned long *k = kbuf;
- while (count > 0) {
+ while (count >= sizeof(*k)) {
*k++ = getreg(target, pos);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
unsigned long __user *u = ubuf;
- while (count > 0) {
+ while (count >= sizeof(*u)) {
if (__put_user(getreg(target, pos), u++))
return -EFAULT;
count -= sizeof(*u);
@@ -434,14 +512,14 @@ static int genregs_set(struct task_struct *target,
int ret = 0;
if (kbuf) {
const unsigned long *k = kbuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*k) && !ret) {
ret = putreg(target, pos, *k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
const unsigned long __user *u = ubuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*u) && !ret) {
unsigned long word;
ret = __get_user(word, u++);
if (ret)
@@ -454,99 +532,240 @@ static int genregs_set(struct task_struct *target,
return ret;
}
+static void ptrace_triggered(struct perf_event *bp, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int i;
+ struct thread_struct *thread = &(current->thread);
+
+ /*
+ * Store in the virtual DR6 register the fact that the breakpoint
+ * was hit so the thread's debugger will see it.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ if (thread->ptrace_bps[i] == bp)
+ break;
+ }
+
+ thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
/*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
*/
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
{
- switch (n) {
- case 0: return child->thread.debugreg0;
- case 1: return child->thread.debugreg1;
- case 2: return child->thread.debugreg2;
- case 3: return child->thread.debugreg3;
- case 6: return child->thread.debugreg6;
- case 7: return child->thread.debugreg7;
+ int i;
+ int dr7 = 0;
+ struct arch_hw_breakpoint *info;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ if (bp[i] && !bp[i]->attr.disabled) {
+ info = counter_arch_bp(bp[i]);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ }
}
- return 0;
+
+ return dr7;
}
-static int ptrace_set_debugreg(struct task_struct *child,
- int n, unsigned long data)
+static int
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ struct task_struct *tsk, int disabled)
{
- int i;
+ int err;
+ int gen_len, gen_type;
+ struct perf_event_attr attr;
- if (unlikely(n == 4 || n == 5))
- return -EIO;
+ /*
+ * We should have at least an inactive breakpoint at this
+ * slot. It means the user is writing dr7 without having
+ * written the address register first
+ */
+ if (!bp)
+ return -EINVAL;
- if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
- return -EIO;
+ err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+ if (err)
+ return err;
- switch (n) {
- case 0: child->thread.debugreg0 = data; break;
- case 1: child->thread.debugreg1 = data; break;
- case 2: child->thread.debugreg2 = data; break;
- case 3: child->thread.debugreg3 = data; break;
+ attr = bp->attr;
+ attr.bp_len = gen_len;
+ attr.bp_type = gen_type;
+ attr.disabled = disabled;
- case 6:
- if ((data & ~0xffffffffUL) != 0)
- return -EIO;
- child->thread.debugreg6 = data;
- break;
+ return modify_user_hw_breakpoint(bp, &attr);
+}
- case 7:
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ unsigned long old_dr7;
+ int i, orig_ret = 0, rc = 0;
+ int enabled, second_pass = 0;
+ unsigned len, type;
+ struct perf_event *bp;
+
+ data &= ~DR_CONTROL_RESERVED;
+ old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+ /*
+ * Loop through all the hardware breakpoints, making the
+ * appropriate changes to each.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ enabled = decode_dr7(data, i, &len, &type);
+ bp = thread->ptrace_bps[i];
+
+ if (!enabled) {
+ if (bp) {
+ /*
+ * Don't unregister the breakpoints right-away,
+ * unless all register_user_hw_breakpoint()
+ * requests have succeeded. This prevents
+ * any window of opportunity for debug
+ * register grabbing by other users.
+ */
+ if (!second_pass)
+ continue;
+
+ rc = ptrace_modify_breakpoint(bp, len, type,
+ tsk, 1);
+ if (rc)
+ break;
+ }
+ continue;
+ }
+
+ rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+ if (rc)
+ break;
+ }
+ /*
+ * Make a second pass to free the remaining unused breakpoints
+ * or to restore the original breakpoints if an error occurred.
+ */
+ if (!second_pass) {
+ second_pass = 1;
+ if (rc < 0) {
+ orig_ret = rc;
+ data = old_dr7;
+ }
+ goto restore;
+ }
+ return ((orig_ret < 0) ? orig_ret : rc);
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ unsigned long val = 0;
+
+ if (n < HBP_NUM) {
+ struct perf_event *bp;
+ bp = thread->ptrace_bps[n];
+ if (!bp)
+ return 0;
+ val = bp->hw.info.address;
+ } else if (n == 6) {
+ val = thread->debugreg6;
+ } else if (n == 7) {
+ val = thread->ptrace_dr7;
+ }
+ return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+ unsigned long addr)
+{
+ struct perf_event *bp;
+ struct thread_struct *t = &tsk->thread;
+ struct perf_event_attr attr;
+
+ if (!t->ptrace_bps[nr]) {
+ hw_breakpoint_init(&attr);
/*
- * Sanity-check data. Take one half-byte at once with
- * check = (val >> (16 + 4*i)) & 0xf. It contains the
- * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
- * 2 and 3 are LENi. Given a list of invalid values,
- * we do mask |= 1 << invalid_value, so that
- * (mask >> check) & 1 is a correct test for invalid
- * values.
- *
- * R/Wi contains the type of the breakpoint /
- * watchpoint, LENi contains the length of the watched
- * data in the watchpoint case.
- *
- * The invalid values are:
- * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
- * - R/Wi == 0x10 (break on I/O reads or writes), so
- * mask |= 0x4444.
- * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
- * 0x1110.
- *
- * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
- *
- * See the Intel Manual "System Programming Guide",
- * 15.2.4
- *
- * Note that LENi == 0x10 is defined on x86_64 in long
- * mode (i.e. even for 32-bit userspace software, but
- * 64-bit kernel), so the x86_64 mask value is 0x5454.
- * See the AMD manual no. 24593 (AMD64 System Programming)
+ * Put stub len and type to register (reserve) an inactive but
+ * correct bp
*/
-#ifdef CONFIG_X86_32
-#define DR7_MASK 0x5f54
-#else
-#define DR7_MASK 0x5554
-#endif
- data &= ~DR_CONTROL_RESERVED;
- for (i = 0; i < 4; i++)
- if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
- return -EIO;
- child->thread.debugreg7 = data;
- if (data)
- set_tsk_thread_flag(child, TIF_DEBUG);
- else
- clear_tsk_thread_flag(child, TIF_DEBUG);
- break;
+ attr.bp_addr = addr;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+
+ /*
+ * CHECKME: the previous code returned -EIO if the addr wasn't
+ * a valid task virtual addr. The new one will return -EINVAL in
+ * this case.
+ * -EINVAL may be what we want for in-kernel breakpoints users,
+ * but -EIO looks better for ptrace, since we refuse a register
+ * writing for the user. And anyway this is the previous
+ * behaviour.
+ */
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ t->ptrace_bps[nr] = bp;
+ } else {
+ int err;
+
+ bp = t->ptrace_bps[nr];
+
+ attr = bp->attr;
+ attr.bp_addr = addr;
+ err = modify_user_hw_breakpoint(bp, &attr);
+ if (err)
+ return err;
}
+
return 0;
}
/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+ struct thread_struct *thread = &(tsk->thread);
+ int rc = 0;
+
+ /* There are no DR4 or DR5 registers */
+ if (n == 4 || n == 5)
+ return -EIO;
+
+ if (n == 6) {
+ thread->debugreg6 = val;
+ goto ret_path;
+ }
+ if (n < HBP_NUM) {
+ rc = ptrace_set_breakpoint_addr(tsk, n, val);
+ if (rc)
+ return rc;
+ }
+ /* All that's left is DR7 */
+ if (n == 7) {
+ rc = ptrace_write_dr7(tsk, val);
+ if (!rc)
+ thread->ptrace_dr7 = val;
+ }
+
+ret_path:
+ return rc;
+}
+
+/*
* These access the current or another (stopped) task's io permission
* bitmap for debugging or core dump.
*/
@@ -1219,14 +1438,14 @@ static int genregs32_get(struct task_struct *target,
{
if (kbuf) {
compat_ulong_t *k = kbuf;
- while (count > 0) {
+ while (count >= sizeof(*k)) {
getreg32(target, pos, k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
compat_ulong_t __user *u = ubuf;
- while (count > 0) {
+ while (count >= sizeof(*u)) {
compat_ulong_t word;
getreg32(target, pos, &word);
if (__put_user(word, u++))
@@ -1247,14 +1466,14 @@ static int genregs32_set(struct task_struct *target,
int ret = 0;
if (kbuf) {
const compat_ulong_t *k = kbuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*k) && !ret) {
ret = putreg32(target, pos, *k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
} else {
const compat_ulong_t __user *u = ubuf;
- while (count > 0 && !ret) {
+ while (count >= sizeof(*u) && !ret) {
compat_ulong_t word;
ret = __get_user(word, u++);
if (ret)
@@ -1345,7 +1564,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
-static const struct user_regset x86_64_regsets[] = {
+static struct user_regset x86_64_regsets[] __read_mostly = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1358,6 +1577,12 @@ static const struct user_regset x86_64_regsets[] = {
.size = sizeof(long), .align = sizeof(long),
.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
},
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
[REGSET_IOPERM64] = {
.core_note_type = NT_386_IOPERM,
.n = IO_BITMAP_LONGS,
@@ -1383,7 +1608,7 @@ static const struct user_regset_view user_x86_64_view = {
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static const struct user_regset x86_32_regsets[] = {
+static struct user_regset x86_32_regsets[] __read_mostly = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1402,6 +1627,12 @@ static const struct user_regset x86_32_regsets[] = {
.size = sizeof(u32), .align = sizeof(u32),
.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
},
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
[REGSET_TLS] = {
.core_note_type = NT_386_TLS,
.n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1424,6 +1655,23 @@ static const struct user_regset_view user_x86_32_view = {
};
#endif
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+ x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+ xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
+}
+
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
@@ -1437,21 +1685,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
- int error_code, int si_code)
+static void fill_sigtrap_info(struct task_struct *tsk,
+ struct pt_regs *regs,
+ int error_code, int si_code,
+ struct siginfo *info)
{
- struct siginfo info;
-
tsk->thread.trap_no = 1;
tsk->thread.error_code = error_code;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGTRAP;
- info.si_code = si_code;
+ memset(info, 0, sizeof(*info));
+ info->si_signo = SIGTRAP;
+ info->si_code = si_code;
+ info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+}
- /* User-mode ip? */
- info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
+void user_single_step_siginfo(struct task_struct *tsk,
+ struct pt_regs *regs,
+ struct siginfo *info)
+{
+ fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
+}
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code)
+{
+ struct siginfo info;
+
+ fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
/* Send us the fake SIGTRAP */
force_sig_info(SIGTRAP, &info, tsk);
}
@@ -1516,29 +1776,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
asmregparm void syscall_trace_leave(struct pt_regs *regs)
{
+ bool step;
+
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_exit(regs, regs->ax);
- if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
-
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
- * syscall_trace_enter(), so don't do any more now.
- */
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
- return;
-
- /*
- * If we are single-stepping, synthesize a trap to follow the
- * system call instruction.
+ * syscall_trace_enter().
*/
- if (test_thread_flag(TIF_SINGLESTEP) &&
- tracehook_consider_fatal_signal(current, SIGTRAP))
- send_sigtrap(current, regs, 0, TRAP_BRKPT);
+ step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+ !test_thread_flag(TIF_SYSCALL_EMU);
+ if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+ tracehook_report_syscall_exit(regs, step);
}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
break;
}
}
+
+/*
+ * HPET MSI on some boards (ATI SB700/SB800) has side effect on
+ * floppy DMA. Disable HPET MSI on such platforms.
+ */
+static void force_disable_hpet_msi(struct pci_dev *unused)
+{
+ hpet_msi_disable = 1;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ force_disable_hpet_msi);
+
#endif
#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
@@ -499,6 +512,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
{
struct pci_dev *nb_ht;
unsigned int devfn;
+ u32 node;
u32 val;
devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +521,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- set_dev_node(&dev->dev, val & 7);
+ node = val & 7;
+ /*
+ * Some hardware may return an invalid node ID,
+ * so check it first:
+ */
+ if (node_online(node))
+ set_dev_node(&dev->dev, node);
pci_dev_put(nb_ht);
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 27349f92a6d7..8e1aac86b50c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
#include <linux/pm.h>
#include <linux/efi.h>
#include <linux/dmi.h>
+#include <linux/sched.h>
#include <linux/tboot.h>
#include <acpi/reboot.h>
#include <asm/io.h>
@@ -22,7 +23,7 @@
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
#else
-# include <asm/iommu.h>
+# include <asm/x86_init.h>
#endif
/*
@@ -202,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 760",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -258,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
},
},
+ { /* Handle problems with rebooting on ASUS P4S800 */
+ .callback = set_bios_reboot,
+ .ident = "ASUS P4S800",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ },
+ },
{ }
};
@@ -435,6 +453,22 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
},
},
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
{ }
};
@@ -613,7 +647,7 @@ void native_machine_shutdown(void)
#endif
#ifdef CONFIG_X86_64
- pci_iommu_shutdown();
+ x86_platform.iommu_shutdown();
#endif
}
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
#include <linux/interrupt.h>
#include <asm/reboot_fixups.h>
#include <asm/msr.h>
-#include <asm/geode.h>
+#include <linux/cs5535.h>
static void cs5530a_warm_reset(struct pci_dev *dev)
{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
continue;
cur->reboot_fixup(dev);
+ pci_dev_put(dev);
}
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2c14b5..5d7ba1a449bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
#include <asm/mtrr.h>
#include <asm/apic.h>
+#include <asm/trampoline.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
@@ -106,9 +107,11 @@
#include <asm/percpu.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
+#include <asm/k8.h>
#ifdef CONFIG_X86_64
#include <asm/numa_64.h>
#endif
+#include <asm/mce.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -118,7 +121,9 @@
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
+#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
+#endif
unsigned int boot_cpu_id __read_mostly;
@@ -247,7 +252,7 @@ EXPORT_SYMBOL(edd);
* from boot_params into a safe place.
*
*/
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
sizeof(edd.mbr_signature));
@@ -256,7 +261,7 @@ static inline void copy_edd(void)
edd.edd_info_nr = boot_params.eddbuf_entries;
}
#else
-static inline void copy_edd(void)
+static inline void __init copy_edd(void)
{
}
#endif
@@ -486,42 +491,11 @@ static void __init reserve_early_setup_data(void)
#ifdef CONFIG_KEXEC
-/**
- * Reserve @size bytes of crashkernel memory at any suitable offset.
- *
- * @size: Size of the crashkernel memory to reserve.
- * Returns the base address on success, and -1ULL on failure.
- */
-static
-unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
-{
- const unsigned long long alignment = 16<<20; /* 16M */
- unsigned long long start = 0LL;
-
- while (1) {
- int ret;
-
- start = find_e820_area(start, ULONG_MAX, size, alignment);
- if (start == -1ULL)
- return start;
-
- /* try to reserve it */
- ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
- if (ret >= 0)
- return start;
-
- start += alignment;
- }
-}
-
static inline unsigned long long get_total_mem(void)
{
unsigned long long total;
- total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
- total += highend_pfn - highstart_pfn;
-#endif
+ total = max_pfn - min_low_pfn;
return total << PAGE_SHIFT;
}
@@ -541,21 +515,25 @@ static void __init reserve_crashkernel(void)
/* 0 means: find the address automatically */
if (crash_base <= 0) {
- crash_base = find_and_reserve_crashkernel(crash_size);
+ const unsigned long long alignment = 16<<20; /* 16M */
+
+ crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
+ alignment);
if (crash_base == -1ULL) {
- pr_info("crashkernel reservation failed. "
- "No suitable area found.\n");
+ pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
} else {
- ret = reserve_bootmem_generic(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE);
- if (ret < 0) {
- pr_info("crashkernel reservation failed - "
- "memory is in use\n");
+ unsigned long long start;
+
+ start = find_e820_area(crash_base, ULONG_MAX, crash_size,
+ 1<<20);
+ if (start != crash_base) {
+ pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
+ reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -660,22 +638,54 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
},
},
{
+ .callback = dmi_low_memory_corruption,
+ .ident = "Phoenix/MSC BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
+ },
+ },
/*
- * AMI BIOS with low memory corruption was found on Intel DG45ID board.
- * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
+ * AMI BIOS with low memory corruption was found on Intel DG45ID and
+ * DG45FC boards.
+ * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
* match only DMI_BOARD_NAME and see if there is more bad products
* with this vendor.
*/
+ {
.callback = dmi_low_memory_corruption,
.ident = "AMI BIOS",
.matches = {
DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
},
},
+ {
+ .callback = dmi_low_memory_corruption,
+ .ident = "AMI BIOS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
+ },
+ },
#endif
{}
};
+static void __init trim_bios_range(void)
+{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ */
+ e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ /*
+ * special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ * take them out.
+ */
+ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -691,6 +701,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
void __init setup_arch(char **cmdline_p)
{
+ int acpi = 0;
+ int k8 = 0;
+
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
@@ -783,21 +796,18 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
-#ifdef CONFIG_X86_64
/*
- * Must call this twice: Once just to detect whether hardware doesn't
- * support NX (so that the early EHCI debug console setup can safely
- * call set_fixmap(), and then again after parsing early parameters to
- * honor the respective command line option.
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
*/
- check_efer();
-#endif
+ x86_configure_nx();
parse_early_param();
-#ifdef CONFIG_X86_64
- check_efer();
-#endif
+ x86_report_nx();
/* Must be before kernel pagetables are setup */
vmi_activate();
@@ -839,7 +849,7 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
-
+ trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -893,6 +903,20 @@ void __init setup_arch(char **cmdline_p)
reserve_brk();
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+
+ reserve_trampoline_memory();
+
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ * even before init_memory_mapping
+ */
+ acpi_reserve_wakeup_memory();
+#endif
init_gbpages();
/* max_pfn_mapped is updated here */
@@ -919,6 +943,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
+ reserve_crashkernel();
+
vsmp_init();
io_delay_init();
@@ -934,32 +960,20 @@ void __init setup_arch(char **cmdline_p)
/*
* Parse SRAT to discover nodes.
*/
- acpi_numa_init();
+ acpi = acpi_numa_init();
#endif
- initmem_init(0, max_pfn);
-
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
+#ifdef CONFIG_K8_NUMA
+ if (!acpi)
+ k8 = !k8_numa_init(0, max_pfn);
#endif
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
- reserve_crashkernel();
+ initmem_init(0, max_pfn, acpi, k8);
+#ifndef CONFIG_NO_BOOTMEM
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+#endif
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
dma32_reserve_bootmem();
-#endif
reserve_ibft_region();
@@ -1024,6 +1038,8 @@ void __init setup_arch(char **cmdline_p)
#endif
#endif
x86_init.oem.banner();
+
+ mcheck_init();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -20,9 +22,9 @@
#include <asm/stackprotector.h>
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(x...) printk(KERN_DEBUG x)
+# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
#else
-# define DBG(x...)
+# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
#endif
DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
} else {
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
size, align, goal);
- pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
- "%016lx\n", cpu, size, node, __pa(ptr));
+ pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
+ cpu, size, node, __pa(ptr));
}
return ptr;
#else
@@ -135,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ u64 start = __pa(ptr);
+ u64 end = start + size;
+ free_early_partial(start, end);
+#else
free_bootmem(__pa(ptr), size);
+#endif
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -198,8 +206,7 @@ void __init setup_per_cpu_areas(void)
pcpu_cpu_distance,
pcpu_fc_alloc, pcpu_fc_free);
if (rc < 0)
- pr_warning("PERCPU: %s allocator failed (%d), "
- "falling back to page size\n",
+ pr_warning("%s allocator failed (%d), falling back to page size\n",
pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
}
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_X86_32
-int sys_sigaltstack(struct pt_regs *regs)
-{
- const stack_t __user *uss = (const stack_t __user *)regs->bx;
- stack_t __user *uoss = (stack_t __user *)regs->cx;
-
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-#else /* !CONFIG_X86_32 */
-asmlinkage long
+long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
{
return do_sigaltstack(uss, uoss, regs->sp);
}
-#endif /* CONFIG_X86_32 */
/*
* Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
- /*
- * Re-enable any watchpoints before delivering the
- * signal to user space. The processor register will
- * have been cleared if the watchpoint triggered
- * inside the kernel.
- */
- if (current->thread.debugreg7)
- set_debugreg(current->thread.debugreg7, 7);
-
/* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (current->replacement_session_keyring)
key_replace_session_keyring();
}
+ if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..a02e80c3c54b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,7 @@
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
+#include <linux/stackprotector.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -67,6 +68,7 @@
#include <linux/mc146818rtc.h>
#include <asm/smpboot_hooks.h>
+#include <asm/i8259.h>
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
@@ -241,6 +243,11 @@ static void __cpuinit smp_callin(void)
map_cpu_to_logical_apicid();
notify_cpu_starting(cpuid);
+
+ /*
+ * Need to setup vector mappings before we enable interrupts.
+ */
+ __setup_vector_irq(smp_processor_id());
/*
* Get our bogomips.
*
@@ -286,9 +293,9 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
+ legacy_pic->chip->mask(0);
enable_NMI_through_LVT0();
- enable_8259A_irq(0);
+ legacy_pic->chip->unmask(0);
}
#ifdef CONFIG_X86_32
@@ -315,15 +322,18 @@ notrace static void __cpuinit start_secondary(void *unused)
*/
ipi_call_lock();
lock_vector_lock();
- __setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ x86_platform.nmi_init();
/* enable local interrupts */
local_irq_enable();
+ /* to prevent fake stack check failure in clock setup */
+ boot_init_stack_canary();
+
x86_cpuinit.setup_percpu_clockev();
wmb();
@@ -671,6 +681,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
complete(&c_idle->done);
}
+/* reduce the number of lines printed when booting a large cpu count system */
+static void __cpuinit announce_cpu(int cpu, int apicid)
+{
+ static int current_node = -1;
+ int node = cpu_to_node(cpu);
+
+ if (system_state == SYSTEM_BOOTING) {
+ if (node != current_node) {
+ if (current_node > (-1))
+ pr_cont(" Ok.\n");
+ current_node = node;
+ pr_info("Booting Node %3d, Processors ", node);
+ }
+ pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
+ return;
+ } else
+ pr_info("Booting Node %d Processor %d APIC 0x%x\n",
+ node, cpu, apicid);
+}
+
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +717,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
- INIT_WORK(&c_idle.work, do_fork_idle);
+ INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
@@ -713,6 +743,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
+ destroy_work_on_stack(&c_idle.work);
return PTR_ERR(c_idle.idle);
}
@@ -736,9 +767,8 @@ do_rest:
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
- /* So we see what's up */
- printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
- cpu, apicid, start_ip);
+ /* So we see what's up */
+ announce_cpu(cpu, apicid);
/*
* This grunge runs the startup process for
@@ -787,21 +817,17 @@ do_rest:
udelay(100);
}
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- /* number CPUs logically, starting from 1 (BSP is 0) */
- pr_debug("OK.\n");
- printk(KERN_INFO "CPU%d: ", cpu);
- print_cpu_info(&cpu_data(cpu));
- pr_debug("CPU has booted.\n");
- } else {
+ if (cpumask_test_cpu(cpu, cpu_callin_mask))
+ pr_debug("CPU%d: has booted.\n", cpu);
+ else {
boot_error = 1;
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
- printk(KERN_ERR "Stuck ??\n");
+ pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
- printk(KERN_ERR "Not responding.\n");
+ pr_err("CPU%d: Not responding.\n", cpu);
if (apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
@@ -831,6 +857,7 @@ do_rest:
smpboot_restore_warm_reset_vector();
}
+ destroy_work_on_stack(&c_idle.work);
return boot_error;
}
@@ -1066,9 +1093,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_cpu_sibling_map(0);
enable_IR_x2apic();
-#ifdef CONFIG_X86_64
default_setup_apic_routing();
-#endif
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1196,11 +1221,12 @@ __init void prefill_possible_map(void)
total_cpus = max_t(int, possible, num_processors + disabled_cpus);
- if (possible > CONFIG_NR_CPUS) {
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
printk(KERN_WARNING
"%d Processors exceeds NR_CPUS limit of %d\n",
- possible, CONFIG_NR_CPUS);
- possible = CONFIG_NR_CPUS;
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
}
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
@@ -1250,16 +1276,7 @@ static void __ref remove_cpu_from_maps(int cpu)
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- /*
- * HACK:
- * Allow any queued timer interrupts to get serviced
- * This is only a temporary solution until we cleanup
- * fixup_irqs as we do for IA64.
- */
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1300,14 +1317,16 @@ void native_cpu_die(unsigned int cpu)
for (i = 0; i < 10; i++) {
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- printk(KERN_INFO "CPU %d is now offline\n", cpu);
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+
if (1 == num_online_cpus())
alternatives_smp_switch(0);
return;
}
msleep(100);
}
- printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+ pr_err("CPU %u didn't die...\n", cpu);
}
void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops save_stack_ops = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address,
+ .warning = save_stack_warning,
+ .warning_symbol = save_stack_warning_symbol,
+ .stack = save_stack_stack,
+ .address = save_stack_address,
+ .walk_stack = print_context_stack,
};
static const struct stacktrace_ops save_stack_ops_nosched = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
+ .warning = save_stack_warning,
+ .warning_symbol = save_stack_warning_symbol,
+ .stack = save_stack_stack,
+ .address = save_stack_address_nosched,
+ .walk_stack = print_context_stack,
};
/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,216 +24,6 @@
#include <asm/syscalls.h>
-asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
-{
- int error = -EBADF;
- struct file *file = NULL;
- struct mm_struct *mm = current->mm;
-
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- goto out;
- }
-
- down_write(&mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&mm->mmap_sem);
-
- if (file)
- fput(file);
-out:
- return error;
-}
-
-/*
- * Perform the select(nd, in, out, ex, tv) and mmap() system
- * calls. Linux/i386 didn't use to be able to handle more than
- * 4 system call parameters, so these system calls used a memory
- * block for parameter passing..
- */
-
-struct mmap_arg_struct {
- unsigned long addr;
- unsigned long len;
- unsigned long prot;
- unsigned long flags;
- unsigned long fd;
- unsigned long offset;
-};
-
-asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
-{
- struct mmap_arg_struct a;
- int err = -EFAULT;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- goto out;
-
- err = -EINVAL;
- if (a.offset & ~PAGE_MASK)
- goto out;
-
- err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
- a.fd, a.offset >> PAGE_SHIFT);
-out:
- return err;
-}
-
-
-struct sel_arg_struct {
- unsigned long n;
- fd_set __user *inp, *outp, *exp;
- struct timeval __user *tvp;
-};
-
-asmlinkage int old_select(struct sel_arg_struct __user *arg)
-{
- struct sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- /* sys_select() does the appropriate kernel locking */
- return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
-}
-
-/*
- * sys_ipc() is the de-multiplexer for the SysV IPC calls..
- *
- * This is really horribly ugly.
- */
-asmlinkage int sys_ipc(uint call, int first, int second,
- int third, void __user *ptr, long fifth)
-{
- int version, ret;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
- case SEMOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
- case SEMTIMEDOP:
- return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
- (const struct timespec __user *)fifth);
-
- case SEMGET:
- return sys_semget(first, second, third);
- case SEMCTL: {
- union semun fourth;
- if (!ptr)
- return -EINVAL;
- if (get_user(fourth.__pad, (void __user * __user *) ptr))
- return -EFAULT;
- return sys_semctl(first, second, third, fourth);
- }
-
- case MSGSND:
- return sys_msgsnd(first, (struct msgbuf __user *) ptr,
- second, third);
- case MSGRCV:
- switch (version) {
- case 0: {
- struct ipc_kludge tmp;
- if (!ptr)
- return -EINVAL;
-
- if (copy_from_user(&tmp,
- (struct ipc_kludge __user *) ptr,
- sizeof(tmp)))
- return -EFAULT;
- return sys_msgrcv(first, tmp.msgp, second,
- tmp.msgtyp, third);
- }
- default:
- return sys_msgrcv(first,
- (struct msgbuf __user *) ptr,
- second, fifth, third);
- }
- case MSGGET:
- return sys_msgget((key_t) first, second);
- case MSGCTL:
- return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
-
- case SHMAT:
- switch (version) {
- default: {
- ulong raddr;
- ret = do_shmat(first, (char __user *) ptr, second, &raddr);
- if (ret)
- return ret;
- return put_user(raddr, (ulong __user *) third);
- }
- case 1: /* iBCS2 emulator entry point */
- if (!segment_eq(get_fs(), get_ds()))
- return -EINVAL;
- /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
- return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
- }
- case SHMDT:
- return sys_shmdt((char __user *)ptr);
- case SHMGET:
- return sys_shmget(first, second, third);
- case SHMCTL:
- return sys_shmctl(first, second,
- (struct shmid_ds __user *) ptr);
- default:
- return -ENOSYS;
- }
-}
-
-/*
- * Old cruft
- */
-asmlinkage int sys_uname(struct old_utsname __user *name)
-{
- int err;
- if (!name)
- return -EFAULT;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- return err? -EFAULT:0;
-}
-
-asmlinkage int sys_olduname(struct oldold_utsname __user *name)
-{
- int error;
-
- if (!name)
- return -EFAULT;
- if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
- return -EFAULT;
-
- down_read(&uts_sem);
-
- error = __copy_to_user(&name->sysname, &utsname()->sysname,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename, &utsname()->nodename,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->release, &utsname()->release,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->release + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->version, &utsname()->version,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->version + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine, &utsname()->machine,
- __OLD_UTS_LEN);
- error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-
- up_read(&uts_sem);
-
- error = error ? -EFAULT : 0;
-
- return error;
-}
-
-
/*
* Do a system call from kernel instead of calling sys_execve so we
* end up with proper pt_regs.
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, fd, unsigned long, off)
{
long error;
- struct file *file;
-
error = -EINVAL;
if (off & ~PAGE_MASK)
goto out;
- error = -EBADF;
- file = NULL;
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- if (!(flags & MAP_ANONYMOUS)) {
- file = fget(fd);
- if (!file)
- goto out;
- }
- down_write(&current->mm->mmap_sem);
- error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
- up_write(&current->mm->mmap_sem);
-
- if (file)
- fput(file);
+ error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
out:
return error;
}
@@ -224,15 +209,3 @@ bottomup:
return addr;
}
-
-
-SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
-{
- int err;
- down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof(*name));
- up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
- return err ? -EFAULT : 0;
-}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..8b3729341216 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
.long sys_settimeofday
.long sys_getgroups16 /* 80 */
.long sys_setgroups16
- .long old_select
+ .long sys_old_select
.long sys_symlink
.long sys_lstat
.long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
.long sys_swapon
.long sys_reboot
.long sys_old_readdir
- .long old_mmap /* 90 */
+ .long sys_old_mmap /* 90 */
.long sys_munmap
.long sys_truncate
.long sys_ftruncate
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
.long sys_ni_syscall /* reserved for streams2 */
.long ptregs_vfork /* 190 */
.long sys_getrlimit
- .long sys_mmap2
+ .long sys_mmap_pgoff
.long sys_truncate64
.long sys_ftruncate64
.long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
.long sys_pwritev
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
+ .long sys_recvmmsg
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dcb00d278512..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs)
#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->bp + sizeof(long));
#else
- unsigned long *sp = (unsigned long *)regs->sp;
+ unsigned long *sp =
+ (unsigned long *)kernel_stack_pointer(regs);
/*
* Return address is either directly at stack pointer
* or above a saved flags. Eflags has bits 22-31 zero,
@@ -69,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
* manually to deassert NMI lines for the watchdog if run
* on an 82489DX-based system.
*/
- spin_lock(&i8259A_lock);
+ raw_spin_lock(&i8259A_lock);
outb(0x0c, PIC_MASTER_OCW3);
/* Ack the IRQ; AEOI will end it automatically. */
inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
+ raw_spin_unlock(&i8259A_lock);
}
global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 503c1f2e8835..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
static struct bau_control **uv_bau_table_bases __read_mostly;
static int uv_bau_retry_limit __read_mostly;
-/* position of pnode (which is nasid>>1): */
-static int uv_nshift __read_mostly;
/* base pnode in this partition */
static int uv_partition_base_pnode __read_mostly;
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
BUG_ON(!adp);
pa = uv_gpa(adp); /* need the real nasid*/
- n = pa >> uv_nshift;
+ n = uv_gpa_to_pnode(pa);
m = pa & uv_mmask;
uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
* need the pnode of where the memory was really allocated
*/
pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
+ pn = uv_gpa_to_pnode(pa);
uv_write_global_mmr64(pnode,
UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -819,10 +817,8 @@ static int __init uv_init_blade(int blade)
*/
apicid = blade_to_first_apicid(blade);
pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
- if ((pa & 0xff) != UV_BAU_MESSAGE) {
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
((apicid << 32) | UV_BAU_MESSAGE));
- }
return 0;
}
@@ -843,8 +839,7 @@ static int __init uv_bau_init(void)
GFP_KERNEL, cpu_to_node(cur_cpu));
uv_bau_retry_limit = 1;
- uv_nshift = uv_hub_info->n_val;
- uv_mmask = (1UL << uv_hub_info->n_val) - 1;
+ uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nblades = uv_num_possible_blades();
uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 699f7eeb896a..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,22 +3,28 @@
#include <asm/trampoline.h>
#include <asm/e820.h>
+#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
+#define __trampinit
+#define __trampinitdata
+#else
+#define __trampinit __cpuinit
+#define __trampinitdata __cpuinitdata
+#endif
+
/* ready for x86_64 and x86 */
-unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE);
+unsigned char *__trampinitdata trampoline_base;
void __init reserve_trampoline_memory(void)
{
-#ifdef CONFIG_X86_32
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
-#endif
+ unsigned long mem;
+
/* Has to be in very low memory so we can execute real-mode AP code. */
- reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
- "TRAMPOLINE");
+ mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
+ if (mem == -1L)
+ panic("Cannot allocate trampoline\n");
+
+ trampoline_base = __va(mem);
+ reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
}
/*
@@ -26,7 +32,7 @@ void __init reserve_trampoline_memory(void)
* bootstrap into the page concerned. The caller
* has made sure it's suitably aligned.
*/
-unsigned long __cpuinit setup_trampoline(void)
+unsigned long __trampinit setup_trampoline(void)
{
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 596d54c660a5..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,8 +32,12 @@
#include <asm/segment.h>
#include <asm/processor-flags.h>
+#ifdef CONFIG_ACPI_SLEEP
+.section .rodata, "a", @progbits
+#else
/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
__CPUINITRODATA
+#endif
.code16
ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,59 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
- unsigned long condition;
+ unsigned long dr6;
int si_code;
- get_debugreg(condition, 6);
+ get_debugreg(dr6, 6);
+
+ /* Filter out all the reserved bits which are preset to 1 */
+ dr6 &= ~DR6_RESERVED;
/* Catch kmemcheck conditions first of all! */
- if (condition & DR_STEP && kmemcheck_trap(regs))
+ if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
return;
+ /* DR6 may or may not be cleared by the CPU */
+ set_debugreg(0, 6);
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
tsk->thread.debugctlmsr = 0;
- if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
- SIGTRAP) == NOTIFY_STOP)
+ /* Store the virtualized DR6 value */
+ tsk->thread.debugreg6 = dr6;
+
+ if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+ SIGTRAP) == NOTIFY_STOP)
return;
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
- /* Mask out spurious debug traps due to lazy DR7 setting */
- if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
- if (!tsk->thread.debugreg7)
- goto clear_dr7;
+ if (regs->flags & X86_VM_MASK) {
+ handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, 1);
+ return;
}
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto debug_vm86;
-#endif
-
- /* Save debug status register where ptrace can see it */
- tsk->thread.debugreg6 = condition;
-
/*
- * Single-stepping through TF: make sure we ignore any events in
- * kernel space (but re-enable TF when returning to user mode).
+ * Single-stepping through system calls: ignore any exceptions in
+ * kernel space, but re-enable TF when returning to user mode.
+ *
+ * We already checked v86 mode above, so we can check for kernel mode
+ * by just checking the CPL of CS.
*/
- if (condition & DR_STEP) {
- if (!user_mode(regs))
- goto clear_TF_reenable;
+ if ((dr6 & DR_STEP) && !user_mode(regs)) {
+ tsk->thread.debugreg6 &= ~DR_STEP;
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
}
-
- si_code = get_si_code(condition);
- /* Ok, finally something we can handle */
- send_sigtrap(tsk, regs, error_code, si_code);
-
- /*
- * Disable additional traps. They'll be re-enabled when
- * the signal is delivered.
- */
-clear_dr7:
- set_debugreg(0, 7);
+ si_code = get_si_code(tsk->thread.debugreg6);
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+ send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
- return;
-
-#ifdef CONFIG_X86_32
-debug_vm86:
- /* reenable preemption: handle_vm86_trap() might sleep */
- dec_preempt_count();
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- conditional_cli(regs);
- return;
-#endif
-clear_TF_reenable:
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~X86_EFLAGS_TF;
- preempt_conditional_cli(regs);
return;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
* unstable. We do this because unlike Time Of Day,
* the scheduler clock tolerates small errors and it's
* very important for it to be as fast as the platform
- * can achive it. )
+ * can achieve it. )
*/
if (unlikely(tsc_disabled)) {
/* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
}
#endif
-static void resume_tsc(void)
+static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
}
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
+ sched_clock_stable = 0;
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -805,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
unsigned long res_low, res_high;
rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
if (res_low & RTSC_SUSP)
tsc_clocksource_reliable = 1;
#endif
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
* we want to have the fastest, inlined, non-debug version
* of a critical section, to be able to prove TSC time-warps:
*/
-static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static __cpuinitdata cycles_t last_tsc;
static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
* previous TSC that was measured (possibly on
* another CPU) and update the previous TSC timestamp.
*/
- __raw_spin_lock(&sync_lock);
+ arch_spin_lock(&sync_lock);
prev = last_tsc;
rdtsc_barrier();
now = get_cycles();
rdtsc_barrier();
last_tsc = now;
- __raw_spin_unlock(&sync_lock);
+ arch_spin_unlock(&sync_lock);
/*
* Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
* we saw a time-warp of the TSC going backwards:
*/
if (unlikely(prev > now)) {
- __raw_spin_lock(&sync_lock);
+ arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
nr_warps++;
- __raw_spin_unlock(&sync_lock);
+ arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
return;
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n");
+ if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
+ pr_info(
+ "Skipped synchronization checks as TSC is reliable.\n");
return;
}
- pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
- smp_processor_id(), cpu);
-
/*
* Reset it - in case this is a second bootup:
*/
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
cpu_relax();
if (nr_warps) {
- printk("\n");
+ pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ smp_processor_id(), cpu);
pr_warning("Measured %Ld cycles TSC warp between CPUs, "
"turning off TSC clock.\n", max_warp);
mark_tsc_unstable("check_tsc_sync_source failed");
} else {
- printk(" passed.\n");
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ smp_processor_id(), cpu);
}
/*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..ece73d8e3240 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
*/
#include <linux/module.h>
+#include <linux/rbtree.h>
#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/uv/uv_irq.h>
+#include <asm/uv/uv_hub.h>
+
+/* MMR offset and pnode of hub sourcing interrupts for a given irq */
+struct uv_irq_2_mmr_pnode{
+ struct rb_node list;
+ unsigned long offset;
+ int pnode;
+ int irq;
+};
+
+static spinlock_t uv_irq_lock;
+static struct rb_root uv_irq_root;
+
+static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
static void uv_noop(unsigned int irq)
{
@@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = {
.unmask = uv_noop,
.eoi = uv_ack_apic,
.end = uv_noop,
+ .set_affinity = uv_set_irq_affinity,
};
/*
+ * Add offset and pnode information of the hub sourcing interrupts to the
+ * rb tree for a specific irq.
+ */
+static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
+{
+ struct rb_node **link = &uv_irq_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct uv_irq_2_mmr_pnode *n;
+ struct uv_irq_2_mmr_pnode *e;
+ unsigned long irqflags;
+
+ n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
+ uv_blade_to_memory_nid(blade));
+ if (!n)
+ return -ENOMEM;
+
+ n->irq = irq;
+ n->offset = offset;
+ n->pnode = uv_blade_to_pnode(blade);
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ /* Find the right place in the rbtree: */
+ while (*link) {
+ parent = *link;
+ e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
+
+ if (unlikely(irq == e->irq)) {
+ /* irq entry exists */
+ e->pnode = uv_blade_to_pnode(blade);
+ e->offset = offset;
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ kfree(n);
+ return 0;
+ }
+
+ if (irq < e->irq)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ /* Insert the node into the rbtree. */
+ rb_link_node(&n->list, parent, link);
+ rb_insert_color(&n->list, &uv_irq_root);
+
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return 0;
+}
+
+/* Retrieve offset and pnode information from the rb tree for a specific irq */
+int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
+{
+ struct uv_irq_2_mmr_pnode *e;
+ struct rb_node *n;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ n = uv_irq_root.rb_node;
+ while (n) {
+ e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
+
+ if (e->irq == irq) {
+ *offset = e->offset;
+ *pnode = e->pnode;
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return 0;
+ }
+
+ if (irq < e->irq)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
+ return -1;
+}
+
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+static int
+arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset, int restrict)
+{
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+ sizeof(unsigned long));
+
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
+ if (err != 0)
+ return err;
+
+ if (restrict == UV_AFFINITY_CPU)
+ desc->status |= IRQ_NO_BALANCING;
+ else
+ desc->status |= IRQ_MOVE_PCNTXT;
+
+ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+ irq_name);
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
+ return irq;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * longer allowed to be sent.
+ */
+static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+ sizeof(unsigned long));
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->mask = 1;
+
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+
+static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
+ unsigned int dest;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ unsigned long mmr_offset;
+ unsigned mmr_pnode;
+
+ if (set_desc_affinity(desc, mask, &dest))
+ return -1;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = dest;
+
+ /* Get previously stored MMR and pnode of hub sourcing interrupts */
+ if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
+ return -1;
+
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
+ return 0;
+}
+
+/*
* Set up a mapping of an available irq and vector, and enable the specified
* MMR that defines the MSI that is to be sent to the specified CPU when an
* interrupt is raised.
*/
int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
- unsigned long mmr_offset)
+ unsigned long mmr_offset, int restrict)
{
- int irq;
- int ret;
+ int irq, ret;
+
+ irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
- irq = create_irq();
if (irq <= 0)
return -EBUSY;
- ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
- if (ret != irq)
+ ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
+ restrict);
+ if (ret == irq)
+ uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
+ else
destroy_irq(irq);
return ret;
@@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
*
* Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
*/
-void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+void uv_teardown_irq(unsigned int irq)
{
- arch_disable_uv_irq(mmr_blade, mmr_offset);
+ struct uv_irq_2_mmr_pnode *e;
+ struct rb_node *n;
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&uv_irq_lock, irqflags);
+ n = uv_irq_root.rb_node;
+ while (n) {
+ e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
+ if (e->irq == irq) {
+ arch_disable_uv_irq(e->pnode, e->offset);
+ rb_erase(n, &uv_irq_root);
+ kfree(e);
+ break;
+ }
+ if (irq < e->irq)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ spin_unlock_irqrestore(&uv_irq_lock, irqflags);
destroy_irq(irq);
}
EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
if (!sgi_uv_kobj)
sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
if (!sgi_uv_kobj) {
- printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
+ printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
return -EINVAL;
}
ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
if (ret) {
- printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
+ printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
return ret;
}
ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
if (ret) {
- printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
+ printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
return ret;
}
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..2b75ef638dbc 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
*/
static struct uv_rtc_timer_head **blade_info __read_mostly;
-static int uv_rtc_enable;
+static int uv_rtc_evt_enable;
/*
* Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
pnode = uv_apicid_to_pnode(apicid);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
- (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+ (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
UVH_EVENT_OCCURRED0_RTC1_MASK);
- val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+ val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
/* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
/* Initialize comparator value */
uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
- return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
+ if (uv_read_rtc(NULL) <= expires)
+ return 0;
+
+ return !uv_intr_pending(pnode);
}
/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
next_cpu = head->next_cpu;
*t = expires;
+
/* Will this one be next to go off? */
if (next_cpu < 0 || bcpu == next_cpu ||
expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
*t = ULLONG_MAX;
uv_rtc_find_next_timer(head, pnode);
spin_unlock_irqrestore(&head->lock, flags);
- return 1;
+ return -ETIME;
}
}
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
*
* Returns 1 if this timer was pending.
*/
-static int uv_rtc_unset_timer(int cpu)
+static int uv_rtc_unset_timer(int cpu, int force)
{
int pnode = uv_cpu_to_pnode(cpu);
int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
spin_lock_irqsave(&head->lock, flags);
- if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
+ if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
rc = 1;
- *t = ULLONG_MAX;
-
- /* Was the hardware setup for this timer? */
- if (head->next_cpu == bcpu)
- uv_rtc_find_next_timer(head, pnode);
+ if (rc) {
+ *t = ULLONG_MAX;
+ /* Was the hardware setup for this timer? */
+ if (head->next_cpu == bcpu)
+ uv_rtc_find_next_timer(head, pnode);
+ }
spin_unlock_irqrestore(&head->lock, flags);
@@ -277,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu)
/*
* Read the RTC.
+ *
+ * Starting with HUB rev 2.0, the UV RTC register is replicated across all
+ * cachelines of it's own page. This allows faster simultaneous reads
+ * from a given socket.
*/
static cycle_t uv_read_rtc(struct clocksource *cs)
{
- return (cycle_t)uv_read_local_mmr(UVH_RTC);
+ unsigned long offset;
+
+ if (uv_get_min_hub_revision_id() == 1)
+ offset = 0;
+ else
+ offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
+
+ return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
}
/*
@@ -310,32 +326,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
- uv_rtc_unset_timer(ced_cpu);
+ uv_rtc_unset_timer(ced_cpu, 1);
break;
}
}
static void uv_rtc_interrupt(void)
{
- struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
int cpu = smp_processor_id();
+ struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
if (!ced || !ced->event_handler)
return;
- if (uv_rtc_unset_timer(cpu) != 1)
+ if (uv_rtc_unset_timer(cpu, 0) != 1)
return;
ced->event_handler(ced);
}
-static int __init uv_enable_rtc(char *str)
+static int __init uv_enable_evt_rtc(char *str)
{
- uv_rtc_enable = 1;
+ uv_rtc_evt_enable = 1;
return 1;
}
-__setup("uvrtc", uv_enable_rtc);
+__setup("uvrtcevt", uv_enable_evt_rtc);
static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
{
@@ -350,27 +366,32 @@ static __init int uv_rtc_setup_clock(void)
{
int rc;
- if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+ if (!is_uv_system())
return -ENODEV;
- generic_interrupt_extension = uv_rtc_interrupt;
-
clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
clocksource_uv.shift);
+ /* If single blade, prefer tsc */
+ if (uv_num_possible_blades() == 1)
+ clocksource_uv.rating = 250;
+
rc = clocksource_register(&clocksource_uv);
- if (rc) {
- generic_interrupt_extension = NULL;
+ if (rc)
+ printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
+ else
+ printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
+ sn_rtc_cycles_per_second/(unsigned long)1E6);
+
+ if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
return rc;
- }
/* Setup and register clockevents */
rc = uv_rtc_allocate_timers();
- if (rc) {
- clocksource_unregister(&clocksource_uv);
- generic_interrupt_extension = NULL;
- return rc;
- }
+ if (rc)
+ goto error;
+
+ x86_platform_ipi_callback = uv_rtc_interrupt;
clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +404,19 @@ static __init int uv_rtc_setup_clock(void)
rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
if (rc) {
- clocksource_unregister(&clocksource_uv);
- generic_interrupt_extension = NULL;
+ x86_platform_ipi_callback = NULL;
uv_rtc_deallocate_timers();
+ goto error;
}
+ printk(KERN_INFO "UV RTC clockevents registered\n");
+
+ return 0;
+
+error:
+ clocksource_unregister(&clocksource_uv);
+ printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
+
return rc;
}
arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
char visws_board_type = -1;
char visws_board_rev = -1;
-int is_visws_box(void)
-{
- return visws_board_type >= 0;
-}
-
static void __init visws_time_init(void)
{
printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -183,7 +178,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
return;
}
- apic_cpus = apic->apicid_to_cpu_present(m->apicid);
+ apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
/*
* Validate version
@@ -197,7 +192,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
apic_version[m->apicid] = ver;
}
-static void __init visws_find_smp_config(unsigned int reserve)
+static void __init visws_find_smp_config(void)
{
struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
x86_init.irqs.pre_vector_init = visws_pre_intr_init;
x86_init.irqs.trap_init = visws_trap_init;
x86_init.timers.timer_init = visws_time_init;
+ x86_init.pci.init = pci_visws_init;
+ x86_init.pci.init_irq = x86_init_noop;
/*
* Install reboot quirks:
@@ -486,7 +483,7 @@ static void end_cobalt_irq(unsigned int irq)
}
static struct irq_chip cobalt_irq_type = {
- .typename = "Cobalt-APIC",
+ .name = "Cobalt-APIC",
.startup = startup_cobalt_irq,
.shutdown = disable_cobalt_irq,
.enable = enable_cobalt_irq,
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
*/
static unsigned int startup_piix4_master_irq(unsigned int irq)
{
- init_8259A(0);
+ legacy_pic->init(0);
return startup_cobalt_irq(irq);
}
@@ -523,7 +520,7 @@ static void end_piix4_master_irq(unsigned int irq)
}
static struct irq_chip piix4_master_irq_type = {
- .typename = "PIIX4-master",
+ .name = "PIIX4-master",
.startup = startup_piix4_master_irq,
.ack = ack_cobalt_irq,
.end = end_piix4_master_irq,
@@ -531,10 +528,7 @@ static struct irq_chip piix4_master_irq_type = {
static struct irq_chip piix4_virtual_irq_type = {
- .typename = "PIIX4-virtual",
- .shutdown = disable_8259A_irq,
- .enable = enable_8259A_irq,
- .disable = disable_8259A_irq,
+ .name = "PIIX4-virtual",
};
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
struct irq_desc *desc;
unsigned long flags;
- spin_lock_irqsave(&i8259A_lock, flags);
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
/* Find out what's interrupting in the PIIX4 master 8259 */
outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
outb(0x60 + realirq, 0x20);
}
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
desc = irq_to_desc(realirq);
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
handle_IRQ_event(realirq, desc->action);
if (!(desc->status & IRQ_DISABLED))
- enable_8259A_irq(realirq);
+ legacy_pic->chip->unmask(realirq);
return IRQ_HANDLED;
out_unlock:
- spin_unlock_irqrestore(&i8259A_lock, flags);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
return IRQ_NONE;
}
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
.name = "cascade",
};
+static inline void set_piix4_virtual_irq_type(void)
+{
+ piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
+ piix4_virtual_irq_type.enable = i8259A_chip.unmask;
+ piix4_virtual_irq_type.disable = i8259A_chip.mask;
+}
void init_VISWS_APIC_irqs(void)
{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
desc->chip = &piix4_master_irq_type;
}
else if (i < CO_IRQ_APIC0) {
+ set_piix4_virtual_irq_type();
desc->chip = &piix4_virtual_irq_type;
}
else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-int sys_vm86old(struct pt_regs *regs)
+int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
{
- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
@@ -227,7 +226,7 @@ out:
}
-int sys_vm86(struct pt_regs *regs)
+int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
struct vm86plus_struct __user *v86;
tsk = current;
- switch (regs->bx) {
+ switch (cmd) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs->bx, (int)regs->cx);
+ ret = do_vm86_irq_handling(cmd, (int)arg);
goto out;
case VM86_PLUS_INSTALL_CHECK:
/*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
ret = -EPERM;
if (tsk->thread.saved_sp0)
goto out;
- v86 = (struct vm86plus_struct __user *)regs->cx;
+ v86 = (struct vm86plus_struct __user *)arg;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 31e6f6cfe53e..7dd599deca4a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -33,6 +33,7 @@
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
+#include <asm/pgalloc.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/vmi_time.h>
@@ -266,30 +267,6 @@ static void vmi_nop(void)
{
}
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- void *va = kmap_atomic(page, type);
-
- /*
- * Internally, the VMI ROM must map virtual addresses to physical
- * addresses for processing MMU updates. By the time MMU updates
- * are issued, this information is typically already lost.
- * Fortunately, the VMI provides a cache of mapping slots for active
- * page tables.
- *
- * We use slot zero for the linear mapping of physical memory, and
- * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
- *
- * args: SLOT VA COUNT PFN
- */
- BUG_ON(type != KM_PTE0 && type != KM_PTE1);
- vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
-
- return va;
-}
-#endif
-
static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +617,12 @@ static inline int __init activate_vmi(void)
u64 reloc;
const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+ /*
+ * Prevent page tables from being allocated in highmem, even if
+ * CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
if (call_vrom_func(vmi_rom, vmi_init) != 0) {
printk(KERN_ERR "VMI ROM failed to initialize!");
return 0;
@@ -648,7 +631,7 @@ static inline int __init activate_vmi(void)
pv_info.paravirt_enabled = 1;
pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi";
+ pv_info.name = "vmi [deprecated]";
pv_init_ops.patch = vmi_patch;
@@ -778,10 +761,6 @@ static inline int __init activate_vmi(void)
/* Set linear is needed in all cases */
vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
- if (vmi_ops.set_linear_mapping)
- pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
/*
* These MUST always be patched. Don't support indirect jumps
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
static inline unsigned int vmi_get_timer_vector(void)
{
-#ifdef CONFIG_X86_IO_APIC
- return FIRST_DEVICE_VECTOR;
-#else
- return FIRST_EXTERNAL_VECTOR;
-#endif
+ return IRQ0_VECTOR;
}
/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
{
/* Unfortunately, set_next_event interface only passes relative
* expiry, but we want absolute expiry. It'd be better if were
- * were passed an aboslute expiry, since a bunch of time may
+ * were passed an absolute expiry, since a bunch of time may
* have been stolen between the time the delta is computed and
* when we set the alarm below. */
cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
@@ -226,7 +222,7 @@ static void __devinit vmi_time_init_clockevent(void)
evt->min_delta_ns = clockevent_delta2ns(1, evt);
evt->cpumask = cpumask_of(cpu);
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+ printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
evt->name, evt->mult, evt->shift);
clockevents_register_device(evt);
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 92929fb3f9fa..44879df55696 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+/*
+ * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
+ * we retain large page mappings for boundaries spanning kernel text, rodata
+ * and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
+ * as well as retain 2MB large page mappings for kernel text.
+ */
+#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+
+#define X64_ALIGN_DEBUG_RODATA_END \
+ . = ALIGN(HPAGE_SIZE); \
+ __end_rodata_hpage_align = .;
+
+#else
+
+#define X64_ALIGN_DEBUG_RODATA_BEGIN
+#define X64_ALIGN_DEBUG_RODATA_END
+
+#endif
+
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
+ X64_ALIGN_DEBUG_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
+ X64_ALIGN_DEBUG_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
PAGE_ALIGNED_DATA(PAGE_SIZE)
- CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
DATA_DATA
CONSTRUCTORS
/* rarely changed data like cpu maps */
- READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
/* End of data section */
_edata = .;
@@ -137,12 +165,12 @@ SECTIONS
*(.vsyscall_0)
} :user
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
*(.vsyscall_fn)
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
*(.vsyscall_gtod_data)
}
@@ -166,7 +194,7 @@ SECTIONS
}
vgetcpu_mode = VVIRT(.vgetcpu_mode);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) {
*(.jiffies)
}
@@ -291,9 +319,7 @@ SECTIONS
__brk_limit = .;
}
- .end : AT(ADDR(.end) - LOAD_OFFSET) {
- _end = .;
- }
+ _end = .;
STABS_DEBUG
DWARF_DEBUG
@@ -305,6 +331,9 @@ SECTIONS
#ifdef CONFIG_X86_32
+/*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
#else
@@ -312,7 +341,7 @@ SECTIONS
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union);
@@ -323,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
"kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((irq_stack_union == 0),
"irq_stack_union is not at start of per-cpu area");
#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+ u32 mult)
{
unsigned long flags;
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
vsyscall_gtod_data.clock.vread = clock->vread;
vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = clock->mult;
+ vsyscall_gtod_data.clock.mult = mult;
vsyscall_gtod_data.clock.shift = clock->shift;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
};
static ctl_table kernel_root_table2[] = {
- { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
+ { .procname = "kernel", .mode = 0555,
.child = kernel_table2 },
{}
};
@@ -300,7 +301,8 @@ static int __init vsyscall_init(void)
register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 1);
- hotcpu_notifier(cpu_vsyscall_notifier, 0);
+ /* notifier priority > KVM */
+ hotcpu_notifier(cpu_vsyscall_notifier, 30);
return 0;
}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..693920b22496 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
EXPORT_SYMBOL(mcount);
#endif
-EXPORT_SYMBOL(kernel_thread);
-
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
@@ -28,11 +26,11 @@ EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
-EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(copy_user_generic_string);
+EXPORT_SYMBOL(copy_user_generic_unrolled);
EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(__copy_from_user_inatomic);
+EXPORT_SYMBOL(_copy_from_user);
+EXPORT_SYMBOL(_copy_to_user);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
@@ -57,4 +55,6 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
-EXPORT_SYMBOL(load_gs_index);
+#ifndef CONFIG_PARAVIRT
+EXPORT_SYMBOL(native_load_gs_index);
+#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,20 +4,26 @@
* For licencing details see kernel-base/COPYING
*/
#include <linux/init.h>
+#include <linux/ioport.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
+#include <asm/pci_x86.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
+#include <asm/pat.h>
#include <asm/tsc.h>
+#include <asm/iommu.h>
void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
void __init x86_init_pgd_noop(pgd_t *unused) { }
+int __init iommu_init_noop(void) { return 0; }
+void iommu_shutdown_noop(void) { }
/*
* The platform setup functions are preset with the default functions
@@ -62,14 +68,29 @@ struct x86_init_ops x86_init __initdata = {
.tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
},
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
+
+ .pci = {
+ .init = x86_default_pci_init,
+ .init_irq = x86_default_pci_init_irq,
+ .fixup_irqs = x86_default_pci_fixup_irqs,
+ },
};
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
.setup_percpu_clockev = setup_secondary_APIC_clock,
};
+static void default_nmi_init(void) { };
+
struct x86_platform_ops x86_platform = {
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
+ .iommu_shutdown = iommu_shutdown_noop,
+ .is_untracked_pat_range = is_ISA_range,
+ .nmi_init = default_nmi_init
};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
xstate_size = ebx;
+ update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
setup_xstate_init();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select USER_RETURN_NOTIFIER
+ select KVM_MMIO
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -64,6 +66,7 @@ config KVM_AMD
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
+source drivers/vhost/Kconfig
source drivers/lguest/Kconfig
source drivers/virtio/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
CFLAGS_vmx.o := -I.
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o)
+ coalesced_mmio.o irq_comm.o eventfd.o \
+ assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
-#include "mmu.h" /* for is_long_mode() */
+#include "x86.h"
/*
* Opcode effective-address decode tables.
@@ -75,6 +75,10 @@
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
+/* Misc flags */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
/* Source 2 operand type */
#define Src2None (0<<29)
#define Src2CL (1<<29)
@@ -86,35 +90,40 @@
enum {
Group1_80, Group1_81, Group1_82, Group1_83,
Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+ Group8, Group9,
};
static u32 opcode_table[256] = {
/* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, 0,
/* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
/* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x38 - 0x3F */
@@ -133,7 +142,8 @@ static u32 opcode_table[256] = {
DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
/* 0x60 - 0x67 */
- 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
+ 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
0, 0, 0, 0,
/* 0x68 - 0x6F */
SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -149,7 +159,7 @@ static u32 opcode_table[256] = {
Group | Group1_80, Group | Group1_81,
Group | Group1_82, Group | Group1_83,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -158,7 +168,7 @@ static u32 opcode_table[256] = {
/* 0x90 - 0x97 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16, 0,
+ 0, 0, SrcImm | Src2Imm16 | No64, 0,
ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +195,7 @@ static u32 opcode_table[256] = {
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
0, 0, 0, ImplicitOps | Stack,
- ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
+ ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
/* 0xD0 - 0xD7 */
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,12 +208,12 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte, SrcImmUByte,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
+ SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
- ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
+ ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -211,16 +221,20 @@ static u32 opcode_table[256] = {
static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+ 0, Group | GroupDual | Group7, 0, 0,
+ 0, ImplicitOps, ImplicitOps | Priv, 0,
+ ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
+ 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
/* 0x20 - 0x2F */
- ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ ModRM | ImplicitOps | Priv, ModRM | Priv,
+ 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30 - 0x3F */
- ImplicitOps, 0, ImplicitOps, 0,
- ImplicitOps, ImplicitOps, 0, 0,
+ ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
+ ImplicitOps, ImplicitOps | Priv, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -244,25 +258,29 @@ static u32 twobyte_table[256] = {
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp,
DstMem | SrcReg | Src2ImmByte | ModRM,
DstMem | SrcReg | Src2CL | ModRM, 0, 0,
/* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ ImplicitOps | Stack, ImplicitOps | Stack,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
DstMem | SrcReg | Src2ImmByte | ModRM,
DstMem | SrcReg | Src2CL | ModRM,
ModRM, 0,
/* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM | BitOp,
+ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
+ 0, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+ 0, 0,
+ Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+ 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
+ 0, 0, 0, Group | GroupDual | Group9,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -274,25 +292,41 @@ static u32 twobyte_table[256] = {
static u32 group_table[] = {
[Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | Lock,
+ ByteOp | DstMem | SrcImm | ModRM,
[Group1_81*8] =
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM | Lock,
+ DstMem | SrcImm | ModRM,
[Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
+ ByteOp | DstMem | SrcImm | ModRM | No64,
[Group1_83*8] =
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM,
[Group1A*8] =
DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
[Group3_Byte*8] =
@@ -311,24 +345,39 @@ static u32 group_table[] = {
SrcMem | ModRM | Stack, 0,
SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] =
- 0, 0, ModRM | SrcMem, ModRM | SrcMem,
+ 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
+ SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
+ [Group8*8] =
+ 0, 0, 0, 0,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
+ DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
+ [Group9*8] =
+ 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
};
static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
SrcNone | ModRM | DstMem | Mov, 0,
SrcMem16 | ModRM | Mov, 0,
+ [Group9*8] =
+ 0, 0, 0, 0, 0, 0, 0, 0,
};
/* EFLAGS bit definitions. */
+#define EFLG_ID (1<<21)
+#define EFLG_VIP (1<<20)
+#define EFLG_VIF (1<<19)
+#define EFLG_AC (1<<18)
#define EFLG_VM (1<<17)
#define EFLG_RF (1<<16)
+#define EFLG_IOPL (3<<12)
+#define EFLG_NT (1<<14)
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
#define EFLG_IF (1<<9)
+#define EFLG_TF (1<<8)
#define EFLG_SF (1<<7)
#define EFLG_ZF (1<<6)
#define EFLG_AF (1<<4)
@@ -597,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
if (linear < fc->start || linear >= fc->end) {
size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+ rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
if (rc)
return rc;
fc->start = linear;
@@ -613,6 +662,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
{
int rc = 0;
+ /* x86 instructions are limited to 15 bytes. */
+ if (eip + size - ctxt->decode.eip_orig > 15)
+ return X86EMUL_UNHANDLEABLE;
eip += ctxt->cs_base;
while (size--) {
rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -649,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
op_bytes = 3;
*address = 0;
rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
if (rc)
return rc;
rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu);
+ ctxt->vcpu, NULL);
return rc;
}
@@ -871,12 +923,13 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache));
- c->eip = kvm_rip_read(ctxt->vcpu);
+ c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
switch (mode) {
case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
def_op_bytes = def_ad_bytes = 2;
break;
@@ -962,6 +1015,11 @@ done_prefixes:
}
}
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
+ return -1;
+ }
+
if (c->d & Group) {
group = c->d & GroupMask;
c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1179,13 +1237,119 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
rc = ops->read_emulated(register_address(c, ss_base(ctxt),
c->regs[VCPU_REGS_RSP]),
dest, len, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
return rc;
}
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+
+ rc = emulate_pop(ctxt, ops, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= EFLG_IOPL;
+ if (cpl <= iopl)
+ change_mask |= EFLG_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ change_mask |= EFLG_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (EFLG_IOPL | EFLG_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct kvm_segment segment;
+
+ kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+
+ c->src.val = segment.selector;
+ emulate_push(ctxt);
+}
+
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ if (rc != 0)
+ return rc;
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+ return rc;
+}
+
+static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ (reg == VCPU_REGS_RSP) ?
+ (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+
+ emulate_push(ctxt);
+ ++reg;
+ }
+}
+
+static int emulate_popa(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = 0;
+ int reg = VCPU_REGS_RDI;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
+ c->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ if (rc != 0)
+ break;
+ --reg;
+ }
+ return rc;
+}
+
static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
@@ -1290,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
int rc;
rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1305,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
(u32) c->regs[VCPU_REGS_RBX];
rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
ctxt->eflags |= EFLG_ZF;
}
@@ -1327,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
if (rc)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -1371,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
break;
case OP_NONE:
@@ -1434,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
/* syscall is not available in real mode */
- if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
- return -1;
+ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1473,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1483,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
struct kvm_segment cs, ss;
u64 msr_data;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL ||
- !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return -1;
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1507,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
break;
}
@@ -1538,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
- return 0;
+ return X86EMUL_CONTINUE;
}
static int
@@ -1549,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
int usermode;
- /* inject #UD if LOCK prefix is used */
- if (c->lock_prefix)
- return -1;
-
- /* inject #GP if in real mode or paging is disabled */
- if (ctxt->mode == X86EMUL_MODE_REAL
- || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
- }
-
- /* sysexit must be called from CPL 0 */
- if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_UNHANDLEABLE;
}
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1581,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = (u16)(msr_data + 24);
break;
@@ -1589,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.selector = (u16)(msr_data + 32);
if (msr_data == 0x0) {
kvm_inject_gp(ctxt->vcpu, 0);
- return -1;
+ return X86EMUL_PROPAGATE_FAULT;
}
ss.selector = cs.selector + 8;
cs.db = 0;
@@ -1605,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
- return 0;
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+}
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ struct kvm_segment tr_seg;
+ int r;
+ u16 io_bitmap_ptr;
+ u8 perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+
+ kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
+ if (tr_seg.unusable)
+ return false;
+ if (tr_seg.limit < 103)
+ return false;
+ r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
+ NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ return false;
+ r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
+ ctxt->vcpu, NULL);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 port, u16 len)
+{
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ return false;
+ return true;
}
int
@@ -1629,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
saved_eip = c->eip;
+ /* LOCK prefix is allowed only with some instructions */
+ if (c->lock_prefix && !(c->d & Lock)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+
if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
memop = c->modrm_ea;
@@ -1669,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
&c->src.val,
c->src.bytes,
ctxt->vcpu);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val = c->src.val;
}
@@ -1688,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->dst.ptr = (void *)c->dst.ptr +
(c->src.val & mask) / 8;
}
- if (!(c->d & Mov) &&
- /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0))
- goto done;
+ if (!(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
}
c->dst.orig_val = c->dst.val;
@@ -1707,18 +1921,45 @@ special_insn:
add: /* add */
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
+ case 0x06: /* push es */
+ emulate_push_sreg(ctxt, VCPU_SREG_ES);
+ break;
+ case 0x07: /* pop es */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+ if (rc != 0)
+ goto done;
+ break;
case 0x08 ... 0x0d:
or: /* or */
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
+ case 0x0e: /* push cs */
+ emulate_push_sreg(ctxt, VCPU_SREG_CS);
+ break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
+ case 0x16: /* push ss */
+ emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ break;
+ case 0x17: /* pop ss */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+ if (rc != 0)
+ goto done;
+ break;
case 0x18 ... 0x1d:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
+ case 0x1e: /* push ds */
+ emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ break;
+ case 0x1f: /* pop ds */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+ if (rc != 0)
+ goto done;
+ break;
case 0x20 ... 0x25:
and: /* and */
emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1991,14 @@ special_insn:
if (rc != 0)
goto done;
break;
+ case 0x60: /* pusha */
+ emulate_pusha(ctxt);
+ break;
+ case 0x61: /* popa */
+ rc = emulate_popa(ctxt, ops);
+ if (rc != 0)
+ goto done;
+ break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
@@ -1761,7 +2010,12 @@ special_insn:
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
@@ -1777,7 +2031,12 @@ special_insn:
return 0;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
- if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+ if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio_string(ctxt->vcpu,
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
@@ -1863,25 +2122,19 @@ special_insn:
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
- int type_bits;
- int err;
sel = c->src.val;
- if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
- if (c->modrm_reg <= 5) {
- type_bits = (c->modrm_reg == 1) ? 9 : 1;
- err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
- type_bits, c->modrm_reg);
- } else {
- printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_CS ||
+ c->modrm_reg > VCPU_SREG_GS) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
}
- if (err < 0)
- goto cannot_emulate;
+ if (c->modrm_reg == VCPU_SREG_SS)
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -1910,7 +2163,10 @@ special_insn:
c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
c->dst.bytes = c->op_bytes;
- goto pop_instruction;
+ rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ break;
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
c->dst.val = c->src.val;
@@ -1924,11 +2180,12 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
&c->dst.val,
- c->dst.bytes, ctxt->vcpu)) != 0)
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -1943,10 +2200,11 @@ special_insn:
c->src.ptr = (unsigned long *)register_address(c,
seg_override_base(ctxt, c),
c->regs[VCPU_REGS_RSI]);
- if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->src.ptr,
+ &c->src.val,
+ c->src.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
c->dst.type = OP_NONE; /* Disable writeback. */
@@ -1954,10 +2212,11 @@ special_insn:
c->dst.ptr = (unsigned long *)register_address(c,
es_base(ctxt),
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated((unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -1987,12 +2246,13 @@ special_insn:
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu)) != 0)
+ rc = ops->read_emulated(register_address(c,
+ seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]),
+ &c->dst.val,
+ c->dst.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
goto done;
register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2048,11 +2308,9 @@ special_insn:
case 0xe9: /* jmp rel */
goto jmp;
case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
- VCPU_SREG_CS) < 0) {
- DPRINTF("jmp far: Failed to load CS descriptor\n");
- goto cannot_emulate;
- }
+ if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
+ VCPU_SREG_CS))
+ goto done;
c->eip = c->src.val;
break;
@@ -2070,7 +2328,13 @@ special_insn:
case 0xef: /* out (e/r)ax,dx */
port = c->regs[VCPU_REGS_RDX];
io_dir_in = 0;
- do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+ do_io:
+ if (!emulator_io_permited(ctxt, ops, port,
+ (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
(c->d & ByteOp) ? 1 : c->op_bytes,
port) != 0) {
c->eip = saved_eip;
@@ -2095,13 +2359,21 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfb: /* sti */
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
- ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ if (emulator_bad_iopl(ctxt))
+ kvm_inject_gp(ctxt->vcpu, 0);
+ else {
+ toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ ctxt->eflags |= X86_EFLAGS_IF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
@@ -2204,8 +2476,9 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- if (emulate_syscall(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_syscall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
@@ -2276,14 +2549,16 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- if (emulate_sysenter(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysenter(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
case 0x35: /* sysexit */
- if (emulate_sysexit(ctxt) == -1)
- goto cannot_emulate;
+ rc = emulate_sysexit(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
else
goto writeback;
break;
@@ -2297,6 +2572,14 @@ twobyte_insn:
jmp_rel(c, c->src.val);
c->dst.type = OP_NONE;
break;
+ case 0xa0: /* push fs */
+ emulate_push_sreg(ctxt, VCPU_SREG_FS);
+ break;
+ case 0xa1: /* pop fs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+ if (rc != 0)
+ goto done;
+ break;
case 0xa3:
bt: /* bt */
c->dst.type = OP_NONE;
@@ -2308,6 +2591,14 @@ twobyte_insn:
case 0xa5: /* shld cl, r, r/m */
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
+ case 0xa8: /* push gs */
+ emulate_push_sreg(ctxt, VCPU_SREG_GS);
+ break;
+ case 0xa9: /* pop gs */
+ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+ if (rc != 0)
+ goto done;
+ break;
case 0xab:
bts: /* bts */
/* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 82ad523b4901..294698b6daff 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,6 +29,8 @@
* Based on QEMU and Xen.
*/
+#define pr_fmt(fmt) "pit: " fmt
+
#include <linux/kvm_host.h>
#include "irq.h"
@@ -116,7 +118,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+ remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
elapsed = mod_64(elapsed, ps->pit_timer.period);
@@ -240,11 +242,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -262,7 +264,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
static void destroy_pit_timer(struct kvm_timer *pt)
{
- pr_debug("pit: execute del timer!\n");
+ pr_debug("execute del timer!\n");
hrtimer_cancel(&pt->timer);
}
@@ -284,7 +286,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
- pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+ pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
@@ -309,7 +311,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
WARN_ON(!mutex_is_locked(&ps->lock));
- pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+ pr_debug("load_count val is %d, channel is %d\n", val, channel);
/*
* The largest possible initial count is 0; this is equivalent
@@ -395,8 +397,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
mutex_lock(&pit_state->lock);
if (val != 0)
- pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
- (unsigned int)addr, len, val);
+ pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
if (addr == 3) {
channel = val >> 6;
@@ -465,6 +467,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
return -EOPNOTSUPP;
addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
s = &pit_state->channels[addr];
mutex_lock(&pit_state->lock);
@@ -600,7 +605,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must have writers lock on slots_lock */
+/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
@@ -619,7 +624,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
+ raw_spin_lock_init(&pit->pit_state.inject_lock);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -640,13 +645,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
if (ret < 0)
goto fail;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
- ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
&pit->speaker_dev);
if (ret < 0)
goto fail_unregister;
@@ -655,11 +660,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
return pit;
fail_unregister:
- __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
fail:
- if (pit->irq_source_id >= 0)
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
return NULL;
@@ -688,10 +694,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
- mutex_lock(&kvm->irq_lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
- mutex_unlock(&kvm->irq_lock);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -720,12 +724,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
/* Try to inject pending interrupts when
* last one has been acked.
*/
- spin_lock(&ps->inject_lock);
+ raw_spin_lock(&ps->inject_lock);
if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
- spin_unlock(&ps->inject_lock);
+ raw_spin_unlock(&ps->inject_lock);
if (inject)
__inject_pit_timer_intr(kvm);
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- spinlock_t inject_lock;
+ raw_spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..07771da85de5 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,16 +38,25 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
s->isr_ack |= (1 << irq);
if (s != &s->pics_state->pics[0])
irq += 8;
+ /*
+ * We are dropping lock while calling ack notifiers since ack
+ * notifier callbacks for assigned devices call into PIC recursively.
+ * Other interrupt may be delivered to PIC while lock is dropped but
+ * it should be safe since PIC state is already updated at this stage.
+ */
+ raw_spin_unlock(&s->pics_state->lock);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ raw_spin_lock(&s->pics_state->lock);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+
+ raw_spin_lock(&s->lock);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
/*
@@ -148,9 +157,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -158,14 +167,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return ret;
}
@@ -176,16 +185,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
s->isr |= 1 << irq;
- if (s->auto_eoi) {
- if (s->rotate_on_auto_eoi)
- s->priority_add = (irq + 1) & 7;
- pic_clear_isr(s, irq);
- }
/*
* We don't clear a level sensitive interrupt here
*/
if (!(s->elcr & (1 << irq)))
s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
}
int kvm_pic_read_irq(struct kvm *kvm)
@@ -193,7 +204,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -218,29 +229,18 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return intno;
}
void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, irqbase, n;
+ int irq;
struct kvm *kvm = s->pics_state->irq_request_opaque;
struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+ u8 irr = s->irr, isr = s->imr;
- if (s == &s->pics_state->pics[0])
- irqbase = 0;
- else
- irqbase = 8;
-
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
- if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
- n = irq + irqbase;
- kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
- }
- }
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
@@ -256,6 +256,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
s->rotate_on_auto_eoi = 0;
s->special_fully_nested_mode = 0;
s->init4 = 0;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+ if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+ if (irr & (1 << irq) || isr & (1 << irq)) {
+ pic_clear_isr(s, irq);
+ }
+ }
}
static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +305,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
- pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
}
break;
@@ -436,7 +443,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -449,7 +456,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -466,7 +473,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- spin_lock(&s->lock);
+ raw_spin_lock(&s->lock);
switch (addr) {
case 0x20:
case 0x21:
@@ -480,7 +487,7 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- spin_unlock(&s->lock);
+ raw_spin_unlock(&s->lock);
return 0;
}
@@ -514,7 +521,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
- spin_lock_init(&s->lock);
+ raw_spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
@@ -527,7 +534,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
* Initialize PIO device
*/
kvm_iodevice_init(&s->dev, &picdev_ops);
- ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ mutex_unlock(&kvm->slots_lock);
if (ret < 0) {
kfree(s);
return NULL;
@@ -535,3 +544,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
return s;
}
+
+void kvm_destroy_pic(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (vpic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+ }
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
};
struct kvm_pic {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -71,9 +71,11 @@ struct kvm_pic {
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[16];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
void kvm_pic_clear_isr_ack(struct kvm *kvm);
@@ -85,7 +87,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
static inline int irqchip_in_kernel(struct kvm *kvm)
{
- return pic_irqchip(kvm) != NULL;
+ int ret;
+
+ ret = (pic_irqchip(kvm) != NULL);
+ smp_rmb();
+ return ret;
}
void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.pdptrs[index];
}
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1ae5ceba7eb2..4b224f90087b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
-#include <asm/apicdef.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
@@ -374,6 +373,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (unlikely(!apic_enabled(apic)))
break;
+ if (trig_mode) {
+ apic_debug("level trig mode for vector %d", vector);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ } else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+
result = !apic_test_and_set_irr(vector, apic);
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector, !result);
@@ -384,11 +389,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
}
- if (trig_mode) {
- apic_debug("level trig mode for vector %d", vector);
- apic_set_vector(vector, apic->regs + APIC_TMR);
- } else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
kvm_vcpu_kick(vcpu);
break;
@@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
- mutex_lock(&apic->vcpu->kvm->irq_lock);
+ if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
- mutex_unlock(&apic->vcpu->kvm->irq_lock);
- }
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
irq.vector);
- mutex_lock(&apic->vcpu->kvm->irq_lock);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
- mutex_unlock(&apic->vcpu->kvm->irq_lock);
}
static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -521,7 +516,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
if (apic_get_reg(apic, APIC_TMICT) == 0)
return 0;
- remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
+ remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
if (ktime_to_ns(remaining) < 0)
remaining = ktime_set(0, 0);
@@ -664,7 +659,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+ apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
APIC_BUS_CYCLE_NS * apic->divide_count;
atomic_set(&apic->lapic_timer.pending, 0);
@@ -1156,6 +1151,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
update_divide_count(apic);
start_apic_timer(apic);
+ apic->irr_pending = true;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1250,3 +1246,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
return 0;
}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 1;
+
+ if (apic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+}
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eca41ae9f453..741373e8ca77 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
*/
#include "mmu.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include <linux/kvm_host.h>
@@ -29,6 +30,7 @@
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
+#include <linux/srcu.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644);
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
#define RMAP_EXT 4
#define ACC_EXEC_MASK 1
@@ -153,9 +145,14 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+#include <trace/events/kvm.h>
+
+#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
@@ -227,7 +224,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static int is_write_protection(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr0 & X86_CR0_WP;
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
static int is_cpuid_PSE36(void)
@@ -237,7 +234,7 @@ static int is_cpuid_PSE36(void)
static int is_nx(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.shadow_efer & EFER_NX;
+ return vcpu->arch.efer & EFER_NX;
}
static int is_shadow_present_pte(u64 pte)
@@ -251,7 +248,7 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_writeble_pte(unsigned long pte)
+static int is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -468,24 +465,10 @@ static int has_wrprotected_page(struct kvm *kvm,
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
- unsigned long page_size = PAGE_SIZE;
- struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long page_size;
int i, ret = 0;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return page_size;
-
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, addr);
- if (!vma)
- goto out;
-
- page_size = vma_kernel_pagesize(vma);
-
-out:
- up_read(&current->mm->mmap_sem);
+ page_size = kvm_host_page_size(kvm, gfn);
for (i = PT_PAGE_TABLE_LEVEL;
i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -501,8 +484,7 @@ out:
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level;
- int level = PT_PAGE_TABLE_LEVEL;
+ int host_level, level, max_level;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
@@ -513,11 +495,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
- for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
+ max_level = kvm_x86_ops->get_lpage_level() < host_level ?
+ kvm_x86_ops->get_lpage_level() : host_level;
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
break;
- }
return level - 1;
}
@@ -633,10 +616,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pfn = spte_to_pfn(*spte);
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
- if (is_writeble_pte(*spte))
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
+ if (is_writable_pte(*spte))
+ kvm_set_pfn_dirty(pfn);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -664,6 +645,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
+ pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
BUG();
}
}
@@ -710,7 +692,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!spte);
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
@@ -734,7 +716,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writeble_pte(*spte)) {
+ if (is_writable_pte(*spte)) {
rmap_remove(kvm, spte);
--kvm->stat.lpages;
__set_spte(spte, shadow_trap_nonpresent_pte);
@@ -748,7 +730,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int need_tlb_flush = 0;
@@ -763,37 +746,75 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return need_tlb_flush;
}
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ int need_flush = 0;
+ u64 *spte, new_spte;
+ pte_t *ptep = (pte_t *)data;
+ pfn_t new_pfn;
+
+ WARN_ON(pte_huge(*ptep));
+ new_pfn = pte_pfn(*ptep);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!is_shadow_present_pte(*spte));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+ need_flush = 1;
+ if (pte_write(*ptep)) {
+ rmap_remove(kvm, spte);
+ __set_spte(spte, shadow_trap_nonpresent_pte);
+ spte = rmap_next(kvm, rmapp, NULL);
+ } else {
+ new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+ if (is_writable_pte(*spte))
+ kvm_set_pfn_dirty(spte_to_pfn(*spte));
+ __set_spte(spte, new_spte);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ }
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data))
{
int i, j;
+ int ret;
int retval = 0;
+ struct kvm_memslots *slots;
- /*
- * If mmap_sem isn't taken, we can look the memslots with only
- * the mmu_lock by skipping over the slots with userspace_addr == 0.
- */
- for (i = 0; i < kvm->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ slots = rcu_dereference(kvm->memslots);
+
+ for (i = 0; i < slots->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &slots->memslots[i];
unsigned long start = memslot->userspace_addr;
unsigned long end;
- /* mmu_lock protects userspace_addr */
- if (!start)
- continue;
-
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+ ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
int idx = gfn_offset;
idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
- retval |= handler(kvm,
- &memslot->lpage_info[j][idx].rmap_pde);
+ ret |= handler(kvm,
+ &memslot->lpage_info[j][idx].rmap_pde,
+ data);
}
+ trace_kvm_age_page(hva, memslot, ret);
+ retval |= ret;
}
}
@@ -802,17 +823,29 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int young = 0;
- /* always return old for EPT */
+ /*
+ * Emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
if (!shadow_accessed_mask)
- return 0;
+ return kvm_unmap_rmapp(kvm, rmapp, data);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -841,13 +874,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
#ifdef MMU_DEBUG
@@ -1569,7 +1602,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
- int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+ int slot = memslot_id(kvm, gfn);
struct kvm_mmu_page *sp = page_header(__pa(pte));
__set_bit(slot, sp->slot_bitmap);
@@ -1593,7 +1626,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
if (gpa == UNMAPPED_GVA)
return NULL;
@@ -1756,7 +1789,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync)
+ bool can_unsync, bool reset_host_protection)
{
u64 spte;
int ret = 0;
@@ -1783,6 +1816,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
+ if (reset_host_protection)
+ spte |= SPTE_HOST_WRITEABLE;
+
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
@@ -1803,7 +1839,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* is responsibility of mmu_get_page / kvm_sync_page.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writeble_pte(*sptep))
+ if (!can_unsync && is_writable_pte(*sptep))
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1811,7 +1847,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte))
+ if (is_writable_pte(spte))
spte &= ~PT_WRITABLE_MASK;
}
}
@@ -1828,10 +1864,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
- pfn_t pfn, bool speculative)
+ pfn_t pfn, bool speculative,
+ bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*sptep);
+ int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1860,7 +1897,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true)) {
+ dirty, level, gfn, pfn, speculative, true,
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1877,12 +1915,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
- if (!is_rmap_spte(*sptep))
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
} else {
- if (was_writeble)
+ if (was_writable)
kvm_release_pfn_dirty(pfn);
else
kvm_release_pfn_clean(pfn);
@@ -1909,7 +1946,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- level, gfn, pfn, false);
+ level, gfn, pfn, false, true);
++vcpu->stat.pf_fixed;
break;
}
@@ -2112,8 +2149,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
{
+ if (error)
+ *error = 0;
return vaddr;
}
@@ -2697,7 +2737,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
if (tdp_enabled)
return 0;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2737,7 +2777,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
if (r)
goto out;
- er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+ er = emulate_instruction(vcpu, cr2, error_code, 0);
switch (er) {
case EMULATE_DONE:
@@ -2748,6 +2788,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
case EMULATE_FAIL:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -2796,16 +2837,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
*/
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
- goto error_1;
+ return -ENOMEM;
+
vcpu->arch.mmu.pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
return 0;
-
-error_1:
- free_mmu_pages(vcpu);
- return -ENOMEM;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2885,10 +2923,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages;
+ int npages, idx;
- if (!down_read_trylock(&kvm->slots_lock))
- continue;
+ idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
npages = kvm->arch.n_alloc_mmu_pages -
kvm->arch.n_free_mmu_pages;
@@ -2901,7 +2938,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
nr_to_scan--;
spin_unlock(&kvm->mmu_lock);
- up_read(&kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
}
if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -2968,9 +3005,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
int i;
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
+ struct kvm_memslots *slots;
- for (i = 0; i < kvm->nmemslots; i++)
- nr_pages += kvm->memslots[i].npages;
+ slots = rcu_dereference(kvm->memslots);
+ for (i = 0; i < slots->nmemslots; i++)
+ nr_pages += slots->memslots[i].npages;
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
@@ -3195,7 +3234,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
audit_mappings_page(vcpu, ent, va, level - 1);
else {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
gfn_t gfn = gpa >> PAGE_SHIFT;
pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3240,10 +3279,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
static int count_rmaps(struct kvm_vcpu *vcpu)
{
int nmaps = 0;
- int i, j, k;
+ int i, j, k, idx;
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = rcu_dereference(kvm->memslots);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_memory_slot *m = &slots->memslots[i];
struct kvm_rmap_desc *d;
for (j = 0; j < m->npages; ++j) {
@@ -3266,6 +3307,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
}
}
}
+ srcu_read_unlock(&kvm->srcu, idx);
return nmaps;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
+#define PFERR_FETCH_MASK (1U << 4)
+
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return vcpu->arch.shadow_efer & EFER_LMA;
-#else
- return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.cr0 & X86_CR0_PG;
-}
-
static inline int is_present_gpte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d2fec9c12d22..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+ if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+ goto not_present;
+
trace_kvm_mmu_paging_element(pte, walker->level);
if (!is_present_gpte(pte))
@@ -160,7 +162,7 @@ walk:
if (rsvd_fault)
goto access_error;
- if (write_fault && !is_writeble_pte(pte))
+ if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
@@ -273,9 +275,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
+ /*
+ * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ */
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true);
+ gpte_to_gfn(gpte), pfn, true, true);
}
/*
@@ -308,7 +314,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
ptwrite, level,
- gw->gfn, pfn, false);
+ gw->gfn, pfn, false, true);
break;
}
@@ -451,8 +457,6 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
- pt_element_t gpte;
- gpa_t pte_gpa = -1;
int level;
u64 *sptep;
int need_flush = 0;
@@ -463,14 +467,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
- /* FIXME: properly handle invlpg on large guest pages */
if (level == PT_PAGE_TABLE_LEVEL ||
((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
-
- pte_gpa = (sp->gfn << PAGE_SHIFT);
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
@@ -489,32 +488,25 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (need_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
spin_unlock(&vcpu->kvm->mmu_lock);
-
- if (pte_gpa == -1)
- return;
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return;
- if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
- if (mmu_topup_memory_caches(vcpu))
- return;
- kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
- sizeof(pt_element_t), 0);
- }
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+ u32 *error)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr,
+ !!(access & PFERR_WRITE_MASK),
+ !!(access & PFERR_USER_MASK),
+ !!(access & PFERR_FETCH_MASK));
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- }
+ } else if (error)
+ *error = walker.error_code;
return gpa;
}
@@ -558,6 +550,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
+ bool reset_host_protection;
offset = nr_present = 0;
@@ -595,9 +588,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+ pte_access &= ~ACC_WRITE_MASK;
+ reset_host_protection = 0;
+ } else {
+ reset_host_protection = 1;
+ }
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
- spte_to_pfn(sp->spt[i]), true, false);
+ spte_to_pfn(sp->spt[i]), true, false,
+ reset_host_protection);
}
return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 944cc9c04b3c..52f78dd03010 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-/* Turn on to get debugging output*/
-/* #define NESTED_DEBUG */
-
-#ifdef NESTED_DEBUG
-#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
-#else
-#define nsvm_printk(fmt, args...) do {} while(0)
-#endif
-
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
+ /* A VMEXIT is required but not yet emulated */
+ bool exit_required;
+
/* cache for intercepts of the guest */
u16 intercept_cr_read;
u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
u32 *msrpm;
struct nested_state nested;
+
+ bool nmi_singlestep;
};
/* enable NPT for AMD64 and X86 with PAE */
@@ -234,7 +231,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
}
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+ if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -316,75 +313,79 @@ static void svm_hardware_disable(void *garbage)
cpu_svm_disable();
}
-static void svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void *garbage)
{
- struct svm_cpu_data *svm_data;
+ struct svm_cpu_data *sd;
uint64_t efer;
struct descriptor_table gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
if (!has_svm()) {
- printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
- return;
+ printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
+ me);
+ return -EINVAL;
}
- svm_data = per_cpu(svm_data, me);
+ sd = per_cpu(svm_data, me);
- if (!svm_data) {
- printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+ if (!sd) {
+ printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
me);
- return;
+ return -EINVAL;
}
- svm_data->asid_generation = 1;
- svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
- svm_data->next_asid = svm_data->max_asid + 1;
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
kvm_get_gdt(&gdt_descr);
gdt = (struct desc_struct *)gdt_descr.base;
- svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+ sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
- rdmsrl(MSR_EFER, efer);
wrmsrl(MSR_EFER, efer | EFER_SVME);
- wrmsrl(MSR_VM_HSAVE_PA,
- page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
+ wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+
+ return 0;
}
static void svm_cpu_uninit(int cpu)
{
- struct svm_cpu_data *svm_data
- = per_cpu(svm_data, raw_smp_processor_id());
+ struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
- if (!svm_data)
+ if (!sd)
return;
per_cpu(svm_data, raw_smp_processor_id()) = NULL;
- __free_page(svm_data->save_area);
- kfree(svm_data);
+ __free_page(sd->save_area);
+ kfree(sd);
}
static int svm_cpu_init(int cpu)
{
- struct svm_cpu_data *svm_data;
+ struct svm_cpu_data *sd;
int r;
- svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!svm_data)
+ sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+ if (!sd)
return -ENOMEM;
- svm_data->cpu = cpu;
- svm_data->save_area = alloc_page(GFP_KERNEL);
+ sd->cpu = cpu;
+ sd->save_area = alloc_page(GFP_KERNEL);
r = -ENOMEM;
- if (!svm_data->save_area)
+ if (!sd->save_area)
goto err_1;
- per_cpu(svm_data, cpu) = svm_data;
+ per_cpu(svm_data, cpu) = sd;
return 0;
err_1:
- kfree(svm_data);
+ kfree(sd);
return r;
}
@@ -476,7 +477,7 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_SVME);
}
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
@@ -510,7 +511,7 @@ static __exit void svm_hardware_unsetup(void)
{
int cpu;
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -539,6 +540,8 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ svm->vcpu.fpu_active = 1;
+
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
@@ -551,13 +554,19 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK;
+ INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
+ INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
+ INTERCEPT_DR7_MASK;
control->intercept_dr_write = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
+ INTERCEPT_DR4_MASK |
INTERCEPT_DR5_MASK |
+ INTERCEPT_DR6_MASK |
INTERCEPT_DR7_MASK;
control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -568,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
+ (1ULL << INTERCEPT_SELECTIVE_CR0) |
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -625,11 +635,12 @@ static void init_vmcb(struct vcpu_svm *svm)
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
- /*
- * cr0 val on cpu init should be 0x60000010, we enable cpu
- * cache by default. the orderly way is to enable cache in bios.
+ /* This is the guest-visible cr0 value.
+ * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
- save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -639,13 +650,9 @@ static void init_vmcb(struct vcpu_svm *svm)
control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
(1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
- control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
- INTERCEPT_CR3_MASK);
+ control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
+ control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
save->g_pat = 0x0007040600070406ULL;
- /* enable caching because the QEMU Bios doesn't enable it */
- save->cr0 = X86_CR0_ET;
save->cr3 = 0;
save->cr4 = 0;
}
@@ -654,6 +661,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
+ if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ control->pause_filter_count = 3000;
+ control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ }
+
enable_gif(svm);
}
@@ -725,7 +737,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
init_vmcb(svm);
fx_init(&svm->vcpu);
- svm->vcpu.fpu_active = 1;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -758,15 +769,18 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int i;
if (unlikely(cpu != vcpu->cpu)) {
- u64 tsc_this, delta;
-
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- rdtscll(tsc_this);
- delta = vcpu->arch.host_tsc - tsc_this;
- svm->vmcb->control.tsc_offset += delta;
+ u64 delta;
+
+ if (check_tsc_unstable()) {
+ /*
+ * Make sure that the guest sees a monotonically
+ * increasing TSC.
+ */
+ delta = vcpu->arch.host_tsc - native_read_tsc();
+ svm->vmcb->control.tsc_offset += delta;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += delta;
+ }
vcpu->cpu = cpu;
kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
@@ -785,7 +799,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
- rdtscll(vcpu->arch.host_tsc);
+ vcpu->arch.host_tsc = native_read_tsc();
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -948,42 +962,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
svm->vmcb->save.gdtr.base = dt->base ;
}
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ if (!svm->vcpu.fpu_active)
+ *hcr0 |= SVM_CR0_SELECTIVE_MASK;
+ else
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+
+ if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ } else {
+ svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
+}
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer |= EFER_LMA;
+ vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
- if (npt_enabled)
- goto set;
+ vcpu->arch.cr0 = cr0;
- if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- vcpu->fpu_active = 1;
- }
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
- vcpu->arch.cr0 = cr0;
- cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+ if (!vcpu->fpu_active)
cr0 |= X86_CR0_TS;
- }
-set:
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -991,6 +1022,7 @@ set:
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ update_cr0_intercept(svm);
}
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1043,7 +1075,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
svm->vmcb->control.intercept_exceptions &=
~((1 << DB_VECTOR) | (1 << BP_VECTOR));
- if (vcpu->arch.singlestep)
+ if (svm->nmi_singlestep)
svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1058,26 +1090,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
vcpu->guest_debug = 0;
}
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
- int old_debug = vcpu->guest_debug;
struct vcpu_svm *svm = to_svm(vcpu);
- vcpu->guest_debug = dbg->control;
-
- update_db_intercept(vcpu);
-
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
else
svm->vmcb->save.dr7 = vcpu->arch.dr7;
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
- svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
- return 0;
+ update_db_intercept(vcpu);
}
static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1094,91 +1116,85 @@ static void save_host_msrs(struct kvm_vcpu *vcpu)
#endif
}
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
- if (svm_data->next_asid > svm_data->max_asid) {
- ++svm_data->asid_generation;
- svm_data->next_asid = 1;
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = 1;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
}
- svm->asid_generation = svm_data->asid_generation;
- svm->vmcb->control.asid = svm_data->next_asid++;
+ svm->asid_generation = sd->asid_generation;
+ svm->vmcb->control.asid = sd->next_asid++;
}
-static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
+static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long val;
switch (dr) {
case 0 ... 3:
- val = vcpu->arch.db[dr];
+ *dest = vcpu->arch.db[dr];
break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr6;
+ *dest = vcpu->arch.dr6;
else
- val = svm->vmcb->save.dr6;
+ *dest = svm->vmcb->save.dr6;
break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- val = vcpu->arch.dr7;
+ *dest = vcpu->arch.dr7;
else
- val = svm->vmcb->save.dr7;
+ *dest = svm->vmcb->save.dr7;
break;
- default:
- val = 0;
}
- return val;
+ return EMULATE_DONE;
}
-static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
- int *exception)
+static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
- *exception = 0;
-
switch (dr) {
case 0 ... 3:
vcpu->arch.db[dr] = value;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = value;
- return;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- *exception = UD_VECTOR;
- return;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 6:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- return;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return EMULATE_FAIL; /* will re-inject UD */
+ /* fall through */
case 7:
- if (value & 0xffffffff00000000ULL) {
- *exception = GP_VECTOR;
- return;
- }
vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
svm->vmcb->save.dr7 = vcpu->arch.dr7;
vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
}
- return;
- default:
- /* FIXME: Possible case? */
- printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __func__, dr);
- *exception = UD_VECTOR;
- return;
+ break;
}
+
+ return EMULATE_DONE;
}
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int pf_interception(struct vcpu_svm *svm)
{
u64 fault_address;
u32 error_code;
@@ -1192,17 +1208,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
-static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int db_interception(struct vcpu_svm *svm)
{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
- !svm->vcpu.arch.singlestep) {
+ !svm->nmi_singlestep) {
kvm_queue_exception(&svm->vcpu, DB_VECTOR);
return 1;
}
- if (svm->vcpu.arch.singlestep) {
- svm->vcpu.arch.singlestep = false;
+ if (svm->nmi_singlestep) {
+ svm->nmi_singlestep = false;
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1221,35 +1239,41 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int bp_interception(struct vcpu_svm *svm)
{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = BP_VECTOR;
return 0;
}
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int ud_interception(struct vcpu_svm *svm)
{
int er;
- er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
- if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
- svm->vmcb->save.cr0 &= ~X86_CR0_TS;
svm->vcpu.fpu_active = 1;
+ update_cr0_intercept(svm);
+}
+static int nm_interception(struct vcpu_svm *svm)
+{
+ svm_fpu_activate(&svm->vcpu);
return 1;
}
-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int mc_interception(struct vcpu_svm *svm)
{
/*
* On an #MC intercept the MCE handler is not called automatically in
@@ -1262,8 +1286,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int shutdown_interception(struct vcpu_svm *svm)
{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
/*
* VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it.
@@ -1275,7 +1301,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 0;
}
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int io_interception(struct vcpu_svm *svm)
{
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
@@ -1289,7 +1315,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (string) {
if (emulate_instruction(&svm->vcpu,
- kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+ 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
@@ -1299,33 +1325,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+ return kvm_emulate_pio(&svm->vcpu, in, size, port);
}
-static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nmi_interception(struct vcpu_svm *svm)
{
return 1;
}
-static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int intr_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.irq_exits;
return 1;
}
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nop_on_interception(struct vcpu_svm *svm)
{
return 1;
}
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int halt_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu);
}
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmmcall_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
@@ -1335,7 +1361,7 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
- if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
+ if (!(svm->vcpu.arch.efer & EFER_SVME)
|| !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1376,8 +1402,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- if (nested_svm_exit_handled(svm)) {
- nsvm_printk("VMexit -> INTR\n");
+ if (svm->nested.intercept & 1ULL) {
+ /*
+ * The #vmexit can't be emulated here directly because this
+ * code path runs with irqs and preemtion disabled. A
+ * #vmexit emulation might sleep. Only signal request for
+ * the #vmexit here.
+ */
+ svm->nested.exit_required = true;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
return 1;
}
@@ -1388,10 +1421,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
{
struct page *page;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
-
if (is_error_page(page))
goto error;
@@ -1530,14 +1560,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
}
default: {
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- nsvm_printk("exit code: 0x%x\n", exit_code);
if (svm->nested.intercept & exit_bits)
vmexit = NESTED_EXIT_DONE;
}
}
if (vmexit == NESTED_EXIT_DONE) {
- nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
nested_svm_vmexit(svm);
}
@@ -1582,6 +1610,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
+ trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+ vmcb->control.exit_info_1,
+ vmcb->control.exit_info_2,
+ vmcb->control.exit_int_info,
+ vmcb->control.exit_int_info_err);
+
nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
if (!nested_vmcb)
return 1;
@@ -1615,6 +1649,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+ /*
+ * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+ * to make sure that we do not lose injected events. So check event_inj
+ * here and copy it to exit_int_info if it is valid.
+ * Exit_int_info and event_inj can't be both valid because the case
+ * below only happens on a VMRUN instruction intercept which has
+ * no valid exit_int_info set.
+ */
+ if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+ struct vmcb_control_area *nc = &nested_vmcb->control;
+
+ nc->exit_int_info = vmcb->control.event_inj;
+ nc->exit_int_info_err = vmcb->control.event_inj_err;
+ }
+
nested_vmcb->control.tlb_ctl = 0;
nested_vmcb->control.event_inj = 0;
nested_vmcb->control.event_inj_err = 0;
@@ -1626,10 +1676,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
/* Restore the original control entries */
copy_vmcb_control_area(vmcb, hsave);
- /* Kill any pending exceptions */
- if (svm->vcpu.arch.exception.pending == true)
- nsvm_printk("WARNING: Pending Exception\n");
-
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1700,6 +1746,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
/* nested_vmcb is our indicator if nested SVM is activated */
svm->nested.vmcb = svm->vmcb->save.rax;
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1712,8 +1764,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.ds = vmcb->save.ds;
hsave->save.gdtr = vmcb->save.gdtr;
hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.shadow_efer;
- hsave->save.cr0 = svm->vcpu.arch.cr0;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = vmcb->save.rflags;
hsave->save.rip = svm->next_rip;
@@ -1787,28 +1839,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.intercept = nested_vmcb->control.intercept;
force_new_asid(&svm->vcpu);
- svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
- svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
- if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
- nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
- nested_vmcb->control.int_ctl);
- }
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
else
svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
- nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
- nested_vmcb->control.exit_int_info,
- nested_vmcb->control.int_state);
-
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
- if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
- nsvm_printk("Injecting Event: 0x%x\n",
- nested_vmcb->control.event_inj);
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
@@ -1835,7 +1874,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}
-static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
@@ -1855,7 +1894,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
@@ -1875,10 +1914,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmrun_interception(struct vcpu_svm *svm)
{
- nsvm_printk("VMrun\n");
-
if (nested_svm_check_permissions(svm))
return 1;
@@ -1905,7 +1942,7 @@ failed:
return 1;
}
-static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int stgi_interception(struct vcpu_svm *svm)
{
if (nested_svm_check_permissions(svm))
return 1;
@@ -1918,7 +1955,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int clgi_interception(struct vcpu_svm *svm)
{
if (nested_svm_check_permissions(svm))
return 1;
@@ -1935,10 +1972,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpga_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- nsvm_printk("INVLPGA\n");
+
+ trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
+ vcpu->arch.regs[VCPU_REGS_RAX]);
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1948,15 +1987,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int invalid_op_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
+static int skinit_interception(struct vcpu_svm *svm)
{
+ trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
-static int task_switch_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
+static int invalid_op_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
{
u16 tss_selector;
int reason;
@@ -2006,14 +2051,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
return kvm_task_switch(&svm->vcpu, tss_selector, reason);
}
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cpuid_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
kvm_emulate_cpuid(&svm->vcpu);
return 1;
}
-static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2021,26 +2066,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
-static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpg_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+ if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
-static int emulate_on_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
+static int emulate_on_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+ if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cr8_write_interception(struct vcpu_svm *svm)
{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+ emulate_instruction(&svm->vcpu, 0, 0, 0);
if (irqchip_in_kernel(svm->vcpu.kvm)) {
svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
return 1;
@@ -2057,10 +2103,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc;
+ u64 tsc_offset;
- rdtscll(tsc);
- *data = svm->vmcb->control.tsc_offset + tsc;
+ if (is_nested(svm))
+ tsc_offset = svm->nested.hsave->control.tsc_offset;
+ else
+ tsc_offset = svm->vmcb->control.tsc_offset;
+
+ *data = tsc_offset + native_read_tsc();
break;
}
case MSR_K6_STAR:
@@ -2122,14 +2172,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
return 0;
}
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int rdmsr_interception(struct vcpu_svm *svm)
{
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data;
- if (svm_get_msr(&svm->vcpu, ecx, &data))
+ if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
- else {
+ } else {
trace_kvm_msr_read(ecx, data);
svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2146,10 +2197,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc;
+ u64 tsc_offset = data - native_read_tsc();
+ u64 g_tsc_offset = 0;
+
+ if (is_nested(svm)) {
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = tsc_offset;
+ }
+
+ svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
- rdtscll(tsc);
- svm->vmcb->control.tsc_offset = data - tsc;
break;
}
case MSR_K6_STAR:
@@ -2208,33 +2266,36 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
return 0;
}
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int wrmsr_interception(struct vcpu_svm *svm)
{
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data))
+ if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
- else
+ } else {
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(&svm->vcpu);
+ }
return 1;
}
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int msr_interception(struct vcpu_svm *svm)
{
if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm, kvm_run);
+ return wrmsr_interception(svm);
else
- return rdmsr_interception(svm, kvm_run);
+ return rdmsr_interception(svm);
}
-static int interrupt_window_interception(struct vcpu_svm *svm,
- struct kvm_run *kvm_run)
+static int interrupt_window_interception(struct vcpu_svm *svm)
{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
@@ -2252,13 +2313,18 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
return 1;
}
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
- struct kvm_run *kvm_run) = {
+static int pause_interception(struct vcpu_svm *svm)
+{
+ kvm_vcpu_on_spin(&(svm->vcpu));
+ return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR0] = emulate_on_interception,
[SVM_EXIT_READ_CR3] = emulate_on_interception,
[SVM_EXIT_READ_CR4] = emulate_on_interception,
[SVM_EXIT_READ_CR8] = emulate_on_interception,
- /* for now: */
+ [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = emulate_on_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2267,11 +2333,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
[SVM_EXIT_READ_DR3] = emulate_on_interception,
+ [SVM_EXIT_READ_DR4] = emulate_on_interception,
+ [SVM_EXIT_READ_DR5] = emulate_on_interception,
+ [SVM_EXIT_READ_DR6] = emulate_on_interception,
+ [SVM_EXIT_READ_DR7] = emulate_on_interception,
[SVM_EXIT_WRITE_DR0] = emulate_on_interception,
[SVM_EXIT_WRITE_DR1] = emulate_on_interception,
[SVM_EXIT_WRITE_DR2] = emulate_on_interception,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
+ [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2288,6 +2360,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2301,26 +2374,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_VMSAVE] = vmsave_interception,
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
- [SVM_EXIT_SKINIT] = invalid_op_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
[SVM_EXIT_NPF] = pf_interception,
};
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+ if (unlikely(svm->nested.exit_required)) {
+ nested_svm_vmexit(svm);
+ svm->nested.exit_required = false;
+
+ return 1;
+ }
+
if (is_nested(svm)) {
int vmexit;
- nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
- exit_code, svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+ trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+ svm->vmcb->control.exit_info_1,
+ svm->vmcb->control.exit_info_2,
+ svm->vmcb->control.exit_int_info,
+ svm->vmcb->control.exit_int_info_err);
vmexit = nested_svm_exit_special(svm);
@@ -2333,20 +2416,10 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
- if (npt_enabled) {
- int mmu_reload = 0;
- if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
- svm_set_cr0(vcpu, svm->vmcb->save.cr0);
- mmu_reload = 1;
- }
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
- if (mmu_reload) {
- kvm_mmu_reset_context(vcpu);
- kvm_mmu_load(vcpu);
- }
- }
-
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2370,15 +2443,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
- return svm_exit_handlers[exit_code](svm, kvm_run);
+ return svm_exit_handlers[exit_code](svm);
}
static void reload_tss(struct kvm_vcpu *vcpu)
{
int cpu = raw_smp_processor_id();
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
- svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ sd->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
}
@@ -2386,12 +2459,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
{
int cpu = raw_smp_processor_id();
- struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != svm_data->asid_generation)
- new_asid(svm, svm_data);
+ if (svm->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
}
static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2447,20 +2520,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
!(svm->vcpu.arch.hflags & HF_NMI_MASK);
}
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (masked) {
+ svm->vcpu.arch.hflags |= HF_NMI_MASK;
+ svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+ } else {
+ svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+ svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+ }
+}
+
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- return (vmcb->save.rflags & X86_EFLAGS_IF) &&
- !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- gif_set(svm) &&
- !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
+ int ret;
+
+ if (!gif_set(svm) ||
+ (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+ return 0;
+
+ ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+
+ if (is_nested(svm))
+ return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+ return ret;
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nsvm_printk("Trying to open IRQ window\n");
nested_svm_intr(svm);
@@ -2485,7 +2585,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
/* Something prevents NMI from been injected. Single step over
possible problem (IRET or exception injection or interrupt
shadow) */
- vcpu->arch.singlestep = true;
+ svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
update_db_intercept(vcpu);
}
@@ -2575,13 +2675,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
#define R "e"
#endif
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u16 fs_selector;
u16 gs_selector;
u16 ldt_selector;
+ /*
+ * A vmexit emulation is required before the vcpu can be executed
+ * again.
+ */
+ if (unlikely(svm->nested.exit_required))
+ return;
+
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2714,12 +2821,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
-
- if (vcpu->fpu_active) {
- svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
- svm->vmcb->save.cr0 |= X86_CR0_TS;
- vcpu->fpu_active = 0;
- }
}
static int is_disabled(void)
@@ -2768,6 +2869,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+}
+
static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_READ_CR0, "read_cr0" },
{ SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2821,9 +2926,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ -1, NULL }
};
-static bool svm_gb_page_enable(void)
+static int svm_get_lpage_level(void)
{
- return true;
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return false;
+}
+
+static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ update_cr0_intercept(svm);
+ svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -2852,6 +2970,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_segment = svm_set_segment,
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -2866,6 +2985,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .fpu_activate = svm_fpu_activate,
+ .fpu_deactivate = svm_fpu_deactivate,
.tlb_flush = svm_flush_tlb,
@@ -2880,6 +3001,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.queue_exception = svm_queue_exception,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
@@ -2889,7 +3012,11 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_mt_mask = svm_get_mt_mask,
.exit_reasons_str = svm_exit_reasons_str,
- .gb_page_enable = svm_gb_page_enable,
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
);
/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, code )
+ __field( bool, fast )
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ ),
+
+ TP_fast_assign(
+ __entry->code = code;
+ __entry->fast = fast;
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
* Tracepoint for PIO.
*/
TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
* Tracepoint for guest MSR access.
*/
TRACE_EVENT(kvm_msr,
- TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
- TP_ARGS(rw, ecx, data),
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
TP_STRUCT__entry(
- __field( unsigned int, rw )
- __field( unsigned int, ecx )
- __field( unsigned long, data )
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
),
TP_fast_assign(
- __entry->rw = rw;
+ __entry->write = write;
__entry->ecx = ecx;
__entry->data = data;
+ __entry->exception = exception;
),
- TP_printk("msr_%s %x = 0x%lx",
- __entry->rw ? "write" : "read",
- __entry->ecx, __entry->data)
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
);
-#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
-#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
/*
* Tracepoint for guest CR access.
@@ -349,6 +386,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry->coalesced ? " (coalesced)" : "")
);
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool npt),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, npt )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->npt = npt;
+ ),
+
+ TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+ "event_inj: 0x%08x npt: %s\n",
+ __entry->rip, __entry->vmcb, __entry->nested_rip,
+ __entry->int_ctl, __entry->event_inj,
+ __entry->npt ? "on" : "off")
+);
+
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+ TP_PROTO(__u64 rip, __u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err),
+ TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ ),
+ TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ __entry->rip,
+ ftrace_print_symbols_seq(p, __entry->exit_code,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ ),
+
+ TP_printk("reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ ftrace_print_symbols_seq(p, __entry->exit_code,
+ kvm_x86_ops->exit_reasons_str),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip
+ ),
+
+ TP_printk("rip: 0x%016llx\n", __entry->rip)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+ __entry->rip, __entry->slb)
+);
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3812014bd0b..14873b9f8430 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,52 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK \
+ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
+ (X86_CR0_WP | X86_CR0_NE)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT)
+
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicate if ple enabled.
+ * According to test, this time is usually small than 41 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+#define KVM_VMX_DEFAULT_PLE_GAP 41
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
+module_param(ple_gap, int, S_IRUGO);
+
+static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, int, S_IRUGO);
+
struct vmcs {
u32 revision_id;
u32 abort;
char data[0];
};
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+ u64 mask;
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
struct list_head local_vcpus_link;
@@ -74,13 +114,12 @@ struct vcpu_vmx {
int launched;
u8 fail;
u32 idt_vectoring_info;
- struct kvm_msr_entry *guest_msrs;
- struct kvm_msr_entry *host_msrs;
+ struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
- int msr_offset_efer;
#ifdef CONFIG_X86_64
- int msr_offset_kernel_gs_base;
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
struct {
@@ -88,7 +127,6 @@ struct vcpu_vmx {
u16 fs_sel, gs_sel, ldt_sel;
int gs_ldt_reload_needed;
int fs_reload_needed;
- int guest_efer_loaded;
} host_state;
struct {
int vm86_active;
@@ -107,13 +145,14 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
- enum emulation_result invalid_state_emulation_result;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+
+ bool rdtscp_enabled;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -176,6 +215,8 @@ static struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
+static u64 host_efer;
+
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
@@ -184,28 +225,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- rdmsrl(e[i].index, e[i].data);
-}
-
static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -293,6 +318,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
}
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+}
+
static inline int cpu_has_vmx_invept_individual_addr(void)
{
return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -320,11 +350,15 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
+static inline int cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
- return flexpriority_enabled &&
- (cpu_has_vmx_virtualize_apic_accesses()) &&
- (irqchip_in_kernel(kvm));
+ return flexpriority_enabled && irqchip_in_kernel(kvm);
}
static inline int cpu_has_vmx_vpid(void)
@@ -333,6 +367,12 @@ static inline int cpu_has_vmx_vpid(void)
SECONDARY_EXEC_ENABLE_VPID;
}
+static inline int cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
static inline int cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -348,7 +388,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
int i;
for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx->guest_msrs[i].index == msr)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
return i;
return -1;
}
@@ -379,7 +419,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
: : "a" (&operand), "c" (ext) : "cc", "memory");
}
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -537,22 +577,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
- if (!vcpu->fpu_active)
- eb |= 1u << NM_VECTOR;
- /*
- * Unconditionally intercept #DB so we can maintain dr6 without
- * reading it every exit.
- */
- eb |= 1u << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- eb |= 1u << BP_VECTOR;
- }
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (vcpu->fpu_active)
+ eb &= ~(1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -570,17 +606,12 @@ static void reload_tss(void)
load_TR_desc();
}
-static void load_transition_efer(struct vcpu_vmx *vmx)
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
- int efer_offset = vmx->msr_offset_efer;
- u64 host_efer;
u64 guest_efer;
u64 ignore_bits;
- if (efer_offset < 0)
- return;
- host_efer = vmx->host_msrs[efer_offset].data;
- guest_efer = vmx->guest_msrs[efer_offset].data;
+ guest_efer = vmx->vcpu.arch.efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -593,27 +624,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
- if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
- return;
-
- vmx->host_state.guest_efer_loaded = 1;
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
- wrmsrl(MSR_EFER, guest_efer);
- vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
- if (vmx->host_state.guest_efer_loaded) {
- vmx->host_state.guest_efer_loaded = 0;
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
- }
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+ return true;
}
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int i;
if (vmx->host_state.loaded)
return;
@@ -650,13 +671,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- save_msrs(vmx->host_msrs +
- vmx->msr_offset_kernel_gs_base, 1);
-
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ }
#endif
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_transition_efer(vmx);
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
}
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +707,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
local_irq_restore(flags);
}
reload_tss();
- save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ }
+#endif
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -709,7 +735,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
kvm_migrate_timers(vcpu);
- vpid_sync_vcpu_all(vmx);
+ set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
local_irq_disable();
list_add(&vmx->local_vcpus_link,
&per_cpu(vcpus_on_cpu, cpu));
@@ -763,22 +789,30 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
+ ulong cr0;
+
if (vcpu->fpu_active)
return;
vcpu->fpu_active = 1;
- vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
- if (vcpu->arch.cr0 & X86_CR0_TS)
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ cr0 = vmcs_readl(GUEST_CR0);
+ cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
+ cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
+ vmcs_writel(GUEST_CR0, cr0);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active)
- return;
- vcpu->fpu_active = 0;
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+ vmx_decache_cr0_guest_bits(vcpu);
+ vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = 0;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -874,22 +908,22 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
-#ifdef CONFIG_X86_64
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
- struct kvm_msr_entry tmp;
+ struct shared_msr_entry tmp;
tmp = vmx->guest_msrs[to];
vmx->guest_msrs[to] = vmx->guest_msrs[from];
vmx->guest_msrs[from] = tmp;
- tmp = vmx->host_msrs[to];
- vmx->host_msrs[to] = vmx->host_msrs[from];
- vmx->host_msrs[from] = tmp;
}
-#endif
/*
* Set up the vmcs to automatically save and restore system
@@ -898,15 +932,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs;
+ int save_nmsrs, index;
unsigned long *msr_bitmap;
vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
- int index;
-
index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
@@ -916,25 +948,23 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
- if (index >= 0)
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_K6_STAR);
- if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
- vmx->save_nmsrs = save_nmsrs;
+ index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx, index))
+ move_msr_up(vmx, index, save_nmsrs++);
-#ifdef CONFIG_X86_64
- vmx->msr_offset_kernel_gs_base =
- __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
- vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+ vmx->save_nmsrs = save_nmsrs;
if (cpu_has_vmx_msr_bitmap()) {
if (is_long_mode(&vmx->vcpu))
@@ -976,7 +1006,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
u64 data;
- struct kvm_msr_entry *msr;
+ struct shared_msr_entry *msr;
if (!pdata) {
printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +1021,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_GS_BASE:
data = vmcs_readl(GUEST_GS_BASE);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(to_vmx(vcpu));
+ data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ break;
+#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
case MSR_IA32_TSC:
data = guest_read_tsc();
break;
@@ -1006,7 +1040,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_IA32_SYSENTER_ESP:
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
+ case MSR_TSC_AUX:
+ if (!to_vmx(vcpu)->rdtscp_enabled)
+ return 1;
+ /* Otherwise falls through */
default:
+ vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1067,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr;
+ struct shared_msr_entry *msr;
u64 host_tsc;
int ret = 0;
@@ -1044,6 +1083,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_GS_BASE:
vmcs_writel(GUEST_GS_BASE, data);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(vmx);
+ vmx->msr_guest_kernel_gs_base = data;
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1064,7 +1107,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vcpu->arch.pat = data;
break;
}
- /* Otherwise falls through to kvm_set_msr_common */
+ ret = kvm_set_msr_common(vcpu, msr_index, data);
+ break;
+ case MSR_TSC_AUX:
+ if (!vmx->rdtscp_enabled)
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1097,30 +1148,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
- int old_debug = vcpu->guest_debug;
- unsigned long flags;
-
- vcpu->guest_debug = dbg->control;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
- vcpu->guest_debug = 0;
-
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
else
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- flags = vmcs_readl(GUEST_RFLAGS);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
- flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- vmcs_writel(GUEST_RFLAGS, flags);
-
update_exception_bitmap(vcpu);
-
- return 0;
}
static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1174,15 @@ static __init int vmx_disabled_by_bios(void)
/* locked but not enabled */
}
-static void hardware_enable(void *garbage)
+static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old;
+ if (read_cr4() & X86_CR4_VMXE)
+ return -EBUSY;
+
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1197,10 @@ static void hardware_enable(void *garbage)
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&phys_addr), "m"(phys_addr)
: "memory", "cc");
+
+ ept_sync_global();
+
+ return 0;
}
static void vmclear_local_vcpus(void)
@@ -1232,6 +1274,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
@@ -1250,7 +1294,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ SECONDARY_EXEC_UNRESTRICTED_GUEST |
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_RDTSCP;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1390,17 @@ static void free_kvm_area(void)
{
int cpu;
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu) {
free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
}
static __init int alloc_kvm_area(void)
{
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1442,9 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
+ if (!cpu_has_vmx_ple())
+ ple_gap = 0;
+
return alloc_kvm_area();
}
@@ -1459,8 +1510,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
static gva_t rmode_tss_base(struct kvm *kvm)
{
if (!kvm->arch.tss_addr) {
- gfn_t base_gfn = kvm->memslots[0].base_gfn +
- kvm->memslots[0].npages - 3;
+ struct kvm_memslots *slots;
+ gfn_t base_gfn;
+
+ slots = rcu_dereference(kvm->memslots);
+ base_gfn = kvm->memslots->memslots[0].base_gfn +
+ kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
@@ -1536,11 +1591,17 @@ continue_rmode:
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
- vcpu->arch.shadow_efer = efer;
if (!msr)
return;
+
+ /*
+ * Force kernel_gs_base reloading before EFER changes, as control
+ * of this msr depends on is_long_mode().
+ */
+ vmx_load_host_state(to_vmx(vcpu));
+ vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1570,13 +1631,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
- vcpu->arch.shadow_efer |= EFER_LMA;
- vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
+ vcpu->arch.efer |= EFER_LMA;
+ vmx_set_efer(vcpu, vcpu->arch.efer);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vcpu->arch.shadow_efer &= ~EFER_LMA;
+ vcpu->arch.efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1592,10 +1653,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
- vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1640,7 +1711,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1648,23 +1719,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, vcpu->arch.cr4);
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
if (!(cr0 & X86_CR0_WP))
*hw_cr0 &= ~X86_CR0_WP;
}
-static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
- struct kvm_vcpu *vcpu)
-{
- if (!is_paging(vcpu)) {
- *hw_cr4 &= ~X86_CR4_PAE;
- *hw_cr4 |= X86_CR4_PSE;
- } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
- *hw_cr4 &= ~X86_CR4_PAE;
-}
-
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1676,8 +1737,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
else
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
- vmx_fpu_deactivate(vcpu);
-
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -1685,7 +1744,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.shadow_efer & EFER_LME) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1696,12 +1755,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (!vcpu->fpu_active)
+ hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
+
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
-
- if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
- vmx_fpu_activate(vcpu);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -1727,12 +1786,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vmcs_write64(EPT_POINTER, eptp);
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
vcpu->kvm->arch.ept_identity_map_addr;
+ ept_load_pdptrs(vcpu);
}
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, guest_cr3);
- if (vcpu->arch.cr0 & X86_CR0_PE)
- vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1741,8 +1799,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
- if (enable_ept)
- ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1780,7 +1844,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+ if (!is_protmode(vcpu))
return 0;
if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2035,7 +2099,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
/* real mode guest state checks */
- if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+ if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2168,7 +2232,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2181,7 +2245,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2190,7 +2254,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.ept_identity_pagetable)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2205,7 +2269,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2302,13 +2366,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!enable_ept)
+ if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (!ple_gap)
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
+ if (ple_gap) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmcs_write32(PLE_WINDOW, ple_window);
+ }
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2368,18 +2441,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
for (i = 0; i < NR_VMX_MSR; ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
- u64 data;
int j = vmx->nmsrs;
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
- data = data_low | ((u64)data_high << 32);
- vmx->host_msrs[j].index = index;
- vmx->host_msrs[j].reserved = 0;
- vmx->host_msrs[j].data = data;
- vmx->guest_msrs[j] = vmx->host_msrs[j];
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ vmx->guest_msrs[j].mask = -1ull;
++vmx->nmsrs;
}
@@ -2389,7 +2459,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
rdtscll(tsc_this);
@@ -2414,10 +2487,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret;
+ int ret, idx;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2510,8 +2583,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx->vcpu.arch.cr0 = 0x60000010;
- vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+ vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
vmx_set_efer(&vmx->vcpu, 0);
vmx_fpu_activate(&vmx->vcpu);
@@ -2525,7 +2598,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2627,6 +2700,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ if (!cpu_has_virtual_nmis())
+ return to_vmx(vcpu)->soft_vnmi_blocked;
+ else
+ return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ GUEST_INTR_STATE_NMI);
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!cpu_has_virtual_nmis()) {
+ if (vmx->soft_vnmi_blocked != masked) {
+ vmx->soft_vnmi_blocked = masked;
+ vmx->vnmi_blocked_time = 0;
+ }
+ } else {
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2760,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@@ -2674,6 +2775,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
kvm_queue_exception(vcpu, vec);
return 1;
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return 0;
/* fall through */
@@ -2710,15 +2817,16 @@ static void kvm_machine_check(void)
#endif
}
-static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_machine_check(struct kvm_vcpu *vcpu)
{
/* already handled by vcpu_run */
return 1;
}
-static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
unsigned long cr2, rip, dr6;
u32 vect_info;
@@ -2728,12 +2836,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if (is_machine_check(intr_info))
- return handle_machine_check(vcpu, kvm_run);
+ return handle_machine_check(vcpu);
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info))
- printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
- "intr info 0x%x\n", __func__, vect_info, intr_info);
+ !is_page_fault(intr_info)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ return 0;
+ }
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2857,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -2790,6 +2903,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
/* fall through */
case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
@@ -2803,20 +2923,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
}
-static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
{
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
-static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int size, in, string;
@@ -2827,8 +2946,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
string = (exit_qualification & 16) != 0;
if (string) {
- if (emulate_instruction(vcpu,
- kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+ if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
@@ -2838,7 +2956,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
port = exit_qualification >> 16;
skip_emulated_instruction(vcpu);
- return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
+ return kvm_emulate_pio(vcpu, in, size, port);
}
static void
@@ -2852,7 +2970,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
int cr;
@@ -2887,17 +3005,16 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
if (cr8_prev <= cr8)
return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
};
break;
case 2: /* clts */
- vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
- vmx_fpu_activate(vcpu);
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
skip_emulated_instruction(vcpu);
+ vmx_fpu_activate(vcpu);
return 1;
case 1: /*mov from cr*/
switch (cr) {
@@ -2915,25 +3032,37 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
break;
case 3: /* lmsw */
- kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
skip_emulated_instruction(vcpu);
return 1;
default:
break;
}
- kvm_run->exit_reason = 0;
+ vcpu->run->exit_reason = 0;
pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
(int)(exit_qualification >> 4) & 3, cr);
return 0;
}
-static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int check_dr_alias(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return -1;
+ }
+ return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
unsigned long val;
int dr, reg;
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
if (!kvm_require_cpl(vcpu, 0))
return 1;
dr = vmcs_readl(GUEST_DR7);
@@ -2944,13 +3073,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
- kvm_run->debug.arch.dr7 = dr;
- kvm_run->debug.arch.pc =
+ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr7 = dr;
+ vcpu->run->debug.arch.pc =
vmcs_readl(GUEST_CS_BASE) +
vmcs_readl(GUEST_RIP);
- kvm_run->debug.arch.exception = DB_VECTOR;
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
return 0;
} else {
vcpu->arch.dr7 &= ~DR7_GD;
@@ -2969,14 +3098,20 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
case 0 ... 3:
val = vcpu->arch.db[dr];
break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
val = vcpu->arch.dr6;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
val = vcpu->arch.dr7;
break;
- default:
- val = 0;
}
kvm_register_write(vcpu, reg, val);
} else {
@@ -2987,21 +3122,25 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
- case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- break;
+ case 4:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
case 6:
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
- case 7:
+ case 5:
+ if (check_dr_alias(vcpu) < 0)
+ return 1;
+ /* fall through */
+ default: /* 7 */
if (val & 0xffffffff00000000ULL) {
- kvm_queue_exception(vcpu, GP_VECTOR);
- break;
+ kvm_inject_gp(vcpu, 0);
+ return 1;
}
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
@@ -3016,18 +3155,19 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_cpuid(struct kvm_vcpu *vcpu)
{
kvm_emulate_cpuid(vcpu);
return 1;
}
-static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data;
if (vmx_get_msr(vcpu, ecx, &data)) {
+ trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -3041,31 +3181,29 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- trace_kvm_msr_write(ecx, data);
-
if (vmx_set_msr(vcpu, ecx, data) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
}
+ trace_kvm_msr_write(ecx, data);
skip_emulated_instruction(vcpu);
return 1;
}
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int handle_interrupt_window(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
@@ -3081,34 +3219,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
* possible
*/
if (!irqchip_in_kernel(vcpu->kvm) &&
- kvm_run->request_interrupt_window &&
+ vcpu->run->request_interrupt_window &&
!kvm_cpu_has_interrupt(vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_halt(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
return kvm_emulate_halt(vcpu);
}
-static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmcall(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
kvm_emulate_hypercall(vcpu);
return 1;
}
-static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_vmx_insn(struct kvm_vcpu *vcpu)
{
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3117,14 +3255,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
/* TODO: Add support for VT-d/pass-through device */
return 1;
}
-static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_apic_access(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
enum emulation_result er;
@@ -3133,7 +3271,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
offset = exit_qualification & 0xffful;
- er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+ er = emulate_instruction(vcpu, 0, 0, 0);
if (er != EMULATE_DONE) {
printk(KERN_ERR
@@ -3144,7 +3282,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
@@ -3198,7 +3336,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
@@ -3219,8 +3357,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vmcs_readl(GUEST_LINEAR_ADDRESS));
printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
(long unsigned int)exit_qualification);
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
return 0;
}
@@ -3290,7 +3428,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
}
}
-static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
u64 sptes[4];
int nr_sptes, i;
@@ -3306,13 +3444,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
return 0;
}
-static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
@@ -3325,36 +3463,55 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
-static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
enum emulation_result err = EMULATE_DONE;
-
- local_irq_enable();
- preempt_enable();
+ int ret = 1;
while (!guest_state_valid(vcpu)) {
- err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+ err = emulate_instruction(vcpu, 0, 0, 0);
- if (err == EMULATE_DO_MMIO)
- break;
+ if (err == EMULATE_DO_MMIO) {
+ ret = 0;
+ goto out;
+ }
if (err != EMULATE_DONE) {
- kvm_report_emulation_failure(vcpu, "emulation failure");
- break;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ ret = 0;
+ goto out;
}
if (signal_pending(current))
- break;
+ goto out;
if (need_resched())
schedule();
}
- preempt_disable();
- local_irq_disable();
+ vmx->emulation_required = 0;
+out:
+ return ret;
+}
- vmx->invalid_state_emulation_result = err;
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ kvm_vcpu_on_spin(vcpu);
+
+ return 1;
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
/*
@@ -3362,8 +3519,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run) = {
+static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3550,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
};
static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3562,7 @@ static const int kvm_vmx_max_exit_handlers =
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3570,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
- /* If we need to emulate an MMIO from handle_invalid_guest_state
- * we just return 0 */
- if (vmx->emulation_required && emulate_invalid_guest_state) {
- if (guest_state_valid(vcpu))
- vmx->emulation_required = 0;
- return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
- }
+ /* If guest state is invalid, start emulating */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return handle_invalid_guest_state(vcpu);
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
@@ -3425,8 +3580,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
if (unlikely(vmx->fail)) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
return 0;
}
@@ -3459,10 +3614,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_reason;
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = exit_reason;
}
return 0;
}
@@ -3600,23 +3755,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
#define Q "l"
#endif
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (enable_ept && is_paging(vcpu)) {
- vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
- ept_load_pdptrs(vcpu);
- }
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
- /* Handle invalid guest state instead of entering VMX */
- if (vmx->emulation_required && emulate_invalid_guest_state) {
- handle_invalid_guest_state(vcpu, kvm_run);
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
return;
- }
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3636,9 +3786,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
*/
vmcs_writel(HOST_CR0, read_cr0());
- if (vcpu->arch.switch_db_regs)
- set_debugreg(vcpu->arch.dr6, 6);
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3739,9 +3886,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
| (1 << VCPU_EXREG_PDPTR));
vcpu->arch.regs_dirty = 0;
- if (vcpu->arch.switch_db_regs)
- get_debugreg(vcpu->arch.dr6, 6);
-
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3775,7 +3919,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
vmx_free_vmcs(vcpu);
- kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3945,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto uninit_vcpu;
}
- vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->host_msrs)
- goto free_guest_msrs;
-
vmx->vmcs = alloc_vmcs();
if (!vmx->vmcs)
goto free_msrs;
@@ -3836,8 +3975,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
free_vmcs:
free_vmcs(vmx->vmcs);
free_msrs:
- kfree(vmx->host_msrs);
-free_guest_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
@@ -3877,7 +4014,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* b. VT-d with snooping control feature: snooping control feature of
* VT-d engine can guarantee the cache correctness. Just set it
* to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
if (is_mmio)
@@ -3888,37 +4025,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
VMX_EPT_MT_EPTE_SHIFT;
else
ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IGMT_BIT;
+ | VMX_EPT_IPAT_BIT;
return ret;
}
+#define _ER(x) { EXIT_REASON_##x, #x }
+
static const struct trace_print_flags vmx_exit_reasons_str[] = {
- { EXIT_REASON_EXCEPTION_NMI, "exception" },
- { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
- { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
- { EXIT_REASON_NMI_WINDOW, "nmi_window" },
- { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
- { EXIT_REASON_CR_ACCESS, "cr_access" },
- { EXIT_REASON_DR_ACCESS, "dr_access" },
- { EXIT_REASON_CPUID, "cpuid" },
- { EXIT_REASON_MSR_READ, "rdmsr" },
- { EXIT_REASON_MSR_WRITE, "wrmsr" },
- { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
- { EXIT_REASON_HLT, "halt" },
- { EXIT_REASON_INVLPG, "invlpg" },
- { EXIT_REASON_VMCALL, "hypercall" },
- { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
- { EXIT_REASON_APIC_ACCESS, "apic_access" },
- { EXIT_REASON_WBINVD, "wbinvd" },
- { EXIT_REASON_TASK_SWITCH, "task_switch" },
- { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
+ _ER(EXCEPTION_NMI),
+ _ER(EXTERNAL_INTERRUPT),
+ _ER(TRIPLE_FAULT),
+ _ER(PENDING_INTERRUPT),
+ _ER(NMI_WINDOW),
+ _ER(TASK_SWITCH),
+ _ER(CPUID),
+ _ER(HLT),
+ _ER(INVLPG),
+ _ER(RDPMC),
+ _ER(RDTSC),
+ _ER(VMCALL),
+ _ER(VMCLEAR),
+ _ER(VMLAUNCH),
+ _ER(VMPTRLD),
+ _ER(VMPTRST),
+ _ER(VMREAD),
+ _ER(VMRESUME),
+ _ER(VMWRITE),
+ _ER(VMOFF),
+ _ER(VMON),
+ _ER(CR_ACCESS),
+ _ER(DR_ACCESS),
+ _ER(IO_INSTRUCTION),
+ _ER(MSR_READ),
+ _ER(MSR_WRITE),
+ _ER(MWAIT_INSTRUCTION),
+ _ER(MONITOR_INSTRUCTION),
+ _ER(PAUSE_INSTRUCTION),
+ _ER(MCE_DURING_VMENTRY),
+ _ER(TPR_BELOW_THRESHOLD),
+ _ER(APIC_ACCESS),
+ _ER(EPT_VIOLATION),
+ _ER(EPT_MISCONFIG),
+ _ER(WBINVD),
{ -1, NULL }
};
-static bool vmx_gb_page_enable(void)
+#undef _ER
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
- return false;
+ struct kvm_cpuid_entry2 *best;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control;
+
+ vmx->rdtscp_enabled = false;
+ if (vmx_rdtscp_supported()) {
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ if (exec_control & SECONDARY_EXEC_RDTSCP) {
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
+ vmx->rdtscp_enabled = true;
+ else {
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ }
+ }
+ }
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -3947,6 +4135,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_segment = vmx_set_segment,
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -3959,6 +4148,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .fpu_activate = vmx_fpu_activate,
+ .fpu_deactivate = vmx_fpu_deactivate,
.tlb_flush = vmx_flush_tlb,
@@ -3973,6 +4164,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.queue_exception = vmx_queue_exception,
.interrupt_allowed = vmx_interrupt_allowed,
.nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
@@ -3982,12 +4175,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_mt_mask = vmx_get_mt_mask,
.exit_reasons_str = vmx_exit_reasons_str,
- .gb_page_enable = vmx_gb_page_enable,
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
};
static int __init vmx_init(void)
{
- int r;
+ int r, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < NR_VMX_MSR; ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_a)
@@ -4049,8 +4251,6 @@ static int __init vmx_init(void)
if (bypass_guest_pf)
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
- ept_sync_global();
-
return 0;
out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be451ee44249..e46282a56565 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,14 @@
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "trace.h"
+#include <asm/debugreg.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
@@ -87,6 +90,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ u32 msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +145,83 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_shared_msrs *locals
+ = container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
+
+ for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+ locals->registered = false;
+ user_return_notifier_unregister(urn);
+}
+
+static void shared_msr_update(unsigned slot, u32 msr)
+{
+ struct kvm_shared_msrs *smsr;
+ u64 value;
+
+ smsr = &__get_cpu_var(shared_msrs);
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+ shared_msrs_global.msrs[slot] = msr;
+ /* we need ensured the shared_msr_global have been updated */
+ smp_wmb();
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+ unsigned i;
+
+ for (i = 0; i < shared_msrs_global.nr; ++i)
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ return;
+ smsr->values[slot].curr = value;
+ wrmsrl(shared_msrs_global.msrs[slot], value);
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&smsr->urn);
+ smsr->registered = true;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void *ignore)
+{
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (smsr->registered)
+ kvm_on_user_return(&smsr->urn);
+}
+
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
@@ -170,12 +269,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ if (!vcpu->arch.exception.pending) {
+ queue:
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /* generate double fault per SDM Table 5-5 */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.nr = nr;
+ kvm_multiple_exception(vcpu, nr, false, 0);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
@@ -183,25 +338,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
-
- if (vcpu->arch.exception.pending) {
- switch(vcpu->arch.exception.nr) {
- case DF_VECTOR:
- /* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- return;
- case PF_VECTOR:
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- return;
- default:
- /* replace previous exception with a new one in a hope
- that instruction re-execution will regenerate lost
- exception */
- vcpu->arch.exception.pending = false;
- break;
- }
- }
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
@@ -214,11 +350,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- WARN_ON(vcpu->arch.exception.pending);
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = nr;
- vcpu->arch.exception.error_code = error_code;
+ kvm_multiple_exception(vcpu, nr, true, error_code);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
@@ -296,12 +428,18 @@ out:
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- if (cr0 & CR0_RESERVED_BITS) {
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, vcpu->arch.cr0);
+ cr0, kvm_read_cr0(vcpu));
kvm_inject_gp(vcpu, 0);
return;
}
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
@@ -318,7 +456,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.shadow_efer & EFER_LME)) {
+ if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu)) {
@@ -356,13 +494,13 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
if (cr4 & CR4_RESERVED_BITS) {
@@ -484,16 +622,21 @@ static inline u32 bit(int bitno)
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
* This list is modified at module load time to reflect the
- * capabilities of the host cpu.
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in the beginning of the list.
*/
+
+#define KVM_SAVE_MSRS_BEGIN 5
static u32 msrs_to_save[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
@@ -512,7 +655,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
}
if (is_paging(vcpu)
- && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+ && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
@@ -543,9 +686,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
- efer |= vcpu->arch.shadow_efer & EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
- vcpu->arch.shadow_efer = efer;
+ vcpu->arch.efer = efer;
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
@@ -580,7 +723,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
struct pvclock_wall_clock wc;
- struct timespec now, sys, boot;
+ struct timespec boot;
if (!wall_clock)
return;
@@ -595,9 +738,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
- now = current_kernel_time();
- ktime_get_ts(&sys);
- boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+ getboottime(&boot);
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
@@ -672,12 +813,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
ktime_get_ts(&ts);
+ monotonic_to_bootbased(&ts);
local_irq_restore(flags);
/* With all the info we got, fill in the values */
vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec);
+ (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -835,6 +978,132 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int lm = is_long_mode(vcpu);
+ u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+ : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ u8 *page;
+ int r;
+
+ r = -E2BIG;
+ if (page_num >= blob_size)
+ goto out;
+ r = -ENOMEM;
+ page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!page)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
+ goto out_free;
+ if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+ goto out_free;
+ r = 0;
+out_free:
+ kfree(page);
+out:
+ return r;
+}
+
+static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ kvm->arch.hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!kvm->arch.hv_guest_os_id)
+ kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!kvm->arch.hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ kvm->arch.hv_hypercall = data;
+ break;
+ }
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ switch (msr) {
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ addr = gfn_to_hva(vcpu->kvm, data >>
+ HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ vcpu->arch.hv_vapic = data;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ default:
+ pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -949,7 +1218,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = set_msr_hyperv_pw(vcpu, msr, data);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return set_msr_hyperv(vcpu, msr, data);
+ break;
default:
+ if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+ return xen_hvm_config(vcpu, data);
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
msr, data);
@@ -1046,6 +1327,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
+static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = kvm->arch.hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = kvm->arch.hv_hypercall;
+ break;
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+ kvm_for_each_vcpu(r, v, vcpu->kvm)
+ if (v == vcpu)
+ data = r;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ default:
+ pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -1097,7 +1426,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.shadow_efer;
+ data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
data = vcpu->kvm->arch.wall_clock;
@@ -1112,6 +1441,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+ mutex_lock(&vcpu->kvm->lock);
+ r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return get_msr_hyperv(vcpu, msr, pdata);
+ break;
default:
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1137,15 +1476,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
- int i;
+ int i, idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
vcpu_put(vcpu);
@@ -1224,6 +1563,14 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_XEN_HVM:
+ case KVM_CAP_ADJUST_CLOCK:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1238,8 +1585,8 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NR_MEMSLOTS:
r = KVM_MEMORY_SLOTS;
break;
- case KVM_CAP_PV_MMU:
- r = !tdp_enabled;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
break;
case KVM_CAP_IOMMU:
r = iommu_found();
@@ -1326,13 +1673,19 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
+ unsigned long khz = cpufreq_quick_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ }
kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
+ kvm_x86_ops->vcpu_put(vcpu);
}
static int is_efer_nx(void)
@@ -1397,6 +1750,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
cpuid_fix_nx_cap(vcpu);
r = 0;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1419,6 +1773,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
return 0;
out:
@@ -1461,12 +1816,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
- unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
unsigned f_lm = F(LM);
#else
+ unsigned f_gbpages = 0;
unsigned f_lm = 0;
#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -1486,7 +1844,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
@@ -1591,6 +1949,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
if (cpuid->nent < 1)
goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
r = -ENOMEM;
cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
if (!cpuid_entries)
@@ -1690,7 +2050,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
- if (!bank_num)
+ if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
goto out;
@@ -1731,7 +2091,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
@@ -1757,6 +2117,65 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ vcpu_load(vcpu);
+
+ events->exception.injected = vcpu->arch.exception.pending;
+ events->exception.nr = vcpu->arch.exception.nr;
+ events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+ events->exception.error_code = vcpu->arch.exception.error_code;
+
+ events->interrupt.injected = vcpu->arch.interrupt.pending;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.soft = vcpu->arch.interrupt.soft;
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = vcpu->arch.nmi_pending;
+ events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+
+ events->sipi_vector = vcpu->arch.sipi_vector;
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+
+ vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+
+ vcpu->arch.interrupt.pending = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
+ kvm_pic_clear_isr_ack(vcpu->kvm);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+ vcpu->arch.nmi_pending = events->nmi.pending;
+ kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
+ vcpu->arch.sipi_vector = events->sipi_vector;
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -1767,6 +2186,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
switch (ioctl) {
case KVM_GET_LAPIC: {
+ r = -EINVAL;
+ if (!vcpu->arch.apic)
+ goto out;
lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
@@ -1782,6 +2204,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_LAPIC: {
+ r = -EINVAL;
+ if (!vcpu->arch.apic)
+ goto out;
lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!lapic)
@@ -1908,6 +2333,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
break;
}
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -1939,14 +2385,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
spin_lock(&kvm->mmu_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
spin_unlock(&kvm->mmu_lock);
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return 0;
}
@@ -1955,13 +2401,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
return kvm->arch.n_alloc_mmu_pages;
}
+gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
+{
+ int i;
+ struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
+
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
+ if (alias->flags & KVM_ALIAS_INVALID)
+ continue;
+ if (gfn >= alias->base_gfn
+ && gfn < alias->base_gfn + alias->npages)
+ return alias->target_gfn + gfn - alias->base_gfn;
+ }
+ return gfn;
+}
+
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
struct kvm_mem_alias *alias;
+ struct kvm_mem_aliases *aliases;
+
+ aliases = rcu_dereference(kvm->arch.aliases);
- for (i = 0; i < kvm->arch.naliases; ++i) {
- alias = &kvm->arch.aliases[i];
+ for (i = 0; i < aliases->naliases; ++i) {
+ alias = &aliases->aliases[i];
if (gfn >= alias->base_gfn
&& gfn < alias->base_gfn + alias->npages)
return alias->target_gfn + gfn - alias->base_gfn;
@@ -1979,6 +2447,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
{
int r, n;
struct kvm_mem_alias *p;
+ struct kvm_mem_aliases *aliases, *old_aliases;
r = -EINVAL;
/* General sanity checks */
@@ -1995,26 +2464,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /* invalidate any gfn reference in case of deletion/shrinking */
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+ aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kvm_mmu_zap_all(kvm);
+ kfree(old_aliases);
+
+ r = -ENOMEM;
+ aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!aliases)
+ goto out_unlock;
- p = &kvm->arch.aliases[alias->slot];
+ memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
+
+ p = &aliases->aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
p->npages = alias->memory_size >> PAGE_SHIFT;
p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+ p->flags &= ~(KVM_ALIAS_INVALID);
for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (kvm->arch.aliases[n - 1].npages)
+ if (aliases->aliases[n - 1].npages)
break;
- kvm->arch.naliases = n;
-
- spin_unlock(&kvm->mmu_lock);
- kvm_mmu_zap_all(kvm);
+ aliases->naliases = n;
- up_write(&kvm->slots_lock);
-
- return 0;
+ old_aliases = kvm->arch.aliases;
+ rcu_assign_pointer(kvm->arch.aliases, aliases);
+ synchronize_srcu_expedited(&kvm->srcu);
+ kfree(old_aliases);
+ r = 0;
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
out:
return r;
}
@@ -2036,9 +2527,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_IOAPIC:
- memcpy(&chip->chip.ioapic,
- ioapic_irqchip(kvm),
- sizeof(struct kvm_ioapic_state));
+ r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
@@ -2054,25 +2543,21 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic_irqchip(kvm)->lock);
+ raw_spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- spin_unlock(&pic_irqchip(kvm)->lock);
+ raw_spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
- mutex_lock(&kvm->irq_lock);
- memcpy(ioapic_irqchip(kvm),
- &chip->chip.ioapic,
- sizeof(struct kvm_ioapic_state));
- mutex_unlock(&kvm->irq_lock);
+ r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
@@ -2149,29 +2634,62 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log)
{
- int r;
- int n;
+ int r, n, i;
struct kvm_memory_slot *memslot;
- int is_dirty = 0;
+ unsigned long is_dirty = 0;
+ unsigned long *dirty_bitmap = NULL;
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (r)
+ r = -EINVAL;
+ if (log->slot >= KVM_MEMORY_SLOTS)
+ goto out;
+
+ memslot = &kvm->memslots->memslots[log->slot];
+ r = -ENOENT;
+ if (!memslot->dirty_bitmap)
+ goto out;
+
+ n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
goto out;
+ memset(dirty_bitmap, 0, n);
+
+ for (i = 0; !is_dirty && i < n/sizeof(long); i++)
+ is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
+ struct kvm_memslots *slots, *old_slots;
+
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- memslot = &kvm->memslots[log->slot];
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
- memset(memslot->dirty_bitmap, 0, n);
+
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots)
+ goto out_free;
+
+ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+ slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+
+ old_slots = kvm->memslots;
+ rcu_assign_pointer(kvm->memslots, slots);
+ synchronize_srcu_expedited(&kvm->srcu);
+ dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
+ kfree(old_slots);
}
+
r = 0;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
+ r = -EFAULT;
+out_free:
+ vfree(dirty_bitmap);
out:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -2180,7 +2698,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
- int r = -EINVAL;
+ int r = -ENOTTY;
/*
* This union makes it completely explicit to gcc-3.x
* that these two variables' stack usage should be
@@ -2242,25 +2760,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (r)
goto out;
break;
- case KVM_CREATE_IRQCHIP:
+ case KVM_CREATE_IRQCHIP: {
+ struct kvm_pic *vpic;
+
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpic)
+ goto create_irqchip_unlock;
r = -ENOMEM;
- kvm->arch.vpic = kvm_create_pic(kvm);
- if (kvm->arch.vpic) {
+ vpic = kvm_create_pic(kvm);
+ if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
- kfree(kvm->arch.vpic);
- kvm->arch.vpic = NULL;
- goto out;
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev);
+ kfree(vpic);
+ goto create_irqchip_unlock;
}
} else
- goto out;
+ goto create_irqchip_unlock;
+ smp_wmb();
+ kvm->arch.vpic = vpic;
+ smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
- kfree(kvm->arch.vpic);
- kfree(kvm->arch.vioapic);
- goto out;
+ mutex_lock(&kvm->irq_lock);
+ kvm_ioapic_destroy(kvm);
+ kvm_destroy_pic(kvm);
+ mutex_unlock(&kvm->irq_lock);
}
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
break;
+ }
case KVM_CREATE_PIT:
u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
goto create_pit;
@@ -2270,7 +2802,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- down_write(&kvm->slots_lock);
+ mutex_lock(&kvm->slots_lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -2279,7 +2811,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- up_write(&kvm->slots_lock);
+ mutex_unlock(&kvm->slots_lock);
break;
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
@@ -2290,10 +2822,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
- mutex_lock(&kvm->irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
- mutex_unlock(&kvm->irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
@@ -2419,6 +2949,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_XEN_HVM_CONFIG: {
+ r = -EFAULT;
+ if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
+ sizeof(struct kvm_xen_hvm_config)))
+ goto out;
+ r = -EINVAL;
+ if (kvm->arch.xen_hvm_config.flags)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_CLOCK: {
+ struct timespec now;
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+ s64 delta;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_ns.flags)
+ goto out;
+
+ r = 0;
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
+ delta = user_ns.clock - now_ns;
+ kvm->arch.kvmclock_offset = delta;
+ break;
+ }
+ case KVM_GET_CLOCK: {
+ struct timespec now;
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ ktime_get_ts(&now);
+ now_ns = timespec_to_ns(&now);
+ user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+ user_ns.flags = 0;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+ goto out;
+ r = 0;
+ break;
+ }
+
default:
;
}
@@ -2431,7 +3010,8 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i, j;
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+ /* skip the first msrs in the list. KVM-specific */
+ for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
if (j < i)
@@ -2448,7 +3028,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
!kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2457,17 +3037,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
!kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
return 0;
- return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
+ return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+{
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2490,14 +3097,37 @@ out:
return r;
}
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
+ access | PFERR_FETCH_MASK, error);
+}
+
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ error);
+}
+
+static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 *error)
+{
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
+
static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu, u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -2527,6 +3157,7 @@ static int emulator_read_emulated(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -2536,17 +3167,20 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+
+ if (gpa == UNMAPPED_GVA) {
+ kvm_inject_page_fault(vcpu, addr, error_code);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- if (gpa == UNMAPPED_GVA)
- return X86EMUL_PROPAGATE_FAULT;
mmio:
/*
@@ -2585,11 +3219,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ u32 error_code;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, 2);
+ kvm_inject_page_fault(vcpu, addr, error_code);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2653,7 +3288,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
if (gpa == UNMAPPED_GVA ||
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2690,35 +3325,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
int emulate_clts(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
return X86EMUL_CONTINUE;
}
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- struct kvm_vcpu *vcpu = ctxt->vcpu;
-
- switch (dr) {
- case 0 ... 3:
- *dest = kvm_x86_ops->get_dr(vcpu, dr);
- return X86EMUL_CONTINUE;
- default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
- return X86EMUL_UNHANDLEABLE;
- }
+ return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- int exception;
- kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
- if (exception) {
- /* FIXME: better handling */
- return X86EMUL_UNHANDLEABLE;
- }
- return X86EMUL_CONTINUE;
+ return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
}
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -2732,7 +3353,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2740,7 +3361,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
- .read_std = kvm_read_guest_virt,
+ .read_std = kvm_read_guest_virt_system,
+ .fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -2755,13 +3377,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
}
int emulate_instruction(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
unsigned long cr2,
u16 error_code,
int emulation_type)
{
int r, shadow_mask;
struct decode_cache *c;
+ struct kvm_run *run = vcpu->run;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2781,10 +3403,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_REAL : cs_l
+ ? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
@@ -2859,7 +3482,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DO_MMIO;
}
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
vcpu->mmio_needed = 0;
@@ -2876,12 +3499,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
gva_t q = vcpu->arch.pio.guest_gva;
unsigned bytes;
int ret;
+ u32 error_code;
bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
+
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, q, error_code);
+
return ret;
}
@@ -2902,7 +3530,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
if (io->in) {
r = pio_copy_data(vcpu);
if (r)
- return r;
+ goto out;
}
delta = 1;
@@ -2929,7 +3557,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RSI, val);
}
}
-
+out:
io->count -= io->cur_count;
io->cur_count = 0;
@@ -2942,11 +3570,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
int r;
if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
vcpu->arch.pio.size, pd);
else
- r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
return r;
}
@@ -2957,7 +3586,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
int i, r = 0;
for (i = 0; i < io->cur_count; i++) {
- if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+ if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
io->port, io->size, pd)) {
r = -EOPNOTSUPP;
break;
@@ -2967,11 +3596,12 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
return r;
}
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
- int size, unsigned port)
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
{
unsigned long val;
+ trace_kvm_pio(!in, port, size, 1);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -2983,11 +3613,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = 0;
vcpu->arch.pio.rep = 0;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, 1);
-
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
+ if (!vcpu->arch.pio.in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
+ }
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
complete_pio(vcpu);
@@ -2997,13 +3626,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
int size, unsigned long count, int down,
gva_t address, int rep, unsigned port)
{
unsigned now, in_page;
int ret = 0;
+ trace_kvm_pio(!in, port, size, count);
+
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3015,9 +3646,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.down = down;
vcpu->arch.pio.rep = rep;
- trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
- size, count);
-
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
return 1;
@@ -3049,10 +3677,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- kvm_inject_gp(vcpu, 0);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
return 1;
- }
if (ret == 0 && !pio_string_write(vcpu)) {
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
@@ -3070,9 +3696,6 @@ static void bounce_off(void *info)
/* nothing */
}
-static unsigned int ref_freq;
-static unsigned long tsc_khz_ref;
-
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -3081,14 +3704,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
- if (!ref_freq)
- ref_freq = freq->old;
-
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3125,9 +3745,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
+static void kvm_timer_init(void)
+{
+ int cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ for_each_online_cpu(cpu) {
+ unsigned long khz = cpufreq_get(cpu);
+ if (!khz)
+ khz = tsc_khz;
+ per_cpu(cpu_tsc_khz, cpu) = khz;
+ }
+ } else {
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
+}
+
int kvm_arch_init(void *opaque)
{
- int r, cpu;
+ int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
@@ -3159,13 +3798,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- tsc_khz_ref = tsc_khz;
- cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- }
+ kvm_timer_init();
return 0;
@@ -3204,11 +3837,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
return a0 | ((gpa_t)a1 << 32);
}
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ bool fast, longmode;
+ int cs_db, cs_l;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ longmode = is_long_mode(vcpu) && cs_l == 1;
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ kvm_vcpu_on_spin(vcpu);
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((u64)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
int r = 1;
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3251,10 +3949,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
- int ret = 0;
unsigned long rip = kvm_rip_read(vcpu);
-
/*
* Blow out the MMU to ensure that no other VCPU has an active mapping
* to ensure that the updated hypercall appears atomically across all
@@ -3263,11 +3959,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_mmu_zap_all(vcpu->kvm);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(rip, instruction, 3, vcpu)
- != X86EMUL_CONTINUE)
- ret = -EFAULT;
- return ret;
+ return emulator_write_emulated(rip, instruction, 3, vcpu);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3293,17 +3986,16 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
unsigned long *rflags)
{
kvm_lmsw(vcpu, msw);
- *rflags = kvm_x86_ops->get_rflags(vcpu);
+ *rflags = kvm_get_rflags(vcpu);
}
unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
{
unsigned long value;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
switch (cr) {
case 0:
- value = vcpu->arch.cr0;
+ value = kvm_read_cr0(vcpu);
break;
case 2:
value = vcpu->arch.cr2;
@@ -3312,7 +4004,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
value = vcpu->arch.cr3;
break;
case 4:
- value = vcpu->arch.cr4;
+ value = kvm_read_cr4(vcpu);
break;
case 8:
value = kvm_get_cr8(vcpu);
@@ -3330,8 +4022,8 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
{
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
- *rflags = kvm_x86_ops->get_rflags(vcpu);
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ *rflags = kvm_get_rflags(vcpu);
break;
case 2:
vcpu->arch.cr2 = val;
@@ -3340,7 +4032,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3407,6 +4099,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
}
return best;
}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
@@ -3451,18 +4144,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
*
* No need to exit to userspace if we already have an interrupt queued.
*/
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- kvm_run->request_interrupt_window &&
+ vcpu->run->request_interrupt_window &&
kvm_arch_interrupt_allowed(vcpu));
}
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
- struct kvm_run *kvm_run)
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
- kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
@@ -3490,14 +4183,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
static void vapic_exit(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int idx;
if (!apic || !apic->vapic_addr)
return;
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_release_page_dirty(apic->vapic_page);
mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3523,7 +4217,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
@@ -3559,11 +4253,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
-static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- kvm_run->request_interrupt_window;
+ vcpu->run->request_interrupt_window;
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3584,21 +4278,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
- kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
+ if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ vcpu->fpu_active = 0;
+ kvm_x86_ops->fpu_deactivate(vcpu);
+ }
}
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
- kvm_load_guest_fpu(vcpu);
+ if (vcpu->fpu_active)
+ kvm_load_guest_fpu(vcpu);
local_irq_disable();
@@ -3613,7 +4312,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- inject_pending_event(vcpu, kvm_run);
+ inject_pending_event(vcpu);
/* enable NMI/IRQ window open exits if needed */
if (vcpu->arch.nmi_pending)
@@ -3626,7 +4325,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_lapic_sync_to_vapic(vcpu);
}
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -3639,16 +4338,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
trace_kvm_entry(vcpu->vcpu_id);
- kvm_x86_ops->run(vcpu, kvm_run);
+ kvm_x86_ops->run(vcpu);
- if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
- set_debugreg(current->thread.debugreg0, 0);
- set_debugreg(current->thread.debugreg1, 1);
- set_debugreg(current->thread.debugreg2, 2);
- set_debugreg(current->thread.debugreg3, 3);
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(current->thread.debugreg7, 7);
- }
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
set_bit(KVM_REQ_KICK, &vcpu->requests);
local_irq_enable();
@@ -3667,7 +4367,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
preempt_enable();
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
/*
* Profile KVM exit RIPs:
@@ -3680,15 +4380,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_lapic_sync_from_vapic(vcpu);
- r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+ r = kvm_x86_ops->handle_exit(vcpu);
out:
return r;
}
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int __vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
+ struct kvm *kvm = vcpu->kvm;
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3700,17 +4401,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vapic_enter(vcpu);
r = 1;
while (r > 0) {
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- r = vcpu_enter_guest(vcpu, kvm_run);
+ r = vcpu_enter_guest(vcpu);
else {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
{
switch(vcpu->arch.mp_state) {
@@ -3734,25 +4435,25 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ if (dm_request_for_irq_injection(vcpu)) {
r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
if (signal_pending(current)) {
r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
if (need_resched()) {
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- up_read(&vcpu->kvm->slots_lock);
- post_kvm_run_save(vcpu, kvm_run);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ post_kvm_run_save(vcpu);
vapic_exit(vcpu);
@@ -3785,17 +4486,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (r)
goto out;
}
-#if CONFIG_HAS_IOMEM
if (vcpu->mmio_needed) {
memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
vcpu->mmio_read_completed = 1;
vcpu->mmio_needed = 0;
- down_read(&vcpu->kvm->slots_lock);
- r = emulate_instruction(vcpu, kvm_run,
- vcpu->arch.mmio_fault_cr2, 0,
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
EMULTYPE_NO_DECODE);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r == EMULATE_DO_MMIO) {
/*
* Read-modify-write. Back to userspace.
@@ -3804,12 +4503,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
}
-#endif
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
kvm_register_write(vcpu, VCPU_REGS_RAX,
kvm_run->hypercall.ret);
- r = __vcpu_run(vcpu, kvm_run);
+ r = __vcpu_run(vcpu);
out:
if (vcpu->sigset_active)
@@ -3843,13 +4541,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
#endif
regs->rip = kvm_rip_read(vcpu);
- regs->rflags = kvm_x86_ops->get_rflags(vcpu);
-
- /*
- * Don't leak debug flags in case they were set for guest debugging
- */
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ regs->rflags = kvm_get_rflags(vcpu);
vcpu_put(vcpu);
@@ -3877,12 +4569,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
-
#endif
kvm_rip_write(vcpu, regs->rip);
- kvm_x86_ops->set_rflags(vcpu, regs->rflags);
-
+ kvm_set_rflags(vcpu, regs->rflags);
vcpu->arch.exception.pending = false;
@@ -3931,13 +4621,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->gdt.limit = dt.limit;
sregs->gdt.base = dt.base;
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
- sregs->cr0 = vcpu->arch.cr0;
+ sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
- sregs->cr4 = vcpu->arch.cr4;
+ sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
- sregs->efer = vcpu->arch.shadow_efer;
+ sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4025,14 +4714,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
{
struct descriptor_table dtable;
u16 index = selector >> 3;
+ int ret;
+ u32 err;
+ gva_t addr;
get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7) {
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return 1;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ addr = dtable.base + index * 8;
+ ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
+ vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(vcpu, addr, err);
+
+ return ret;
}
/* allowed just for 8 bytes segments */
@@ -4046,15 +4744,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
+ return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
}
-static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc)
+{
+ u32 base_addr = get_desc_base(seg_desc);
+
+ return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
+}
+
+static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
struct desc_struct *seg_desc)
{
u32 base_addr = get_desc_base(seg_desc);
- return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
+ return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
}
static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4065,18 +4771,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
return kvm_seg.selector;
}
-static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
- u16 selector,
- struct kvm_segment *kvm_seg)
-{
- struct desc_struct seg_desc;
-
- if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
- return 1;
- seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
- return 0;
-}
-
static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment segvar = {
@@ -4094,34 +4788,122 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
.unusable = 0,
};
kvm_x86_ops->set_segment(vcpu, &segvar, seg);
- return 0;
+ return X86EMUL_CONTINUE;
}
static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
{
return (seg != VCPU_SREG_LDTR) &&
(seg != VCPU_SREG_TR) &&
- (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
}
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- int type_bits, int seg)
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
{
struct kvm_segment kvm_seg;
+ struct desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
- if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
+ if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
return kvm_load_realmode_segment(vcpu, selector, seg);
- if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
- return 1;
- kvm_seg.type |= type_bits;
- if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
- seg != VCPU_SREG_LDTR)
- if (!kvm_seg.s)
- kvm_seg.unusable = 1;
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ if (ret)
+ return ret;
+
+ seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
+
+ if (null_selector) { /* for NULL selector skip all following checks */
+ kvm_seg.unusable = 1;
+ goto load;
+ }
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !kvm_seg.s)
+ goto exception;
+ if (!kvm_seg.present) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = kvm_seg.dpl;
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(kvm_seg.type & 8))
+ goto exception;
+
+ if (kvm_seg.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (kvm_seg.s || kvm_seg.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((kvm_seg.type & 0xa) == 0x8 ||
+ (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!kvm_seg.unusable && kvm_seg.s) {
+ /* mark segment as accessed */
+ kvm_seg.type |= 1;
+ seg_desc.type |= 1;
+ save_guest_segment_descriptor(vcpu, selector, &seg_desc);
+ }
+load:
kvm_set_segment(vcpu, &kvm_seg, seg);
- return 0;
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
}
static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4129,7 +4911,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
{
tss->cr3 = vcpu->arch.cr3;
tss->eip = kvm_rip_read(vcpu);
- tss->eflags = kvm_x86_ops->get_rflags(vcpu);
+ tss->eflags = kvm_get_rflags(vcpu);
tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4147,13 +4929,21 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
}
+static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
+{
+ struct kvm_segment kvm_seg;
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
static int load_state_from_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss)
{
kvm_set_cr3(vcpu, tss->cr3);
kvm_rip_write(vcpu, tss->eip);
- kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
+ kvm_set_rflags(vcpu, tss->eflags | 2);
kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4164,25 +4954,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+ kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
+ kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+ if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+ if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
return 1;
return 0;
}
@@ -4191,7 +4997,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
tss->ip = kvm_rip_read(vcpu);
- tss->flag = kvm_x86_ops->get_rflags(vcpu);
+ tss->flag = kvm_get_rflags(vcpu);
tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4206,14 +5012,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
- tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
}
static int load_state_from_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
kvm_rip_write(vcpu, tss->ip);
- kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
+ kvm_set_rflags(vcpu, tss->flag | 2);
kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4223,19 +5028,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
+ kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
+ kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
+ kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
+ kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
return 1;
- if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
return 1;
return 0;
}
@@ -4257,7 +5076,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_16))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_16, sizeof tss_segment_16))
goto out;
@@ -4265,7 +5084,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_16.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_16.prev_task_link,
sizeof tss_segment_16.prev_task_link))
goto out;
@@ -4296,7 +5115,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
sizeof tss_segment_32))
goto out;
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
&tss_segment_32, sizeof tss_segment_32))
goto out;
@@ -4304,7 +5123,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
tss_segment_32.prev_task_link = old_tss_sel;
if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr(vcpu, nseg_desc),
+ get_tss_base_addr_write(vcpu, nseg_desc),
&tss_segment_32.prev_task_link,
sizeof tss_segment_32.prev_task_link))
goto out;
@@ -4327,7 +5146,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
- old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+ old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
/* FIXME: Handle errors. Failure to read either TSS or their
* descriptors should generate a pagefault.
@@ -4359,8 +5178,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
}
if (reason == TASK_SWITCH_IRET) {
- u32 eflags = kvm_x86_ops->get_rflags(vcpu);
- kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
}
/* set back link to prev task only if NT bit is set in eflags
@@ -4368,11 +5187,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
-
if (nseg_desc.type & 8)
ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
old_tss_base, &nseg_desc);
@@ -4381,8 +5195,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
old_tss_base, &nseg_desc);
if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
- u32 eflags = kvm_x86_ops->get_rflags(vcpu);
- kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ u32 eflags = kvm_get_rflags(vcpu);
+ kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
}
if (reason != TASK_SWITCH_IRET) {
@@ -4391,7 +5205,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
&nseg_desc);
}
- kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4422,20 +5236,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
kvm_x86_ops->set_efer(vcpu, sregs->efer);
kvm_set_apic_base(vcpu, sregs->apic_base);
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-
- mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (!is_long_mode(vcpu) && is_pae(vcpu))
+ if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.cr3);
+ mmu_reset_needed = 1;
+ }
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
@@ -4465,7 +5279,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
/* Older userspace won't unhalt the vcpu on reset. */
if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
- !(vcpu->arch.cr0 & X86_CR0_PE))
+ !is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
vcpu_put(vcpu);
@@ -4476,12 +5290,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
+ unsigned long rflags;
int i, r;
vcpu_load(vcpu);
- if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
- (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (vcpu->arch.exception.pending)
+ goto unlock_out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+ * Read rflags as long as potentially injected trace flags are still
+ * filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
for (i = 0; i < KVM_NR_DB_REGS; ++i)
vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
vcpu->arch.switch_db_regs =
@@ -4492,13 +5326,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
- r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ vcpu->arch.singlestep_cs =
+ get_segment_selector(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
+ }
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
- if (dbg->control & KVM_GUESTDBG_INJECT_DB)
- kvm_queue_exception(vcpu, DB_VECTOR);
- else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
- kvm_queue_exception(vcpu, BP_VECTOR);
+ kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ r = 0;
+
+unlock_out:
vcpu_put(vcpu);
return r;
@@ -4533,11 +5377,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
{
unsigned long vaddr = tr->linear_address;
gpa_t gpa;
+ int idx;
vcpu_load(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -4618,14 +5463,14 @@ EXPORT_SYMBOL_GPL(fx_init);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+ if (vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 1;
kvm_fx_save(&vcpu->arch.host_fx_image);
kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ trace_kvm_fpu(1);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
@@ -4636,8 +5481,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
kvm_fx_save(&vcpu->arch.guest_fx_image);
kvm_fx_restore(&vcpu->arch.host_fx_image);
++vcpu->stat.fpu_reload;
+ set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ trace_kvm_fpu(0);
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
@@ -4699,14 +5545,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
return kvm_x86_ops->vcpu_reset(vcpu);
}
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
{
- kvm_x86_ops->hardware_enable(garbage);
+ /*
+ * Since this may be called from a hotplug notifcation,
+ * we can't get the CPU frequency directly.
+ */
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ int cpu = raw_smp_processor_id();
+ per_cpu(cpu_tsc_khz, cpu) = 0;
+ }
+
+ kvm_shared_msr_cpu_online();
+
+ return kvm_x86_ops->hardware_enable(garbage);
}
void kvm_arch_hardware_disable(void *garbage)
{
kvm_x86_ops->hardware_disable(garbage);
+ drop_user_return_notifiers(garbage);
}
int kvm_arch_hardware_setup(void)
@@ -4760,12 +5618,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
GFP_KERNEL);
if (!vcpu->arch.mce_banks) {
r = -ENOMEM;
- goto fail_mmu_destroy;
+ goto fail_free_lapic;
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
return 0;
-
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
fail_mmu_destroy:
kvm_mmu_destroy(vcpu);
fail_free_pio_data:
@@ -4776,10 +5635,13 @@ fail:
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ int idx;
+
+ kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
- down_read(&vcpu->kvm->slots_lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_mmu_destroy(vcpu);
- up_read(&vcpu->kvm->slots_lock);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
}
@@ -4790,6 +5652,12 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
+ kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
+ if (!kvm->arch.aliases) {
+ kfree(kvm);
+ return ERR_PTR(-ENOMEM);
+ }
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -4846,16 +5714,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ cleanup_srcu_struct(&kvm->srcu);
+ kfree(kvm->arch.aliases);
kfree(kvm);
}
-int kvm_arch_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
+ struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+ int npages = memslot->npages;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -4875,26 +5745,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
- /* set userspace_addr atomically for kvm_hva_to_rmapp */
- spin_lock(&kvm->mmu_lock);
memslot->userspace_addr = userspace_addr;
- spin_unlock(&kvm->mmu_lock);
- } else {
- if (!old.user_alloc && old.rmap) {
- int ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_munmap(current->mm, old.userspace_addr,
- old.npages * PAGE_SIZE);
- up_write(&current->mm->mmap_sem);
- if (ret < 0)
- printk(KERN_WARNING
- "kvm_vm_ioctl_set_memory_region: "
- "failed to munmap memory\n");
- }
}
}
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+
+ int npages = mem->memory_size >> PAGE_SHIFT;
+
+ if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+
spin_lock(&kvm->mmu_lock);
if (!kvm->arch.n_requested_mmu_pages) {
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -4903,8 +5782,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
-
- return 0;
}
void kvm_arch_flush_shadow(struct kvm *kvm)
@@ -4944,8 +5821,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = kvm_x86_ops->get_rflags(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ vcpu->arch.singlestep_cs ==
+ get_segment_selector(vcpu, VCPU_SREG_CS) &&
+ vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+ rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+}
+
#endif
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..8df89f0a3fe6
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
+inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 9e609206fac9..419386c24b82 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,26 +2,42 @@
# Makefile for x86 specific library files.
#
-obj-$(CONFIG_SMP) := msr.o
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+ cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+ $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_KPROBES) += insn.o inat.o
-obj-y += msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
-
+ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ lib-y += cmpxchg8b_emu.o
+endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
- obj-y += io_64.o iomap_copy_64.o
+ obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
endif
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..a3c668875038
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
+#include <linux/smp.h>
+#include <linux/module.h>
+
+static void __wbinvd(void *dummy)
+{
+ wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+ smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+
+int wbinvd_on_all_cpus(void)
+{
+ return on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 000000000000..828cb710dec2
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,57 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ */
+ENTRY(cmpxchg8b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg8b (%esi)' on UP except we don't
+# set the whole ZF thing (caller will just compare
+# eax:edx with the expected value)
+#
+cmpxchg8b_emu:
+ pushfl
+ cli
+
+ cmpl (%esi), %eax
+ jne not_same
+ cmpl 4(%esi), %edx
+ jne half_same
+
+ movl %ebx, (%esi)
+ movl %ecx, 4(%esi)
+
+ popfl
+ ret
+
+ not_same:
+ movl (%esi), %eax
+ half_same:
+ movl 4(%esi), %edx
+
+ popfl
+ ret
+
+CFI_ENDPROC
+ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..71100c98e337 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
.endm
/* Standard copy_to_user with segment limit checking */
-ENTRY(copy_to_user)
+ENTRY(_copy_to_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
jae bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
CFI_ENDPROC
-ENDPROC(copy_to_user)
+ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
-ENTRY(copy_from_user)
+ENTRY(_copy_from_user)
CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
@@ -88,19 +88,7 @@ ENTRY(copy_from_user)
jae bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
CFI_ENDPROC
-ENDPROC(copy_from_user)
-
-ENTRY(copy_user_generic)
- CFI_STARTPROC
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(copy_user_generic)
-
-ENTRY(__copy_from_user_inatomic)
- CFI_STARTPROC
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
- CFI_ENDPROC
-ENDPROC(__copy_from_user_inatomic)
+ENDPROC(_copy_from_user)
.section .fixup,"ax"
/* must zero dest */
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..46fc4ee09fc4
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+ return inat_primary_table[opcode];
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+ insn_attr_t esc_attr)
+{
+ const insn_attr_t *table;
+ insn_attr_t lpfx_attr;
+ int n, m = 0;
+
+ n = inat_escape_id(esc_attr);
+ if (last_pfx) {
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ m = inat_last_prefix_id(lpfx_attr);
+ }
+ table = inat_escape_tables[n][0];
+ if (!table)
+ return 0;
+ if (inat_has_variant(table[opcode]) && m) {
+ table = inat_escape_tables[n][m];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+ insn_attr_t grp_attr)
+{
+ const insn_attr_t *table;
+ insn_attr_t lpfx_attr;
+ int n, m = 0;
+
+ n = inat_group_id(grp_attr);
+ if (last_pfx) {
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ m = inat_last_prefix_id(lpfx_attr);
+ }
+ table = inat_group_tables[n][0];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+ table = inat_group_tables[n][m];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ }
+ return table[X86_MODRM_REG(modrm)] |
+ inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+ insn_byte_t vex_p)
+{
+ const insn_attr_t *table;
+ if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+ return 0;
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000000..9f33b984d0ef
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <linux/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+#define get_next(t, insn) \
+ ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define peek_next(t, insn) \
+ ({t r; r = *(t*)insn->next_byte; r; })
+
+#define peek_nbyte_next(t, insn, n) \
+ ({t r; r = *(t*)((insn)->next_byte + n); r; })
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64: !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+ memset(insn, 0, sizeof(*insn));
+ insn->kaddr = kaddr;
+ insn->next_byte = kaddr;
+ insn->x86_64 = x86_64 ? 1 : 0;
+ insn->opnd_bytes = 4;
+ if (x86_64)
+ insn->addr_bytes = 8;
+ else
+ insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn: &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode. No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+ struct insn_field *prefixes = &insn->prefixes;
+ insn_attr_t attr;
+ insn_byte_t b, lb;
+ int i, nb;
+
+ if (prefixes->got)
+ return;
+
+ nb = 0;
+ lb = 0;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ while (inat_is_legacy_prefix(attr)) {
+ /* Skip if same prefix */
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == b)
+ goto found;
+ if (nb == 4)
+ /* Invalid instruction */
+ break;
+ prefixes->bytes[nb++] = b;
+ if (inat_is_address_size_prefix(attr)) {
+ /* address size switches 2/4 or 4/8 */
+ if (insn->x86_64)
+ insn->addr_bytes ^= 12;
+ else
+ insn->addr_bytes ^= 6;
+ } else if (inat_is_operand_size_prefix(attr)) {
+ /* oprand size switches 2/4 */
+ insn->opnd_bytes ^= 6;
+ }
+found:
+ prefixes->nbytes++;
+ insn->next_byte++;
+ lb = b;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ }
+ /* Set the last prefix */
+ if (lb && lb != insn->prefixes.bytes[3]) {
+ if (unlikely(insn->prefixes.bytes[3])) {
+ /* Swap the last prefix */
+ b = insn->prefixes.bytes[3];
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == lb)
+ prefixes->bytes[i] = b;
+ }
+ insn->prefixes.bytes[3] = lb;
+ }
+
+ /* Decode REX prefix */
+ if (insn->x86_64) {
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_rex_prefix(attr)) {
+ insn->rex_prefix.value = b;
+ insn->rex_prefix.nbytes = 1;
+ insn->next_byte++;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ }
+ }
+ insn->rex_prefix.got = 1;
+
+ /* Decode VEX prefix */
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_vex_prefix(attr)) {
+ insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+ if (!insn->x86_64) {
+ /*
+ * In 32-bits mode, if the [7:6] bits (mod bits of
+ * ModRM) on the second byte are not 11b, it is
+ * LDS or LES.
+ */
+ if (X86_MODRM_MOD(b2) != 3)
+ goto vex_end;
+ }
+ insn->vex_prefix.bytes[0] = b;
+ insn->vex_prefix.bytes[1] = b2;
+ if (inat_is_vex3_prefix(attr)) {
+ b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+ insn->vex_prefix.bytes[2] = b2;
+ insn->vex_prefix.nbytes = 3;
+ insn->next_byte += 3;
+ if (insn->x86_64 && X86_VEX_W(b2))
+ /* VEX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ } else {
+ insn->vex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ }
+ }
+vex_end:
+ insn->vex_prefix.got = 1;
+
+ prefixes->got = 1;
+ return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and set @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+ struct insn_field *opcode = &insn->opcode;
+ insn_byte_t op, pfx;
+ if (opcode->got)
+ return;
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+
+ /* Get first opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[0] = op;
+ opcode->nbytes = 1;
+
+ /* Check if there is VEX prefix or not */
+ if (insn_is_avx(insn)) {
+ insn_byte_t m, p;
+ m = insn_vex_m_bits(insn);
+ p = insn_vex_p_bits(insn);
+ insn->attr = inat_get_avx_attribute(op, m, p);
+ if (!inat_accept_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+ goto end; /* VEX has only 1 byte for opcode */
+ }
+
+ insn->attr = inat_get_opcode_attribute(op);
+ while (inat_is_escape(insn->attr)) {
+ /* Get escaped opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[opcode->nbytes++] = op;
+ pfx = insn_last_prefix(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+ }
+ if (inat_must_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+end:
+ opcode->got = 1;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any. If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+ insn_byte_t pfx, mod;
+ if (modrm->got)
+ return;
+ if (!insn->opcode.got)
+ insn_get_opcode(insn);
+
+ if (inat_has_modrm(insn->attr)) {
+ mod = get_next(insn_byte_t, insn);
+ modrm->value = mod;
+ modrm->nbytes = 1;
+ if (inat_is_group(insn->attr)) {
+ pfx = insn_last_prefix(insn);
+ insn->attr = inat_get_group_attribute(mod, pfx,
+ insn->attr);
+ }
+ }
+
+ if (insn->x86_64 && inat_is_force64(insn->attr))
+ insn->opnd_bytes = 8;
+ modrm->got = 1;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte. No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+
+ if (!insn->x86_64)
+ return 0;
+ if (!modrm->got)
+ insn_get_modrm(insn);
+ /*
+ * For rip-relative instructions, the mod field (top 2 bits)
+ * is zero and the r/m field (bottom 3 bits) is 0x5.
+ */
+ return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+ insn_byte_t modrm;
+
+ if (insn->sib.got)
+ return;
+ if (!insn->modrm.got)
+ insn_get_modrm(insn);
+ if (insn->modrm.nbytes) {
+ modrm = (insn_byte_t)insn->modrm.value;
+ if (insn->addr_bytes != 2 &&
+ X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+ insn->sib.value = get_next(insn_byte_t, insn);
+ insn->sib.nbytes = 1;
+ }
+ }
+ insn->sib.got = 1;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * Displacement value is sign-expanded.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+ insn_byte_t mod, rm, base;
+
+ if (insn->displacement.got)
+ return;
+ if (!insn->sib.got)
+ insn_get_sib(insn);
+ if (insn->modrm.nbytes) {
+ /*
+ * Interpreting the modrm byte:
+ * mod = 00 - no displacement fields (exceptions below)
+ * mod = 01 - 1-byte displacement field
+ * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+ * address size = 2 (0x67 prefix in 32-bit mode)
+ * mod = 11 - no memory operand
+ *
+ * If address size = 2...
+ * mod = 00, r/m = 110 - displacement field is 2 bytes
+ *
+ * If address size != 2...
+ * mod != 11, r/m = 100 - SIB byte exists
+ * mod = 00, SIB base = 101 - displacement field is 4 bytes
+ * mod = 00, r/m = 101 - rip-relative addressing, displacement
+ * field is 4 bytes
+ */
+ mod = X86_MODRM_MOD(insn->modrm.value);
+ rm = X86_MODRM_RM(insn->modrm.value);
+ base = X86_SIB_BASE(insn->sib.value);
+ if (mod == 3)
+ goto out;
+ if (mod == 1) {
+ insn->displacement.value = get_next(char, insn);
+ insn->displacement.nbytes = 1;
+ } else if (insn->addr_bytes == 2) {
+ if ((mod == 0 && rm == 6) || mod == 2) {
+ insn->displacement.value =
+ get_next(short, insn);
+ insn->displacement.nbytes = 2;
+ }
+ } else {
+ if ((mod == 0 && rm == 5) || mod == 2 ||
+ (mod == 0 && base == 5)) {
+ insn->displacement.value = get_next(int, insn);
+ insn->displacement.nbytes = 4;
+ }
+ }
+ }
+out:
+ insn->displacement.got = 1;
+}
+
+/* Decode moffset16/32/64 */
+static void __get_moffset(struct insn *insn)
+{
+ switch (insn->addr_bytes) {
+ case 2:
+ insn->moffset1.value = get_next(short, insn);
+ insn->moffset1.nbytes = 2;
+ break;
+ case 4:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ break;
+ case 8:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ insn->moffset2.value = get_next(int, insn);
+ insn->moffset2.nbytes = 4;
+ break;
+ }
+ insn->moffset1.got = insn->moffset2.got = 1;
+}
+
+/* Decode imm v32(Iz) */
+static void __get_immv32(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case 4:
+ case 8:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ }
+}
+
+/* Decode imm v64(Iv/Ov) */
+static void __get_immv(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ }
+ insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static void __get_immptr(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ /* ptr16:64 is not exist (no segment) */
+ return;
+ }
+ insn->immediate2.value = get_next(unsigned short, insn);
+ insn->immediate2.nbytes = 2;
+ insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Basically, most of immediates are sign-expanded. Unsigned-value can be
+ * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ */
+void insn_get_immediate(struct insn *insn)
+{
+ if (insn->immediate.got)
+ return;
+ if (!insn->displacement.got)
+ insn_get_displacement(insn);
+
+ if (inat_has_moffset(insn->attr)) {
+ __get_moffset(insn);
+ goto done;
+ }
+
+ if (!inat_has_immediate(insn->attr))
+ /* no immediates */
+ goto done;
+
+ switch (inat_immediate_size(insn->attr)) {
+ case INAT_IMM_BYTE:
+ insn->immediate.value = get_next(char, insn);
+ insn->immediate.nbytes = 1;
+ break;
+ case INAT_IMM_WORD:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case INAT_IMM_DWORD:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ case INAT_IMM_QWORD:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ case INAT_IMM_PTR:
+ __get_immptr(insn);
+ break;
+ case INAT_IMM_VWORD32:
+ __get_immv32(insn);
+ break;
+ case INAT_IMM_VWORD:
+ __get_immv(insn);
+ break;
+ default:
+ break;
+ }
+ if (inat_has_second_immediate(insn->attr)) {
+ insn->immediate2.value = get_next(char, insn);
+ insn->immediate2.nbytes = 1;
+ }
+done:
+ insn->immediate.got = 1;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediates bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+ if (insn->length)
+ return;
+ if (!insn->immediate.got)
+ insn_get_immediate(insn);
+ insn->length = (unsigned char)((unsigned long)insn->next_byte
+ - (unsigned long)insn->kaddr);
+}
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f08..000000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <linux/string.h>
-#include <linux/module.h>
-#include <asm/io.h>
-
-void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
-{
- __inline_memcpy((void *)dst, src, len);
-}
-EXPORT_SYMBOL(__memcpy_toio);
-
-void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
-{
- __inline_memcpy(dst, (const void *)src, len);
-}
-EXPORT_SYMBOL(__memcpy_fromio);
-
-void memset_io(volatile void __iomem *a, int b, size_t c)
-{
- /*
- * TODO: memset can mangle the IO patterns quite a bit.
- * perhaps it would be better to use a dumb one:
- */
- memset((void *)a, b, c);
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index ad5441ed1b57..f82e884928af 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -20,12 +20,11 @@
/*
* memcpy_c() - fast string ops (REP MOVSQ) based variant.
*
- * Calls to this get patched into the kernel image via the
+ * This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
- ALIGN
-memcpy_c:
- CFI_STARTPROC
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c:
movq %rdi, %rax
movl %edx, %ecx
@@ -35,8 +34,8 @@ memcpy_c:
movl %edx, %ecx
rep movsb
ret
- CFI_ENDPROC
-ENDPROC(memcpy_c)
+.Lmemcpy_e:
+ .previous
ENTRY(__memcpy)
ENTRY(memcpy)
@@ -128,16 +127,10 @@ ENDPROC(__memcpy)
* It is also a lot simpler. Use this when possible:
*/
- .section .altinstr_replacement, "ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
-2:
- .previous
-
.section .altinstructions, "a"
.align 8
.quad memcpy
- .quad 1b
+ .quad .Lmemcpy_c
.byte X86_FEATURE_REP_GOOD
/*
@@ -145,6 +138,6 @@ ENDPROC(__memcpy)
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte 2b - 1b
- .byte 2b - 1b
+ .byte .Lmemcpy_e - .Lmemcpy_c
+ .byte .Lmemcpy_e - .Lmemcpy_c
.previous
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd2..e88d3b81644a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -12,9 +12,8 @@
*
* rax original destination
*/
- ALIGN
-memset_c:
- CFI_STARTPROC
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c:
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
@@ -29,8 +28,8 @@ memset_c:
rep stosb
movq %r9,%rax
ret
- CFI_ENDPROC
-ENDPROC(memset_c)
+.Lmemset_e:
+ .previous
ENTRY(memset)
ENTRY(__memset)
@@ -118,16 +117,11 @@ ENDPROC(__memset)
#include <asm/cpufeature.h>
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (memset_c - memset) - (2f - 1b) /* offset */
-2:
- .previous
.section .altinstructions,"a"
.align 8
.quad memset
- .quad 1b
+ .quad .Lmemset_c
.byte X86_FEATURE_REP_GOOD
.byte .Lfinal - memset
- .byte 2b - 1b
+ .byte .Lmemset_e - .Lmemset_c
.previous
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 000000000000..a6b1b86d2253
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,204 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+static void __rdmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ rdmsr(rv->msr_no, reg->l, reg->h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ wrmsr(rv->msr_no, reg->l, reg->h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_on_cpu);
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
+ struct msr *msrs,
+ void (*msr_func) (void *info))
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask))
+ msr_func(&rv);
+
+ smp_call_function_many(mask, msr_func, &rv, 1);
+ put_cpu();
+}
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
+
+/*
+ * These variants are significantly slower, but allows control over
+ * the entire 32-bit GPR set.
+ */
+static void __rdmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+}
+
+static void __wrmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = wrmsr_safe_regs(rv->regs);
+}
+
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
+
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 33a1e3ca22d8..8f8eebdca7d4 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,226 +1,23 @@
#include <linux/module.h>
#include <linux/preempt.h>
-#include <linux/smp.h>
#include <asm/msr.h>
-struct msr_info {
- u32 msr_no;
- struct msr reg;
- struct msr *msrs;
- int off;
- int err;
-};
-
-static void __rdmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
- struct msr *reg;
- int this_cpu = raw_smp_processor_id();
-
- if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
- else
- reg = &rv->reg;
-
- rdmsr(rv->msr_no, reg->l, reg->h);
-}
-
-static void __wrmsr_on_cpu(void *info)
-{
- struct msr_info *rv = info;
- struct msr *reg;
- int this_cpu = raw_smp_processor_id();
-
- if (rv->msrs)
- reg = &rv->msrs[this_cpu - rv->off];
- else
- reg = &rv->reg;
-
- wrmsr(rv->msr_no, reg->l, reg->h);
-}
-
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
- int err;
- struct msr_info rv;
-
- memset(&rv, 0, sizeof(rv));
-
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
- *l = rv.reg.l;
- *h = rv.reg.h;
-
- return err;
-}
-EXPORT_SYMBOL(rdmsr_on_cpu);
-
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
- int err;
- struct msr_info rv;
-
- memset(&rv, 0, sizeof(rv));
-
- rv.msr_no = msr_no;
- rv.reg.l = l;
- rv.reg.h = h;
- err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-
- return err;
-}
-EXPORT_SYMBOL(wrmsr_on_cpu);
-
-/* rdmsr on a bunch of CPUs
- *
- * @mask: which CPUs
- * @msr_no: which MSR
- * @msrs: array of MSR values
- *
- */
-void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
-{
- struct msr_info rv;
- int this_cpu;
-
- memset(&rv, 0, sizeof(rv));
-
- rv.off = cpumask_first(mask);
- rv.msrs = msrs;
- rv.msr_no = msr_no;
-
- this_cpu = get_cpu();
-
- if (cpumask_test_cpu(this_cpu, mask))
- __rdmsr_on_cpu(&rv);
-
- smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
- put_cpu();
-}
-EXPORT_SYMBOL(rdmsr_on_cpus);
-
-/*
- * wrmsr on a bunch of CPUs
- *
- * @mask: which CPUs
- * @msr_no: which MSR
- * @msrs: array of MSR values
- *
- */
-void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
-{
- struct msr_info rv;
- int this_cpu;
-
- memset(&rv, 0, sizeof(rv));
-
- rv.off = cpumask_first(mask);
- rv.msrs = msrs;
- rv.msr_no = msr_no;
-
- this_cpu = get_cpu();
-
- if (cpumask_test_cpu(this_cpu, mask))
- __wrmsr_on_cpu(&rv);
-
- smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
- put_cpu();
-}
-EXPORT_SYMBOL(wrmsr_on_cpus);
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
-static void __rdmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
-}
-
-static void __wrmsr_safe_on_cpu(void *info)
-{
- struct msr_info *rv = info;
-
- rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
-}
-
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+struct msr *msrs_alloc(void)
{
- int err;
- struct msr_info rv;
+ struct msr *msrs = NULL;
- memset(&rv, 0, sizeof(rv));
+ msrs = alloc_percpu(struct msr);
+ if (!msrs) {
+ pr_warning("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.reg.l;
- *h = rv.reg.h;
-
- return err ? err : rv.err;
+ return msrs;
}
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+EXPORT_SYMBOL(msrs_alloc);
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+void msrs_free(struct msr *msrs)
{
- int err;
- struct msr_info rv;
-
- memset(&rv, 0, sizeof(rv));
-
- rv.msr_no = msr_no;
- rv.reg.l = l;
- rv.reg.h = h;
- err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
-
- return err ? err : rv.err;
-}
-EXPORT_SYMBOL(wrmsr_safe_on_cpu);
-
-/*
- * These variants are significantly slower, but allows control over
- * the entire 32-bit GPR set.
- */
-struct msr_regs_info {
- u32 *regs;
- int err;
-};
-
-static void __rdmsr_safe_regs_on_cpu(void *info)
-{
- struct msr_regs_info *rv = info;
-
- rv->err = rdmsr_safe_regs(rv->regs);
-}
-
-static void __wrmsr_safe_regs_on_cpu(void *info)
-{
- struct msr_regs_info *rv = info;
-
- rv->err = wrmsr_safe_regs(rv->regs);
-}
-
-int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
-{
- int err;
- struct msr_regs_info rv;
-
- rv.regs = regs;
- rv.err = -EIO;
- err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
-
- return err ? err : rv.err;
-}
-EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
-
-int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
-{
- int err;
- struct msr_regs_info rv;
-
- rv.regs = regs;
- rv.err = -EIO;
- err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
-
- return err ? err : rv.err;
+ free_percpu(msrs);
}
-EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
+EXPORT_SYMBOL(msrs_free);
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..15acecf0d7aa
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ * call_rwsem_down_write_failed
+ * call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#define save_common_regs \
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rcx; \
+ pushq %r8; \
+ pushq %r9; \
+ pushq %r10; \
+ pushq %r11
+
+#define restore_common_regs \
+ popq %r11; \
+ popq %r10; \
+ popq %r9; \
+ popq %r8; \
+ popq %rcx; \
+ popq %rsi; \
+ popq %rdi
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+ save_common_regs
+ pushq %rdx
+ movq %rax,%rdi
+ call rwsem_down_read_failed
+ popq %rdx
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_down_write_failed
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+ decw %dx /* do nothing if still outstanding active readers */
+ jnz 1f
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_wake
+ restore_common_regs
+1: ret
+ ENDPROC(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake)
+ save_common_regs
+ pushq %rdx
+ movq %rax,%rdi
+ call rwsem_downgrade_wake
+ popq %rdx
+ restore_common_regs
+ ret
+ ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
* data to the requested size using zero bytes.
*/
unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
+_copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
memset(to, 0, n);
return n;
}
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(_copy_from_user);
+
+void copy_from_user_overflow(void)
+{
+ WARN(1, "Buffer overflow detected!\n");
+}
+EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000000..a793da5e560e
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
+# x86 Opcode Maps
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+# (VEX): this opcode can accept VEX prefix.
+# (oVEX): this opcode requires VEX prefix.
+# (o128): this opcode only supports 128bit VEX.
+# (o256): this opcode only supports 256bit VEX.
+#
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAx,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+af: SCAS/W/D/Q rAX,Xv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c6: Grp11 Eb,Ib (1A)
+c7: Grp11 Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix)
+f3: REP/REPE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+0d: NOP Ev | GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+18: Grp16 (1A)
+19:
+1a:
+1b:
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
+75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
+76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
+77: emms/vzeroupper/vzeroall (VEX)
+78: VMREAD Ed/q,Gd/q
+79: VMWRITE Gd/q,Ed/q
+7a:
+7b:
+7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
+7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
+7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
+7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JNAE/JC Jz (f64)
+83: JNB/JAE/JNC Jz (f64)
+84: JZ/JE Jz (f64)
+85: JNZ/JNE Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JNBE/JA Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev
+bd: BSR Gv,Ev
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
+c3: movnti Md/q,Gd/q
+c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
+c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
+c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
+d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
+d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
+d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
+d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
+d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
+d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
+d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
+d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
+da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
+db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
+dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
+dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
+de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
+df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
+e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
+e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
+e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
+e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
+e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
+e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
+e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
+e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
+e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
+ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
+eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
+ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
+ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
+ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
+ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xf0-0xff
+f0: lddqu Vdq,Mdq (F2),(VEX)
+f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
+f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
+f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
+f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
+f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
+f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
+f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
+f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
+f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
+fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
+fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
+fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
+fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
+fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
+01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
+02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
+03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
+04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
+05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
+06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
+07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
+08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
+09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
+0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
+0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
+0c: Vpermilps /r (66),(oVEX)
+0d: Vpermilpd /r (66),(oVEX)
+0e: vtestps /r (66),(oVEX)
+0f: vtestpd /r (66),(oVEX)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13:
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16:
+17: ptest Vdq,Wdq (66),(VEX)
+18: vbroadcastss /r (66),(oVEX)
+19: vbroadcastsd /r (66),(oVEX),(o256)
+1a: vbroadcastf128 /r (66),(oVEX),(o256)
+1b:
+1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
+1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
+1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
+21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
+22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
+23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
+24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
+25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+26:
+27:
+28: pmuldq Vdq,Wdq (66),(VEX),(o128)
+29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
+2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
+2b: packusdw Vdq,Wdq (66),(VEX),(o128)
+2c: vmaskmovps(ld) /r (66),(oVEX)
+2d: vmaskmovpd(ld) /r (66),(oVEX)
+2e: vmaskmovps(st) /r (66),(oVEX)
+2f: vmaskmovpd(st) /r (66),(oVEX)
+# 0x0f 0x38 0x30-0x3f
+30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
+31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
+32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
+33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
+34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
+35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
+36:
+37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
+38: pminsb Vdq,Wdq (66),(VEX),(o128)
+39: pminsd Vdq,Wdq (66),(VEX),(o128)
+3a: pminuw Vdq,Wdq (66),(VEX),(o128)
+3b: pminud Vdq,Wdq (66),(VEX),(o128)
+3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
+3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
+3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
+3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0x38 0x40-0x8f
+40: pmulld Vdq,Wdq (66),(VEX),(o128)
+41: phminposuw Vdq,Wdq (66),(VEX),(o128)
+80: INVEPT Gd/q,Mdq (66)
+81: INVPID Gd/q,Mdq (66)
+# 0x0f 0x38 0x90-0xbf (FMA)
+96: vfmaddsub132pd/ps /r (66),(VEX)
+97: vfmsubadd132pd/ps /r (66),(VEX)
+98: vfmadd132pd/ps /r (66),(VEX)
+99: vfmadd132sd/ss /r (66),(VEX),(o128)
+9a: vfmsub132pd/ps /r (66),(VEX)
+9b: vfmsub132sd/ss /r (66),(VEX),(o128)
+9c: vfnmadd132pd/ps /r (66),(VEX)
+9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
+9e: vfnmsub132pd/ps /r (66),(VEX)
+9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
+a6: vfmaddsub213pd/ps /r (66),(VEX)
+a7: vfmsubadd213pd/ps /r (66),(VEX)
+a8: vfmadd213pd/ps /r (66),(VEX)
+a9: vfmadd213sd/ss /r (66),(VEX),(o128)
+aa: vfmsub213pd/ps /r (66),(VEX)
+ab: vfmsub213sd/ss /r (66),(VEX),(o128)
+ac: vfnmadd213pd/ps /r (66),(VEX)
+ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
+ae: vfnmsub213pd/ps /r (66),(VEX)
+af: vfnmsub213sd/ss /r (66),(VEX),(o128)
+b6: vfmaddsub231pd/ps /r (66),(VEX)
+b7: vfmsubadd231pd/ps /r (66),(VEX)
+b8: vfmadd231pd/ps /r (66),(VEX)
+b9: vfmadd231sd/ss /r (66),(VEX),(o128)
+ba: vfmsub231pd/ps /r (66),(VEX)
+bb: vfmsub231sd/ss /r (66),(VEX),(o128)
+bc: vfnmadd231pd/ps /r (66),(VEX)
+bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
+be: vfnmsub231pd/ps /r (66),(VEX)
+bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+# 0x0f 0x38 0xc0-0xff
+db: aesimc Vdq,Wdq (66),(VEX),(o128)
+dc: aesenc Vdq,Wdq (66),(VEX),(o128)
+dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
+de: aesdec Vdq,Wdq (66),(VEX),(o128)
+df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
+f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
+f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+04: vpermilps /r,Ib (66),(oVEX)
+05: vpermilpd /r,Ib (66),(oVEX)
+06: vperm2f128 /r,Ib (66),(oVEX),(o256)
+08: roundps Vdq,Wdq,Ib (66),(VEX)
+09: roundpd Vdq,Wdq,Ib (66),(VEX)
+0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
+0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
+0c: blendps Vdq,Wdq,Ib (66),(VEX)
+0d: blendpd Vdq,Wdq,Ib (66),(VEX)
+0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
+0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
+14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
+15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
+16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
+17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
+18: vinsertf128 /r,Ib (66),(oVEX),(o256)
+19: vextractf128 /r,Ib (66),(oVEX),(o256)
+20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
+21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
+22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
+40: dpps Vdq,Wdq,Ib (66),(VEX)
+41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
+42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
+44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
+4a: vblendvps /r,Ib (66),(oVEX)
+4b: vblendvpd /r,Ib (66),(oVEX)
+4c: vpblendvb /r,Ib (66),(oVEX),(o128)
+60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
+61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
+62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
+63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
+df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Ep
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
+7: VMPTRST Mq
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+GrpTable: Grp11
+0: MOV
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
+4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
+6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
+4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
+6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
+3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
+6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
+7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp15
+0: fxsave
+1: fxstor
+2: ldmxcsr (VEX)
+3: stmxcsr (VEX)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
return 0;
}
-
-#ifdef CONFIG_X86_64
-/*
- * Need to defined our own search_extable on X86_64 to work around
- * a B stepping K8 bug.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- /* B stepping K8 bug */
- if ((value >> 32) == 0)
- value |= 0xffffffffUL << 32;
-
- while (first <= last) {
- const struct exception_table_entry *mid;
- long diff;
-
- mid = (last - first) / 2 + first;
- diff = mid->insn - value;
- if (diff == 0)
- return mid;
- else if (diff < 0)
- first = mid+1;
- else
- last = mid-1;
- }
- return NULL;
-}
-#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
return 0;
}
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
int ret = 0;
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
*
* Handle a fault on the vmalloc or module mapping area
*/
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
*
* This assumes no large pages in there.
*/
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
show_fault_oops(regs, error_code, address);
stackend = end_of_stack(tsk);
- if (*stackend != STACK_END_MAGIC)
+ if (tsk != &init_task && *stackend != STACK_END_MAGIC)
printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*/
-static noinline int
+static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bca13cb..738e6593799d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep)
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atoimcally,
+ * any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a pte will only either go from not
* present to present, or present to not present or both -- it will not
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..e71c5cbc8f35 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
use_gbpages = direct_gbpages;
#endif
- set_nx();
- if (nx_enabled)
- printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
@@ -270,16 +266,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
if (!after_bootmem)
find_early_table_space(end, use_pse, use_gbpages);
-#ifdef CONFIG_X86_32
- for (i = 0; i < nr_range; i++)
- kernel_physical_mapping_init(mr[i].start, mr[i].end,
- mr[i].page_size_mask);
- ret = end;
-#else /* CONFIG_X86_64 */
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#endif
#ifdef CONFIG_X86_32
early_ioremap_page_table_range_init();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..5cb3f0f54f47 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
unsigned long page_size_mask)
{
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long last_map_addr = end;
unsigned long start_pfn, end_pfn;
pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
prot = PAGE_KERNEL_EXEC;
pages_4k++;
- if (mapping_iter == 1)
+ if (mapping_iter == 1) {
set_pte(pte, pfn_pte(pfn, init_prot));
- else
+ last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+ } else
set_pte(pte, pfn_pte(pfn, prot));
}
}
@@ -368,7 +370,7 @@ repeat:
mapping_iter = 2;
goto repeat;
}
- return 0;
+ return last_map_addr;
}
pte_t *kmap_pte;
@@ -412,7 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __init add_one_highpage_init(struct page *page, int pfn)
+static void __init add_one_highpage_init(struct page *page)
{
ClearPageReserved(page);
init_page_count(page);
@@ -445,7 +447,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
if (!pfn_valid(node_pfn))
continue;
page = pfn_to_page(node_pfn);
- add_one_highpage_init(page, node_pfn);
+ add_one_highpage_init(page);
}
return 0;
@@ -703,8 +705,8 @@ void __init find_low_pfn_range(void)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn,
- unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+ int acpi, int k8)
{
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+#ifndef CONFIG_NO_BOOTMEM
static unsigned long __init setup_node_bootmem(int nodeid,
unsigned long start_pfn,
unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
nodeid, bootmap, bootmap + bootmap_size);
free_bootmem_with_active_regions(nodeid, end_pfn);
- early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
return bootmap + bootmap_size;
}
+#endif
void __init setup_bootmem_allocator(void)
{
+#ifndef CONFIG_NO_BOOTMEM
int nodeid;
unsigned long bootmap_size, bootmap;
/*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+#endif
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+#ifndef CONFIG_NO_BOOTMEM
for_each_online_node(nodeid) {
unsigned long start_pfn, end_pfn;
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
bootmap);
}
+#endif
after_bootmem = 1;
}
@@ -892,8 +899,7 @@ void __init mem_init(void)
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10,
- (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
- );
+ totalhigh_pages << (PAGE_SHIFT-10));
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
@@ -997,7 +1003,7 @@ static noinline int do_test_wp_bit(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..e9b040e1cde5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -49,6 +49,7 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
+#include <linux/bootmem.h>
static unsigned long dma_reserve __initdata;
@@ -568,8 +569,10 @@ kernel_physical_mapping_init(unsigned long start,
}
#ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+ int acpi, int k8)
{
+#ifndef CONFIG_NO_BOOTMEM
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -577,13 +580,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+#else
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#endif
}
#endif
@@ -615,6 +620,21 @@ void __init paging_init(void)
*/
#ifdef CONFIG_MEMORY_HOTPLUG
/*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+ unsigned long end_pfn = PFN_UP(start + size);
+
+ if (end_pfn > max_pfn) {
+ max_pfn = end_pfn;
+ max_low_pfn = end_pfn;
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ }
+}
+
+/*
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
@@ -633,6 +653,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start, size);
+
return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
@@ -694,12 +717,12 @@ void __init mem_init(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly;
void set_kernel_text_rw(void)
{
- unsigned long start = PFN_ALIGN(_stext);
- unsigned long end = PFN_ALIGN(__start_rodata);
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
if (!kernel_set_to_readonly)
return;
@@ -707,13 +730,18 @@ void set_kernel_text_rw(void)
pr_debug("Set kernel text: %lx - %lx for read write\n",
start, end);
+ /*
+ * Make the kernel identity mapping for text RW. Kernel text
+ * mapping will always be RO. Refer to the comment in
+ * static_protections() in pageattr.c
+ */
set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}
void set_kernel_text_ro(void)
{
- unsigned long start = PFN_ALIGN(_stext);
- unsigned long end = PFN_ALIGN(__start_rodata);
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
if (!kernel_set_to_readonly)
return;
@@ -721,14 +749,21 @@ void set_kernel_text_ro(void)
pr_debug("Set kernel text: %lx - %lx for read only\n",
start, end);
+ /*
+ * Set the kernel identity mapping for text RO.
+ */
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}
void mark_rodata_ro(void)
{
- unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+ unsigned long start = PFN_ALIGN(_text);
unsigned long rodata_start =
((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+ unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
+ unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
+ unsigned long data_start = (unsigned long) &_sdata;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -751,6 +786,14 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: again\n");
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
+
+ free_init_pages("unused kernel memory",
+ (unsigned long) page_address(virt_to_page(text_end)),
+ (unsigned long)
+ page_address(virt_to_page(rodata_start)));
+ free_init_pages("unused kernel memory",
+ (unsigned long) page_address(virt_to_page(rodata_end)),
+ (unsigned long) page_address(virt_to_page(data_start)));
}
#endif
@@ -934,7 +977,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (pmd_none(*pmd)) {
pte_t entry;
- p = vmemmap_alloc_block(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
if (!p)
return -ENOMEM;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 334e63ca7b2b..5eb1ba74a3a9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
#include "physaddr.h"
-int page_is_ram(unsigned long pagenr)
-{
- resource_size_t addr, end;
- int i;
-
- /*
- * A special case is the first 4Kb of memory;
- * This is a BIOS owned area, not kernel ram, but generally
- * not listed as such in the E820 table.
- */
- if (pagenr == 0)
- return 0;
-
- /*
- * Second special case: Some BIOSen report the PC BIOS
- * area (640->1Mb) as ram even though it is not.
- */
- if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
- pagenr < (BIOS_END >> PAGE_SHIFT))
- return 0;
-
- for (i = 0; i < e820.nr_map; i++) {
- /*
- * Not usable memory:
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
- end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
-
-
- if ((pagenr >= addr) && (pagenr < end))
- return 1;
- }
- return 0;
-}
-
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
@@ -170,8 +133,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
prot_val, new_prot_val);
- free_memtype(phys_addr, phys_addr + size);
- return NULL;
+ goto err_free_memtype;
}
prot_val = new_prot_val;
}
@@ -197,26 +159,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
*/
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
- return NULL;
+ goto err_free_memtype;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
- free_memtype(phys_addr, phys_addr + size);
- free_vm_area(area);
- return NULL;
- }
+ if (kernel_map_sync_memtype(phys_addr, size, prot_val))
+ goto err_free_area;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
- free_memtype(phys_addr, phys_addr + size);
- free_vm_area(area);
- return NULL;
- }
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
return ret_addr;
+err_free_area:
+ free_vm_area(area);
+err_free_memtype:
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
}
/**
@@ -283,30 +244,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
}
EXPORT_SYMBOL(ioremap_cache);
-static void __iomem *ioremap_default(resource_size_t phys_addr,
- unsigned long size)
-{
- unsigned long flags;
- void __iomem *ret;
- int err;
-
- /*
- * - WB for WB-able memory and no other conflicting mappings
- * - UC_MINUS for non-WB-able memory with no other conflicting mappings
- * - Inherit from confliting mappings otherwise
- */
- err = reserve_memtype(phys_addr, phys_addr + size,
- _PAGE_CACHE_WB, &flags);
- if (err < 0)
- return NULL;
-
- ret = __ioremap_caller(phys_addr, size, flags,
- __builtin_return_address(0));
-
- free_memtype(phys_addr, phys_addr + size);
- return ret;
-}
-
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
unsigned long prot_val)
{
@@ -382,7 +319,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- addr = (void __force *)ioremap_default(start, PAGE_SIZE);
+ addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
@@ -448,6 +385,10 @@ void __init early_ioremap_init(void)
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
+#define __FIXADDR_TOP (-PAGE_SIZE)
+ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+ != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
printk(KERN_WARNING "pmd %p != %p\n",
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
#include <asm/apic.h>
#include <asm/k8.h>
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
static __init int find_northbridge(void)
{
int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
* need to get boot_cpu_id so can use that to create apicid_to_node
* in k8_scan_nodes()
*/
- /*
- * Find possible boot-time SMP configuration:
- */
-#ifdef CONFIG_X86_MPPARSE
- early_find_smp_config();
-#endif
-#ifdef CONFIG_ACPI
- /*
- * Read APIC information from ACPI tables.
- */
- early_acpi_boot_init();
-#endif
#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
early_init_lapic_mapping();
}
-int __init k8_scan_nodes(unsigned long start, unsigned long end)
+int __init k8_get_nodes(struct bootnode *physnodes)
{
- unsigned numnodes, cores, bits, apicid_base;
+ int i;
+ int ret = 0;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[ret].start = nodes[i].start;
+ physnodes[ret].end = nodes[i].end;
+ ret++;
+ }
+ return ret;
+}
+
+int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long start = PFN_PHYS(start_pfn);
+ unsigned long end = PFN_PHYS(end_pfn);
+ unsigned numnodes;
unsigned long prevbase;
- struct bootnode nodes[8];
- int i, j, nb, found = 0;
+ int i, nb, found = 0;
u32 nodeid, reg;
if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (nb < 0)
return nb;
- printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+ pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
return -1;
- printk(KERN_INFO "Number of nodes %d\n", numnodes);
+ pr_info("Number of physical nodes %d\n", numnodes);
- memset(&nodes, 0, sizeof(nodes));
prevbase = 0;
for (i = 0; i < 8; i++) {
unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
- printk("Skipping disabled node %d\n", i);
+ pr_info("Skipping disabled node %d\n", i);
continue;
}
if (nodeid >= numnodes) {
- printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
- base, limit);
+ pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+ base, limit);
continue;
}
if (!limit) {
- printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
- i, base);
+ pr_info("Skipping node entry %d (base %lx)\n",
+ i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
- printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
- nodeid, (base>>8)&3, (limit>>8) & 3);
+ pr_err("Node %d using interleaving mode %lx/%lx\n",
+ nodeid, (base >> 8) & 3, (limit >> 8) & 3);
return -1;
}
- if (node_isset(nodeid, node_possible_map)) {
- printk(KERN_INFO "Node %d already present. Skipping\n",
- nodeid);
+ if (node_isset(nodeid, nodes_parsed)) {
+ pr_info("Node %d already present, skipping\n",
+ nodeid);
continue;
}
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
limit |= (1<<24)-1;
limit++;
- if (limit > max_pfn << PAGE_SHIFT)
- limit = max_pfn << PAGE_SHIFT;
+ if (limit > end)
+ limit = end;
if (limit <= base)
continue;
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (limit > end)
limit = end;
if (limit == base) {
- printk(KERN_ERR "Empty node %d\n", nodeid);
+ pr_err("Empty node %d\n", nodeid);
continue;
}
if (limit < base) {
- printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+ pr_err("Node %d bogus settings %lx-%lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
- printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+ pr_err("Node map not sorted %lx,%lx\n",
prevbase, base);
return -1;
}
- printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
- nodeid, base, limit);
+ pr_info("Node %d MemBase %016lx Limit %016lx\n",
+ nodeid, base, limit);
found++;
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
prevbase = base;
- node_set(nodeid, node_possible_map);
+ node_set(nodeid, nodes_parsed);
}
if (!found)
return -1;
+ return 0;
+}
+
+int __init k8_scan_nodes(void)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base;
+ int i;
+ BUG_ON(nodes_empty(nodes_parsed));
+ node_possible_map = nodes_parsed;
memnode_shift = compute_hash_shift(nodes, 8, NULL);
if (memnode_shift < 0) {
- printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+ pr_err("No NUMA node hash function found. Contact maintainer\n");
return -1;
}
- printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+ pr_info("Using node hash shift of %d\n", memnode_shift);
/* use the coreid bits from early_identify_cpu */
bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
/* need to get boot_cpu_id early for system with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
- printk(KERN_INFO "BSP APIC ID: %02x\n",
- boot_cpu_physical_apicid);
+ pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
}
- for (i = 0; i < 8; i++) {
- if (nodes[i].start == nodes[i].end)
- continue;
+ for_each_node_mask(i, node_possible_map) {
+ int j;
e820_register_active_regions(i,
nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 4901d0dafda6..af3b6c8a436f 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -106,26 +106,25 @@ void kmemcheck_error_recall(void)
switch (e->type) {
case KMEMCHECK_ERROR_INVALID_ACCESS:
- printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
- "from %s memory (%p)\n",
+ printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
8 * e->size, e->state < ARRAY_SIZE(desc) ?
desc[e->state] : "(invalid shadow state)",
(void *) e->address);
- printk(KERN_INFO);
+ printk(KERN_WARNING);
for (i = 0; i < SHADOW_COPY_SIZE; ++i)
- printk("%02x", e->memory_copy[i]);
- printk("\n");
+ printk(KERN_CONT "%02x", e->memory_copy[i]);
+ printk(KERN_CONT "\n");
- printk(KERN_INFO);
+ printk(KERN_WARNING);
for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
- printk(" %c", short_desc[e->shadow_copy[i]]);
+ printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
else
- printk(" ?");
+ printk(KERN_CONT " ?");
}
- printk("\n");
- printk(KERN_INFO "%*c\n", 2 + 2
+ printk(KERN_CONT "\n");
+ printk(KERN_WARNING "%*c\n", 2 + 2
* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
break;
case KMEMCHECK_ERROR_BUG:
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
if (!shadow)
return true;
- status = kmemcheck_shadow_test(shadow, size);
+ status = kmemcheck_shadow_test_all(shadow, size);
return status == KMEMCHECK_SHADOW_INITIALIZED;
}
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
{
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
uint8_t *x;
unsigned int i;
x = shadow;
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
/*
* Make sure _some_ bytes are initialized. Gcc frequently generates
* code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
+
+ return x[0];
#else
+ return kmemcheck_shadow_test_all(shadow, size);
+#endif
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
/* All bytes must be initialized. */
for (i = 0; i < size; ++i) {
if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
-#endif
return x[0];
}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
void *kmemcheck_shadow_lookup(unsigned long address);
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
+ unsigned int size);
void kmemcheck_shadow_set(void *shadow, unsigned int size);
#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..536fb6823366 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,6 +5,8 @@
* 2008 Pekka Paalanen <pq@iki.fi>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
@@ -136,7 +138,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
pte_t *pte = lookup_address(f->page, &level);
if (!pte) {
- pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
+ pr_err("no pte for page 0x%08lx\n", f->page);
return -1;
}
@@ -148,7 +150,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
clear_pte_presence(pte, clear, &f->old_presence);
break;
default:
- pr_err("kmmio: unexpected page level 0x%x.\n", level);
+ pr_err("unexpected page level 0x%x.\n", level);
return -1;
}
@@ -170,13 +172,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret;
- WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
+ WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
- pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
- f->page, f->count, !!f->old_presence);
+ pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
+ f->page, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
- WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
+ WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
+ f->page);
f->armed = true;
return ret;
}
@@ -203,7 +206,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
*/
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate
- * and they remain disabled thorough out this function.
+ * and they remain disabled throughout this function.
*/
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
@@ -240,24 +243,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
* condition needs handling by do_page_fault(), the
* page really not being present is the most common.
*/
- pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
- addr, smp_processor_id());
+ pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
if (!faultpage->old_presence)
- pr_info("kmmio: unexpected secondary hit for "
- "address 0x%08lx on CPU %d.\n", addr,
- smp_processor_id());
+ pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+ addr, smp_processor_id());
} else {
/*
* Prevent overwriting already in-flight context.
* This should not happen, let's hope disarming at
* least prevents a panic.
*/
- pr_emerg("kmmio: recursive probe hit on CPU %d, "
- "for address 0x%08lx. Ignoring.\n",
- smp_processor_id(), addr);
- pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
- ctx->addr);
+ pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+ smp_processor_id(), addr);
+ pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
disarm_kmmio_fault_page(faultpage);
}
goto no_kmmio_ctx;
@@ -302,7 +302,7 @@ no_kmmio:
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate
- * and they remain disabled thorough out this function.
+ * and they remain disabled throughout this function.
* This must always get called as the pair to kmmio_handler().
*/
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
@@ -316,8 +316,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
* something external causing them (f.e. using a debugger while
* mmio tracing enabled), or erroneous behaviour
*/
- pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
- smp_processor_id());
+ pr_warning("unexpected debug trap on CPU %d.\n",
+ smp_processor_id());
goto out;
}
@@ -425,7 +425,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
- pr_err("kmmio: Unable to set page fault.\n");
+ pr_err("Unable to set page fault.\n");
size += PAGE_SIZE;
}
out:
@@ -490,7 +490,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
* 2. remove_kmmio_fault_pages()
* Remove the pages from kmmio_page_table.
* 3. rcu_free_kmmio_fault_pages()
- * Actally free the kmmio_fault_page structs as with RCU.
+ * Actually free the kmmio_fault_page structs as with RCU.
*/
void unregister_kmmio_probe(struct kmmio_probe *p)
{
@@ -511,7 +511,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
if (!drelease) {
- pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
+ pr_crit("leaking kmmio_fault_page objects.\n");
return;
}
drelease->release_list = release_list;
@@ -538,10 +538,17 @@ static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
struct die_args *arg = args;
+ unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
- if (val == DIE_DEBUG && (arg->err & DR_STEP))
- if (post_kmmio_handler(arg->err, arg->regs) == 1)
+ if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
+ if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ *dr6_p &= ~DR_STEP;
return NOTIFY_STOP;
+ }
return NOTIFY_DONE;
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
- if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+ if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
static unsigned long mmap_base(void)
{
- unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+ unsigned long gap = rlimit(RLIMIT_STACK);
if (gap < MIN_GAP)
gap = MIN_GAP;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 132772a8ec57..34a3291ca103 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -19,6 +19,9 @@
*
* Derived from the read-mod example from relay-examples by Tom Zanussi.
*/
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
#define DEBUG 1
#include <linux/module.h>
@@ -36,8 +39,6 @@
#include "pf_in.h"
-#define NAME "mmiotrace: "
-
struct trap_reason {
unsigned long addr;
unsigned long ip;
@@ -96,17 +97,18 @@ static void print_pte(unsigned long address)
pte_t *pte = lookup_address(address, &level);
if (!pte) {
- pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
- __func__, address);
+ pr_err("Error in %s: no pte for page 0x%08lx\n",
+ __func__, address);
return;
}
if (level == PG_LEVEL_2M) {
- pr_emerg(NAME "4MB pages are not currently supported: "
- "0x%08lx\n", address);
+ pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+ address);
BUG();
}
- pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
+ pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+ address,
(unsigned long long)pte_val(*pte),
(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
}
@@ -118,22 +120,21 @@ static void print_pte(unsigned long address)
static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
{
const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
- pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
- "last fault for address: 0x%08lx\n",
- addr, my_reason->addr);
+ pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+ addr, my_reason->addr);
print_pte(addr);
print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
#ifdef __i386__
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
- regs->ax, regs->bx, regs->cx, regs->dx);
+ regs->ax, regs->bx, regs->cx, regs->dx);
pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
- regs->si, regs->di, regs->bp, regs->sp);
+ regs->si, regs->di, regs->bp, regs->sp);
#else
pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
- regs->ax, regs->cx, regs->dx);
+ regs->ax, regs->cx, regs->dx);
pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
- regs->si, regs->di, regs->bp, regs->sp);
+ regs->si, regs->di, regs->bp, regs->sp);
#endif
put_cpu_var(pf_reason);
BUG();
@@ -213,7 +214,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
/* this should always return the active_trace count to 0 */
my_reason->active_traces--;
if (my_reason->active_traces) {
- pr_emerg(NAME "unexpected post handler");
+ pr_emerg("unexpected post handler");
BUG();
}
@@ -244,7 +245,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
};
if (!trace) {
- pr_err(NAME "kmalloc failed in ioremap\n");
+ pr_err("kmalloc failed in ioremap\n");
return;
}
@@ -282,8 +283,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
if (!is_enabled()) /* recheck and proper locking in *_core() */
return;
- pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
- (unsigned long long)offset, size, addr);
+ pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+ (unsigned long long)offset, size, addr);
if ((filter_offset) && (offset != filter_offset))
return;
ioremap_trace_core(offset, size, addr);
@@ -301,7 +302,7 @@ static void iounmap_trace_core(volatile void __iomem *addr)
struct remap_trace *tmp;
struct remap_trace *found_trace = NULL;
- pr_debug(NAME "Unmapping %p.\n", addr);
+ pr_debug("Unmapping %p.\n", addr);
spin_lock_irq(&trace_lock);
if (!is_enabled())
@@ -363,9 +364,8 @@ static void clear_trace_list(void)
* Caller also ensures is_enabled() cannot change.
*/
list_for_each_entry(trace, &trace_list, list) {
- pr_notice(NAME "purging non-iounmapped "
- "trace @0x%08lx, size 0x%lx.\n",
- trace->probe.addr, trace->probe.len);
+ pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+ trace->probe.addr, trace->probe.len);
if (!nommiotrace)
unregister_kmmio_probe(&trace->probe);
}
@@ -387,7 +387,7 @@ static void enter_uniprocessor(void)
if (downed_cpus == NULL &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
- pr_notice(NAME "Failed to allocate mask\n");
+ pr_notice("Failed to allocate mask\n");
goto out;
}
@@ -395,20 +395,19 @@ static void enter_uniprocessor(void)
cpumask_copy(downed_cpus, cpu_online_mask);
cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
if (num_online_cpus() > 1)
- pr_notice(NAME "Disabling non-boot CPUs...\n");
+ pr_notice("Disabling non-boot CPUs...\n");
put_online_cpus();
for_each_cpu(cpu, downed_cpus) {
err = cpu_down(cpu);
if (!err)
- pr_info(NAME "CPU%d is down.\n", cpu);
+ pr_info("CPU%d is down.\n", cpu);
else
- pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
+ pr_err("Error taking CPU%d down: %d\n", cpu, err);
}
out:
if (num_online_cpus() > 1)
- pr_warning(NAME "multiple CPUs still online, "
- "may miss events.\n");
+ pr_warning("multiple CPUs still online, may miss events.\n");
}
/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
@@ -420,13 +419,13 @@ static void __ref leave_uniprocessor(void)
if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
return;
- pr_notice(NAME "Re-enabling CPUs...\n");
+ pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
err = cpu_up(cpu);
if (!err)
- pr_info(NAME "enabled CPU%d.\n", cpu);
+ pr_info("enabled CPU%d.\n", cpu);
else
- pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
+ pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
}
}
@@ -434,8 +433,8 @@ static void __ref leave_uniprocessor(void)
static void enter_uniprocessor(void)
{
if (num_online_cpus() > 1)
- pr_warning(NAME "multiple CPUs are online, may miss events. "
- "Suggest booting with maxcpus=1 kernel argument.\n");
+ pr_warning("multiple CPUs are online, may miss events. "
+ "Suggest booting with maxcpus=1 kernel argument.\n");
}
static void leave_uniprocessor(void)
@@ -450,13 +449,13 @@ void enable_mmiotrace(void)
goto out;
if (nommiotrace)
- pr_info(NAME "MMIO tracing disabled.\n");
+ pr_info("MMIO tracing disabled.\n");
kmmio_init();
enter_uniprocessor();
spin_lock_irq(&trace_lock);
atomic_inc(&mmiotrace_enabled);
spin_unlock_irq(&trace_lock);
- pr_info(NAME "enabled.\n");
+ pr_info("enabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}
@@ -475,7 +474,7 @@ void disable_mmiotrace(void)
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
kmmio_cleanup();
- pr_info(NAME "disabled.\n");
+ pr_info("disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
(ulong) node_remap_end_vaddr[nid]);
}
-void __init initmem_init(unsigned long start_pfn,
- unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+ int acpi, int k8)
{
int nid;
long kva_target_pfn;
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn,
for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+ NODE_DATA(nid)->node_id = nid;
+#ifndef CONFIG_NO_BOOTMEM
NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+#endif
}
setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size, align);
- void *ptr;
+ unsigned long mem;
+ /*
+ * put it on high as possible
+ * something will go with NODE_DATA
+ */
+ if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+ end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
+ if (mem != -1L)
+ return __va(mem);
+
+ /* extend the search scope */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ else
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
- ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
- if (ptr == NULL) {
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
- return NULL;
- }
- return ptr;
+
+ return NULL;
}
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
- unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- unsigned long bootmap_start, nodedata_phys;
- void *bootmap;
int nid;
+#ifndef CONFIG_NO_BOOTMEM
+ unsigned long bootmap_start, bootmap_pages, bootmap_size;
+ void *bootmap;
+#endif
if (!end)
return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
start = roundup(start, ZONE_ALIGN);
- printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+ printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
+ reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+ NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
+#ifndef CONFIG_NO_BOOTMEM
+ NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
/*
* Find a place for the bootmem map
* nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
* of alloc_bootmem, that could clash with reserved range
*/
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
- nid = phys_to_nid(nodedata_phys);
- if (nid == nodeid)
- bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
- else
- bootmap_start = roundup(start, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -239,12 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
- if (nodedata_phys < start || nodedata_phys >= end)
- free_bootmem(nodedata_phys, pgdat_size);
+ free_early(nodedata_phys, nodedata_phys + pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
+ reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+ "BOOTMAP");
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
@@ -253,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
bootmap_pages);
-
- free_bootmem_with_active_regions(nodeid, end);
-
- /*
- * convert early reserve to bootmem reserve earlier
- * otherwise early_node_mem could use early reserved mem
- * on previous node
- */
- early_res_to_bootmem(start, end);
-
- /*
- * in some case early_node_mem could use alloc_bootmem
- * to get range on other node, don't reserve that again
- */
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
- pgdat_size, BOOTMEM_DEFAULT);
nid = phys_to_nid(bootmap_start);
if (nid != nodeid)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+ free_bootmem_with_active_regions(nodeid, end);
+#endif
node_set_online(nodeid);
}
@@ -306,8 +309,71 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
static char *cmdline __initdata;
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+ int acpi, int k8)
+{
+ int nr_nodes = 0;
+ int ret = 0;
+ int i;
+
+#ifdef CONFIG_ACPI_NUMA
+ if (acpi)
+ nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+ if (k8)
+ nr_nodes = k8_get_nodes(physnodes);
+#endif
+ /*
+ * Basic sanity checking on the physical node map: there may be errors
+ * if the SRAT or K8 incorrectly reported the topology or the mem=
+ * kernel parameter is used.
+ */
+ for (i = 0; i < nr_nodes; i++) {
+ if (physnodes[i].start == physnodes[i].end)
+ continue;
+ if (physnodes[i].start > end) {
+ physnodes[i].end = physnodes[i].start;
+ continue;
+ }
+ if (physnodes[i].end < start) {
+ physnodes[i].start = physnodes[i].end;
+ continue;
+ }
+ if (physnodes[i].start < start)
+ physnodes[i].start = start;
+ if (physnodes[i].end > end)
+ physnodes[i].end = end;
+ }
+
+ /*
+ * Remove all nodes that have no memory or were truncated because of the
+ * limited address range.
+ */
+ for (i = 0; i < nr_nodes; i++) {
+ if (physnodes[i].start == physnodes[i].end)
+ continue;
+ physnodes[ret].start = physnodes[i].start;
+ physnodes[ret].end = physnodes[i].end;
+ ret++;
+ }
+
+ /*
+ * If no physical topology was detected, a single node is faked to cover
+ * the entire address space.
+ */
+ if (!ret) {
+ physnodes[ret].start = start;
+ physnodes[ret].end = end;
+ ret = 1;
+ }
+ return ret;
+}
+
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +381,9 @@ static char *cmdline __initdata;
* allocation past addr and -1 otherwise. addr is adjusted to be at
* the end of the node.
*/
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
- u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
{
int ret = 0;
-
nodes[nid].start = *addr;
*addr += size;
if (*addr >= max_addr) {
@@ -335,167 +399,234 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
}
/*
- * Splits num_nodes nodes up equally starting at node_start. The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
*/
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
- u64 max_addr, int node_start,
- int num_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+ int nr_phys_nodes, int nr_nodes)
{
- unsigned int big;
+ nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
+ int big;
+ int ret = 0;
int i;
- if (num_nodes <= 0)
+ if (nr_nodes <= 0)
return -1;
- if (num_nodes > MAX_NUMNODES)
- num_nodes = MAX_NUMNODES;
- size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
- num_nodes;
+ if (nr_nodes > MAX_NUMNODES) {
+ pr_info("numa=fake=%d too large, reducing to %d\n",
+ nr_nodes, MAX_NUMNODES);
+ nr_nodes = MAX_NUMNODES;
+ }
+
+ size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
- * of consolidating the leftovers.
+ * of consolidating the remainder.
*/
- big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
- FAKE_NODE_MIN_SIZE;
+ big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+ FAKE_NODE_MIN_SIZE;
- /* Round down to nearest FAKE_NODE_MIN_SIZE. */
size &= FAKE_NODE_MIN_HASH_MASK;
if (!size) {
- printk(KERN_ERR "Not enough memory for each node. "
- "NUMA emulation disabled.\n");
+ pr_err("Not enough memory for each node. "
+ "NUMA emulation disabled.\n");
return -1;
}
- for (i = node_start; i < num_nodes + node_start; i++) {
- u64 end = *addr + size;
+ for (i = 0; i < nr_phys_nodes; i++)
+ if (physnodes[i].start != physnodes[i].end)
+ node_set(i, physnode_mask);
- if (i < big)
- end += FAKE_NODE_MIN_SIZE;
- /*
- * The final node can have the remaining system RAM. Other
- * nodes receive roughly the same amount of available pages.
- */
- if (i == num_nodes + node_start - 1)
- end = max_addr;
- else
- while (end - *addr - e820_hole_size(*addr, end) <
- size) {
+ /*
+ * Continue to fill physical nodes with fake nodes until there is no
+ * memory left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 end = physnodes[i].start + size;
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+ if (ret < big)
+ end += FAKE_NODE_MIN_SIZE;
+
+ /*
+ * Continue to add memory to this fake node if its
+ * non-reserved memory is less than the per-node size.
+ */
+ while (end - physnodes[i].start -
+ e820_hole_size(physnodes[i].start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
- if (end > max_addr) {
- end = max_addr;
+ if (end > physnodes[i].end) {
+ end = physnodes[i].end;
break;
}
}
- if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
- break;
+
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (physnodes[i].end - end -
+ e820_hole_size(end, physnodes[i].end) < size)
+ end = physnodes[i].end;
+
+ /*
+ * Avoid allocating more nodes than requested, which can
+ * happen as a result of rounding down each node's size
+ * to FAKE_NODE_MIN_SIZE.
+ */
+ if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+ end = physnodes[i].end;
+
+ if (setup_node_range(ret++, &physnodes[i].start,
+ end - physnodes[i].start,
+ physnodes[i].end) < 0)
+ node_clear(i, physnode_mask);
+ }
}
- return i - node_start + 1;
+ return ret;
}
/*
- * Splits the remaining system RAM into chunks of size. The remaining memory is
- * always assigned to a final node and can be asymmetric. Returns the number of
- * nodes split.
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
*/
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
- u64 max_addr, int node_start, u64 size)
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
- int i = node_start;
- size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
- while (!setup_node_range(i++, nodes, addr, size, max_addr))
- ;
- return i - node_start;
+ u64 end = start + size;
+
+ while (end - start - e820_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > max_addr) {
+ end = max_addr;
+ break;
+ }
+ }
+ return end;
}
/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
*/
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
{
- u64 size, addr = start_pfn << PAGE_SHIFT;
- u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+ nodemask_t physnode_mask = NODE_MASK_NONE;
+ u64 min_size;
+ int ret = 0;
+ int i;
- memset(&nodes, 0, sizeof(nodes));
+ if (!size)
+ return -1;
/*
- * If the numa=fake command-line is just a single number N, split the
- * system RAM into N fake nodes.
+ * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+ * increased accordingly if the requested size is too small. This
+ * creates a uniform distribution of node sizes across the entire
+ * machine (but not necessarily over physical nodes).
*/
- if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
- long n = simple_strtol(cmdline, NULL, 0);
-
- num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
- if (num_nodes < 0)
- return num_nodes;
- goto out;
+ min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+ MAX_NUMNODES;
+ min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+ if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+ min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+ FAKE_NODE_MIN_HASH_MASK;
+ if (size < min_size) {
+ pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+ size >> 20, min_size >> 20);
+ size = min_size;
}
+ size &= FAKE_NODE_MIN_HASH_MASK;
- /* Parse the command line. */
- for (coeff_flag = 0; ; cmdline++) {
- if (*cmdline && isdigit(*cmdline)) {
- num = num * 10 + *cmdline - '0';
- continue;
- }
- if (*cmdline == '*') {
- if (num > 0)
- coeff = num;
- coeff_flag = 1;
- }
- if (!*cmdline || *cmdline == ',') {
- if (!coeff_flag)
- coeff = 1;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ if (physnodes[i].start != physnodes[i].end)
+ node_set(i, physnode_mask);
+ /*
+ * Fill physical nodes with fake nodes of size until there is no memory
+ * left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+ u64 end;
+
+ end = find_end_of_node(physnodes[i].start,
+ physnodes[i].end, size);
/*
- * Round down to the nearest FAKE_NODE_MIN_SIZE.
- * Command-line coefficients are in megabytes.
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
*/
- size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
- if (size)
- for (i = 0; i < coeff; i++, num_nodes++)
- if (setup_node_range(num_nodes, nodes,
- &addr, size, max_addr) < 0)
- goto done;
- if (!*cmdline)
- break;
- coeff_flag = 0;
- coeff = -1;
+ if (end < dma32_end && dma32_end - end -
+ e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (physnodes[i].end - end -
+ e820_hole_size(end, physnodes[i].end) < size)
+ end = physnodes[i].end;
+
+ /*
+ * Setup the fake node that will be allocated as bootmem
+ * later. If setup_node_range() returns non-zero, there
+ * is no more memory available on this physical node.
+ */
+ if (setup_node_range(ret++, &physnodes[i].start,
+ end - physnodes[i].start,
+ physnodes[i].end) < 0)
+ node_clear(i, physnode_mask);
}
- num = 0;
}
-done:
- if (!num_nodes)
- return -1;
- /* Fill remainder of system RAM, if appropriate. */
- if (addr < max_addr) {
- if (coeff_flag && coeff < 0) {
- /* Split remaining nodes into num-sized chunks */
- num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
- num_nodes, num);
- goto out;
- }
- switch (*(cmdline - 1)) {
- case '*':
- /* Split remaining nodes into coeff chunks */
- if (coeff <= 0)
- break;
- num_nodes += split_nodes_equally(nodes, &addr, max_addr,
- num_nodes, coeff);
- break;
- case ',':
- /* Do not allocate remaining system RAM */
- break;
- default:
- /* Give one final node */
- setup_node_range(num_nodes, nodes, &addr,
- max_addr - addr, max_addr);
- num_nodes++;
- }
+ return ret;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to last_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn,
+ unsigned long last_pfn, int acpi, int k8)
+{
+ u64 addr = start_pfn << PAGE_SHIFT;
+ u64 max_addr = last_pfn << PAGE_SHIFT;
+ int num_phys_nodes;
+ int num_nodes;
+ int i;
+
+ num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
+ /*
+ * If the numa=fake command-line contains a 'M' or 'G', it represents
+ * the fixed node size. Otherwise, if it is just a single number N,
+ * split the system RAM into N fake nodes.
+ */
+ if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+ u64 size;
+
+ size = memparse(cmdline, &cmdline);
+ num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+ } else {
+ unsigned long n;
+
+ n = simple_strtoul(cmdline, NULL, 0);
+ num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
}
-out:
+
+ if (num_nodes < 0)
+ return num_nodes;
memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
if (memnode_shift < 0) {
memnode_shift = 0;
@@ -505,14 +636,10 @@ out:
}
/*
- * We need to vacate all active ranges that may have been registered by
- * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
- * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
+ * We need to vacate all active ranges that may have been registered for
+ * the e820 memory map.
*/
remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
- acpi_numa = -1;
-#endif
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +651,8 @@ out:
}
#endif /* CONFIG_NUMA_EMU */
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+ int acpi, int k8)
{
int i;
@@ -532,23 +660,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, last_pfn))
+ if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_ACPI_NUMA
- if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
- last_pfn << PAGE_SHIFT))
+ if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+ last_pfn << PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_K8_NUMA
- if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
- last_pfn<<PAGE_SHIFT))
+ if (!numa_off && k8 && !k8_scan_nodes())
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -579,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
+#ifdef CONFIG_NO_BOOTMEM
+ pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
return pages;
}
@@ -601,6 +732,25 @@ static __init int numa_setup(char *opt)
early_param("numa", numa_setup);
#ifdef CONFIG_NUMA
+
+static __init int find_near_online_node(int node)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for_each_online_node(n) {
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ return best_node;
+}
+
/*
* Setup early cpu_to_node.
*
@@ -632,7 +782,7 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
if (!node_online(node))
- continue;
+ node = find_near_online_node(node);
numa_set_node(cpu, node);
}
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..cf07c26d9a4a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,43 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
__pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+ /*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large page aligned text, rodata sections
+ * will be always read-only. For the kernel identity mappings covering
+ * the holes caused by this alignment can be anything that user asks.
+ *
+ * This will preserve the large page mappings for kernel text/data
+ * at no extra cost.
+ */
+ if (kernel_set_to_readonly &&
+ within(address, (unsigned long)_text,
+ (unsigned long)__end_rodata_hpage_align)) {
+ unsigned int level;
+
+ /*
+ * Don't enforce the !RW mapping for the kernel text mapping,
+ * if the current mapping is already using small page mapping.
+ * No need to work hard to preserve large page mappings in this
+ * case.
+ *
+ * This also fixes the Linux Xen paravirt guest boot failure
+ * (because of unexpected read-only mappings for kernel identity
+ * mappings). In this paravirt guest case, the kernel text
+ * mapping and the kernel identity mapping share the same
+ * page-table pages. Thus we can't really use different
+ * protections for the kernel text and identity mappings. Also,
+ * these shared mappings are made of small page mappings.
+ * Thus this don't enforce !RW mapping for small page kernel
+ * text mapping logic will help Linux Xen parvirt guest boot
+ * aswell.
+ */
+ if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+ pgprot_val(forbidden) |= _PAGE_RW;
+ }
+#endif
+
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
return prot;
@@ -1069,12 +1106,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
int set_memory_x(unsigned long addr, int numpages)
{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..ae9648eb1c7f 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
* - _PAGE_CACHE_UC_MINUS
* - _PAGE_CACHE_UC
*
- * req_type will have a special case value '-1', when requester want to inherit
- * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
- *
* If new_type is NULL, function will return an error if it cannot reserve the
* region with req_type. If new_type is non-NULL, function will return
* available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
if (new_type) {
- if (req_type == -1)
- *new_type = _PAGE_CACHE_WB;
- else if (req_type == _PAGE_CACHE_WC)
+ if (req_type == _PAGE_CACHE_WC)
*new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
}
/* Low ISA region is always mapped WB in page table. No need to track */
- if (is_ISA_range(start, end - 1)) {
+ if (x86_platform.is_untracked_pat_range(start, end)) {
if (new_type)
*new_type = _PAGE_CACHE_WB;
return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
return 0;
/* Low ISA region is always mapped WB. No need to track */
- if (is_ISA_range(start, end - 1))
+ if (x86_platform.is_untracked_pat_range(start, end))
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
int rettype = _PAGE_CACHE_WB;
struct memtype *entry;
- if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
return rettype;
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -708,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
if (!range_is_allowed(pfn, size))
return 0;
- if (file->f_flags & O_SYNC) {
+ if (file->f_flags & O_DSYNC)
flags = _PAGE_CACHE_UC_MINUS;
- }
#ifdef CONFIG_X86_32
/*
@@ -1018,8 +1013,10 @@ static const struct file_operations memtype_fops = {
static int __init pat_memtype_list_init(void)
{
- debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
- NULL, &memtype_fops);
+ if (pat_enabled) {
+ debugfs_create_file("pat_memtype_list", S_IRUSR,
+ arch_debugfs_dir, NULL, &memtype_fops);
+ }
return 0;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..c9ba9deafe83 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,6 +6,14 @@
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +23,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
-#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
-#else
- pte = alloc_pages(PGALLOC_GFP, 0);
-#endif
+ pte = alloc_pages(__userpte_alloc_gfp, 0);
if (pte)
pgtable_page_ctor(pte);
return pte;
}
+static int __init setup_userpte(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /*
+ * "userpte=nohigh" disables allocation of user pagetables in
+ * high memory.
+ */
+ if (strcmp(arg, "nohigh") == 0)
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("userpte", setup_userpte);
+
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
#include <linux/init.h>
#include <asm/pgtable.h>
+#include <asm/proto.h>
-int nx_enabled;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static int disable_nx __cpuinitdata;
/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
}
+ x86_configure_nx();
return 0;
}
early_param("noexec", noexec_setup);
-#endif
-#ifdef CONFIG_X86_PAE
-void __init set_nx(void)
+void __cpuinit x86_configure_nx(void)
{
- unsigned int v[4], l, h;
-
- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+ if (cpu_has_nx && !disable_nx)
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
- if ((v[3] & (1 << 20)) && !disable_nx) {
- rdmsr(MSR_EFER, l, h);
- l |= EFER_NX;
- wrmsr(MSR_EFER, l, h);
- nx_enabled = 1;
- __supported_pte_mask |= _PAGE_NX;
+void __init x86_report_nx(void)
+{
+ if (!cpu_has_nx) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU or disabled in BIOS!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ if (disable_nx) {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "disabled by kernel command line option\n");
+ } else {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "active\n");
}
- }
-}
#else
-void set_nx(void)
-{
-}
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
#endif
-
-#ifdef CONFIG_X86_64
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || disable_nx)
- __supported_pte_mask &= ~_PAGE_NX;
+ }
}
-#endif
-
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 6f8aa33031c7..9324f13492d5 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void)
e820_register_active_regions(chunk->nid, chunk->start_pfn,
min(chunk->end_pfn, max_pfn));
}
+ /* for out of order entries in SRAT */
+ sort_node_map();
for_each_online_node(nid) {
unsigned long start = node_start_pfn[nid];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3b..28c68762648f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
}
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
- printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
}
@@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
}
- if (changed)
+ if (changed) {
+ node_set(node, cpu_nodes_parsed);
printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
nd->start, nd->end);
+ }
}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -290,8 +292,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
start, end);
- e820_register_active_regions(node, start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
update_nodes_add(node, start, end);
@@ -319,7 +319,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
unsigned long s = nodes[i].start >> PAGE_SHIFT;
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
- pxmram -= absent_pages_in_range(s, e);
+ pxmram -= __absent_pages_in_range(i, s, e);
if ((long)pxmram < 0)
pxmram = 0;
}
@@ -338,6 +338,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
+int __init acpi_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+ int ret = 0;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[ret].start = nodes[i].start;
+ physnodes[ret].end = nodes[i].end;
+ ret++;
+ }
+ return ret;
+}
+
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
@@ -350,11 +363,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
- if (!nodes_cover_memory(nodes)) {
- bad_srat();
- return -1;
- }
-
memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
memblk_nodeid);
if (memnode_shift < 0) {
@@ -364,6 +372,16 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
+ for_each_node_mask(i, nodes_parsed)
+ e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
+ /* for out of order entries in SRAT */
+ sort_node_map();
+ if (!nodes_cover_memory(nodes)) {
+ bad_srat();
+ return -1;
+ }
+
/* Account for nodes with cpus and no memory */
nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
@@ -454,7 +472,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
for (i = 0; i < num_nodes; i++)
if (fake_nodes[i].start != fake_nodes[i].end)
node_set(i, nodes_parsed);
- WARN_ON(!nodes_cover_memory(fake_nodes));
}
static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
/*
* Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mmiotrace.h>
-#define MODULE_NAME "testmmiotrace"
-
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
static void do_write_test(void __iomem *p)
{
unsigned int i;
- pr_info(MODULE_NAME ": write test.\n");
+ pr_info("write test.\n");
mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
{
unsigned int i;
unsigned errs[3] = { 0 };
- pr_info(MODULE_NAME ": read test.\n");
+ pr_info("read test.\n");
mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
static void do_read_far_test(void __iomem *p)
{
- pr_info(MODULE_NAME ": read far test.\n");
+ pr_info("read far test.\n");
mmiotrace_printk("Read far test.\n");
ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
{
void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
- pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
+ pr_err("could not ioremap, aborting.\n");
return;
}
mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
if (mmio_address == 0) {
- pr_err(MODULE_NAME ": you have to use the module argument "
- "mmio_address.\n");
- pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
- " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+ pr_err("you have to use the module argument mmio_address.\n");
+ pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
return -ENXIO;
}
- pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
- "address space, and writing 16 kB of rubbish in there.\n",
- size >> 10, mmio_address);
+ pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
do_test(size);
- pr_info(MODULE_NAME ": All done.\n");
+ pr_info("All done.\n");
return 0;
}
static void __exit cleanup(void)
{
- pr_debug(MODULE_NAME ": unloaded.\n");
+ pr_debug("unloaded.\n");
}
module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
@@ -40,10 +41,10 @@ union smp_flush_state {
struct {
struct mm_struct *flush_mm;
unsigned long flush_va;
- spinlock_t tlbstate_lock;
+ raw_spinlock_t tlbstate_lock;
DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
- char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+ char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;
/* State is put into the per CPU data section, but padded
@@ -180,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
* num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
* probably not worth checking this for a cache-hot lock.
*/
- spin_lock(&f->tlbstate_lock);
+ raw_spin_lock(&f->tlbstate_lock);
f->flush_mm = mm;
f->flush_va = va;
@@ -198,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
f->flush_mm = NULL;
f->flush_va = 0;
- spin_unlock(&f->tlbstate_lock);
+ raw_spin_unlock(&f->tlbstate_lock);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -222,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
int i;
for (i = 0; i < ARRAY_SIZE(flush_state); i++)
- spin_lock_init(&flush_state[i].tlbstate_lock);
+ raw_spin_lock_init(&flush_state[i].tlbstate_lock);
return 0;
}
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897be021f..3855096c59b8 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
static struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
- .stack = backtrace_stack,
- .address = backtrace_address,
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+ .walk_stack = print_context_stack,
};
struct frame_head {
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cb88b1a0bd5f..2c505ee71014 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -159,7 +159,7 @@ static int nmi_setup_mux(void)
for_each_possible_cpu(i) {
per_cpu(cpu_msrs, i).multiplex =
- kmalloc(multiplex_size, GFP_KERNEL);
+ kzalloc(multiplex_size, GFP_KERNEL);
if (!per_cpu(cpu_msrs, i).multiplex)
return 0;
}
@@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
if (counter_config[i].enabled) {
multiplex[i].saved = -(u64)counter_config[i].count;
} else {
- multiplex[i].addr = 0;
multiplex[i].saved = 0;
}
}
@@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
{
+ struct op_msr *counters = msrs->counters;
struct op_msr *multiplex = msrs->multiplex;
int i;
for (i = 0; i < model->num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
- if (multiplex[virt].addr)
- rdmsrl(multiplex[virt].addr, multiplex[virt].saved);
+ if (counters[i].addr)
+ rdmsrl(counters[i].addr, multiplex[virt].saved);
}
}
static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
{
+ struct op_msr *counters = msrs->counters;
struct op_msr *multiplex = msrs->multiplex;
int i;
for (i = 0; i < model->num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
- if (multiplex[virt].addr)
- wrmsrl(multiplex[virt].addr, multiplex[virt].saved);
+ if (counters[i].addr)
+ wrmsrl(counters[i].addr, multiplex[virt].saved);
}
}
@@ -222,7 +223,7 @@ static void nmi_cpu_switch(void *dummy)
/* move to next set */
si += model->num_counters;
- if ((si > model->num_virt_counters) || (counter_config[si].count == 0))
+ if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
per_cpu(switch_index, cpu) = 0;
else
per_cpu(switch_index, cpu) = si;
@@ -303,11 +304,11 @@ static int allocate_msrs(void)
int i;
for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
+ per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
GFP_KERNEL);
if (!per_cpu(cpu_msrs, i).counters)
return 0;
- per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
+ per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
GFP_KERNEL);
if (!per_cpu(cpu_msrs, i).controls)
return 0;
@@ -598,6 +599,7 @@ static int __init ppro_init(char **cpu_type)
case 15: case 23:
*cpu_type = "i386/core_2";
break;
+ case 0x2e:
case 26:
spec = &op_arch_perfmon_spec;
*cpu_type = "i386/core_i7";
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 39686c29f03a..090cbbec7dbd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -22,6 +22,9 @@
#include <asm/ptrace.h>
#include <asm/msr.h>
#include <asm/nmi.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include "op_x86_model.h"
#include "op_counter.h"
@@ -43,23 +46,10 @@
static unsigned long reset_value[NUM_VIRT_COUNTERS];
-#ifdef CONFIG_OPROFILE_IBS
-
-/* IbsFetchCtl bits/masks */
-#define IBS_FETCH_RAND_EN (1ULL<<57)
-#define IBS_FETCH_VAL (1ULL<<49)
-#define IBS_FETCH_ENABLE (1ULL<<48)
-#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
-
-/*IbsOpCtl bits */
-#define IBS_OP_CNT_CTL (1ULL<<19)
-#define IBS_OP_VAL (1ULL<<18)
-#define IBS_OP_ENABLE (1ULL<<17)
-
#define IBS_FETCH_SIZE 6
#define IBS_OP_SIZE 12
-static int has_ibs; /* AMD Family10h and later */
+static u32 ibs_caps;
struct op_ibs_config {
unsigned long op_enabled;
@@ -71,24 +61,52 @@ struct op_ibs_config {
};
static struct op_ibs_config ibs_config;
+static u64 ibs_op_ctl;
-#endif
+/*
+ * IBS cpuid feature detection
+ */
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+#define IBS_CPUID_FEATURES 0x8000001b
+
+/*
+ * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
+ * bit 0 is used to indicate the existence of IBS.
+ */
+#define IBS_CAPS_AVAIL (1LL<<0)
+#define IBS_CAPS_RDWROPCNT (1LL<<3)
+#define IBS_CAPS_OPCNT (1LL<<4)
-static void op_mux_fill_in_addresses(struct op_msrs * const msrs)
+/*
+ * IBS randomization macros
+ */
+#define IBS_RANDOM_BITS 12
+#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
+#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
+
+static u32 get_ibs_caps(void)
{
- int i;
+ u32 ibs_caps;
+ unsigned int max_level;
- for (i = 0; i < NUM_VIRT_COUNTERS; i++) {
- int hw_counter = op_x86_virt_to_phys(i);
- if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter;
- else
- msrs->multiplex[i].addr = 0;
- }
+ if (!boot_cpu_has(X86_FEATURE_IBS))
+ return 0;
+
+ /* check IBS cpuid feature flags */
+ max_level = cpuid_eax(0x80000000);
+ if (max_level < IBS_CPUID_FEATURES)
+ return IBS_CAPS_AVAIL;
+
+ ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
+ if (!(ibs_caps & IBS_CAPS_AVAIL))
+ /* cpuid flags not valid */
+ return IBS_CAPS_AVAIL;
+
+ return ibs_caps;
}
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
struct op_msrs const * const msrs)
{
@@ -98,7 +116,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
/* enable active counters */
for (i = 0; i < NUM_COUNTERS; ++i) {
int virt = op_x86_phys_to_virt(i);
- if (!counter_config[virt].enabled)
+ if (!reset_value[virt])
continue;
rdmsrl(msrs->controls[i].addr, val);
val &= model->reserved;
@@ -107,10 +125,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
}
}
-#else
-
-static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
-
#endif
/* functions for op_amd_spec */
@@ -122,18 +136,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
for (i = 0; i < NUM_COUNTERS; i++) {
if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- else
- msrs->counters[i].addr = 0;
}
for (i = 0; i < NUM_CONTROLS; i++) {
if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- else
- msrs->controls[i].addr = 0;
}
-
- op_mux_fill_in_addresses(msrs);
}
static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -144,7 +152,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
/* setup reset_value */
for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
- if (counter_config[i].enabled)
+ if (counter_config[i].enabled
+ && msrs->counters[op_x86_virt_to_phys(i)].addr)
reset_value[i] = counter_config[i].count;
else
reset_value[i] = 0;
@@ -152,9 +161,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
/* clear all counters */
for (i = 0; i < NUM_CONTROLS; ++i) {
- if (unlikely(!msrs->controls[i].addr))
+ if (unlikely(!msrs->controls[i].addr)) {
+ if (counter_config[i].enabled && !smp_processor_id())
+ /*
+ * counter is reserved, this is on all
+ * cpus, so report only for cpu #0
+ */
+ op_x86_warn_reserved(i);
continue;
+ }
rdmsrl(msrs->controls[i].addr, val);
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ op_x86_warn_in_use(i);
val &= model->reserved;
wrmsrl(msrs->controls[i].addr, val);
}
@@ -169,9 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
/* enable active counters */
for (i = 0; i < NUM_COUNTERS; ++i) {
int virt = op_x86_phys_to_virt(i);
- if (!counter_config[virt].enabled)
- continue;
- if (!msrs->counters[i].addr)
+ if (!reset_value[virt])
continue;
/* setup counter registers */
@@ -185,7 +201,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
}
}
-#ifdef CONFIG_OPROFILE_IBS
+/*
+ * 16-bit Linear Feedback Shift Register (LFSR)
+ *
+ * 16 14 13 11
+ * Feedback polynomial = X + X + X + X + 1
+ */
+static unsigned int lfsr_random(void)
+{
+ static unsigned int lfsr_value = 0xF00D;
+ unsigned int bit;
+
+ /* Compute next bit to shift in */
+ bit = ((lfsr_value >> 0) ^
+ (lfsr_value >> 2) ^
+ (lfsr_value >> 3) ^
+ (lfsr_value >> 5)) & 0x0001;
+
+ /* Advance to next register value */
+ lfsr_value = (lfsr_value >> 1) | (bit << 15);
+
+ return lfsr_value;
+}
+
+/*
+ * IBS software randomization
+ *
+ * The IBS periodic op counter is randomized in software. The lower 12
+ * bits of the 20 bit counter are randomized. IbsOpCurCnt is
+ * initialized with a 12 bit random value.
+ */
+static inline u64 op_amd_randomize_ibs_op(u64 val)
+{
+ unsigned int random = lfsr_random();
+
+ if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
+ /*
+ * Work around if the hw can not write to IbsOpCurCnt
+ *
+ * Randomize the lower 8 bits of the 16 bit
+ * IbsOpMaxCnt [15:0] value in the range of -128 to
+ * +127 by adding/subtracting an offset to the
+ * maximum count (IbsOpMaxCnt).
+ *
+ * To avoid over or underflows and protect upper bits
+ * starting at bit 16, the initial value for
+ * IbsOpMaxCnt must fit in the range from 0x0081 to
+ * 0xff80.
+ */
+ val += (s8)(random >> 4);
+ else
+ val |= (u64)(random & IBS_RANDOM_MASK) << 32;
+
+ return val;
+}
static inline void
op_amd_handle_ibs(struct pt_regs * const regs,
@@ -194,7 +263,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
u64 val, ctl;
struct op_entry entry;
- if (!has_ibs)
+ if (!ibs_caps)
return;
if (ibs_config.fetch_enabled) {
@@ -210,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
oprofile_write_commit(&entry);
/* reenable the IRQ */
- ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK);
+ ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
ctl |= IBS_FETCH_ENABLE;
wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
}
@@ -236,8 +305,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
oprofile_write_commit(&entry);
/* reenable the IRQ */
- ctl &= ~IBS_OP_VAL & 0xFFFFFFFF;
- ctl |= IBS_OP_ENABLE;
+ ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
}
}
@@ -246,41 +314,57 @@ op_amd_handle_ibs(struct pt_regs * const regs,
static inline void op_amd_start_ibs(void)
{
u64 val;
- if (has_ibs && ibs_config.fetch_enabled) {
- val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
+
+ if (!ibs_caps)
+ return;
+
+ if (ibs_config.fetch_enabled) {
+ val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;
val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
val |= IBS_FETCH_ENABLE;
wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
}
- if (has_ibs && ibs_config.op_enabled) {
- val = (ibs_config.max_cnt_op >> 4) & 0xFFFF;
- val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
- val |= IBS_OP_ENABLE;
+ if (ibs_config.op_enabled) {
+ ibs_op_ctl = ibs_config.max_cnt_op >> 4;
+ if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
+ /*
+ * IbsOpCurCnt not supported. See
+ * op_amd_randomize_ibs_op() for details.
+ */
+ ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
+ } else {
+ /*
+ * The start value is randomized with a
+ * positive offset, we need to compensate it
+ * with the half of the randomized range. Also
+ * avoid underflows.
+ */
+ ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
+ IBS_OP_MAX_CNT);
+ }
+ if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
+ ibs_op_ctl |= IBS_OP_CNT_CTL;
+ ibs_op_ctl |= IBS_OP_ENABLE;
+ val = op_amd_randomize_ibs_op(ibs_op_ctl);
wrmsrl(MSR_AMD64_IBSOPCTL, val);
}
}
static void op_amd_stop_ibs(void)
{
- if (has_ibs && ibs_config.fetch_enabled)
+ if (!ibs_caps)
+ return;
+
+ if (ibs_config.fetch_enabled)
/* clear max count and enable */
wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
- if (has_ibs && ibs_config.op_enabled)
+ if (ibs_config.op_enabled)
/* clear max count and enable */
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
-#else
-
-static inline void op_amd_handle_ibs(struct pt_regs * const regs,
- struct op_msrs const * const msrs) { }
-static inline void op_amd_start_ibs(void) { }
-static inline void op_amd_stop_ibs(void) { }
-
-#endif
-
static int op_amd_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
@@ -314,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(msrs->controls[i].addr, val);
}
@@ -334,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(msrs->controls[i].addr, val);
}
@@ -355,8 +439,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
}
}
-#ifdef CONFIG_OPROFILE_IBS
-
static u8 ibs_eilvt_off;
static inline void apic_init_ibs_nmi_per_cpu(void *arg)
@@ -405,45 +487,36 @@ static int init_ibs_nmi(void)
return 1;
}
-#ifdef CONFIG_NUMA
- /* Sanity check */
- /* Works only for 64bit with proper numa implementation. */
- if (nodes != num_possible_nodes()) {
- printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
- "found: %d, expected %d",
- nodes, num_possible_nodes());
- return 1;
- }
-#endif
return 0;
}
/* uninitialize the APIC for the IBS interrupts if needed */
static void clear_ibs_nmi(void)
{
- if (has_ibs)
+ if (ibs_caps)
on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
}
/* initialize the APIC for the IBS interrupts if available */
static void ibs_init(void)
{
- has_ibs = boot_cpu_has(X86_FEATURE_IBS);
+ ibs_caps = get_ibs_caps();
- if (!has_ibs)
+ if (!ibs_caps)
return;
if (init_ibs_nmi()) {
- has_ibs = 0;
+ ibs_caps = 0;
return;
}
- printk(KERN_INFO "oprofile: AMD IBS detected\n");
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
+ (unsigned)ibs_caps);
}
static void ibs_exit(void)
{
- if (!has_ibs)
+ if (!ibs_caps)
return;
clear_ibs_nmi();
@@ -463,7 +536,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
if (ret)
return ret;
- if (!has_ibs)
+ if (!ibs_caps)
return ret;
/* model specific files */
@@ -473,7 +546,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
ibs_config.fetch_enabled = 0;
ibs_config.max_cnt_op = 250000;
ibs_config.op_enabled = 0;
- ibs_config.dispatched_ops = 1;
+ ibs_config.dispatched_ops = 0;
dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
oprofilefs_create_ulong(sb, dir, "enable",
@@ -488,8 +561,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
&ibs_config.op_enabled);
oprofilefs_create_ulong(sb, dir, "max_count",
&ibs_config.max_cnt_op);
- oprofilefs_create_ulong(sb, dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ oprofilefs_create_ulong(sb, dir, "dispatched_ops",
+ &ibs_config.dispatched_ops);
return 0;
}
@@ -507,19 +581,6 @@ static void op_amd_exit(void)
ibs_exit();
}
-#else
-
-/* no IBS support */
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- return 0;
-}
-
-static void op_amd_exit(void) {}
-
-#endif /* CONFIG_OPROFILE_IBS */
-
struct op_x86_model_spec op_amd_spec = {
.num_counters = NUM_COUNTERS,
.num_controls = NUM_CONTROLS,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index ac6b354becdf..e6a160a4684a 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
setup_num_counters();
stag = get_stagger();
- /* initialize some registers */
- for (i = 0; i < num_counters; ++i)
- msrs->counters[i].addr = 0;
- for (i = 0; i < num_controls; ++i)
- msrs->controls[i].addr = 0;
-
/* the counter & cccr registers we pay attention to */
for (i = 0; i < num_counters; ++i) {
addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 8eb05878554c..2bf90fafa7b5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
for (i = 0; i < num_counters; i++) {
if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
- else
- msrs->counters[i].addr = 0;
}
for (i = 0; i < num_counters; i++) {
if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
- else
- msrs->controls[i].addr = 0;
}
}
@@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
int i;
if (!reset_value) {
- reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,
+ reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
GFP_ATOMIC);
if (!reset_value)
return;
@@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
/* clear all counters */
for (i = 0; i < num_counters; ++i) {
- if (unlikely(!msrs->controls[i].addr))
+ if (unlikely(!msrs->controls[i].addr)) {
+ if (counter_config[i].enabled && !smp_processor_id())
+ /*
+ * counter is reserved, this is on all
+ * cpus, so report only for cpu #0
+ */
+ op_x86_warn_reserved(i);
continue;
+ }
rdmsrl(msrs->controls[i].addr, val);
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ op_x86_warn_in_use(i);
val &= model->reserved;
wrmsrl(msrs->controls[i].addr, val);
}
@@ -161,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs)
for (i = 0; i < num_counters; ++i) {
if (reset_value[i]) {
rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(msrs->controls[i].addr, val);
}
}
@@ -179,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
if (!reset_value[i])
continue;
rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(msrs->controls[i].addr, val);
}
}
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 7b8e75d16081..ff82a755edd4 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -57,6 +57,26 @@ struct op_x86_model_spec {
struct op_counter_config;
+static inline void op_x86_warn_in_use(int counter)
+{
+ /*
+ * The warning indicates an already running counter. If
+ * oprofile doesn't collect data, then try using a different
+ * performance counter on your platform to monitor the desired
+ * event. Delete counter #%d from the desired event by editing
+ * the /usr/share/oprofile/%s/<cpu>/events file. If the event
+ * cannot be monitored by any other counter, contact your
+ * hardware or BIOS vendor.
+ */
+ pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
+ counter, smp_processor_id());
+}
+
+static inline void op_x86_warn_reserved(int counter)
+{
+ pr_warning("oprofile: counter #%d is already reserved\n", counter);
+}
+
extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
struct op_counter_config *counter_config);
extern int op_x86_phys_to_virt(int phys);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e740ea..b110d97fb925 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -13,5 +13,11 @@ obj-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_MRST) += mrst.o
+
obj-y += common.o early.o
-obj-y += amd_bus.o
+obj-y += amd_bus.o bus_numa.o
+
+ifeq ($(CONFIG_PCI_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1014eb4bfc37..6e22454bfaa6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -7,6 +7,7 @@
#include <asm/pci_x86.h>
struct pci_root_info {
+ struct acpi_device *bridge;
char *name;
unsigned int res_num;
struct resource *res;
@@ -14,6 +15,51 @@ struct pci_root_info {
int busnum;
};
+static bool pci_use_crs = true;
+
+static int __init set_use_crs(const struct dmi_system_id *id)
+{
+ pci_use_crs = true;
+ return 0;
+}
+
+static const struct dmi_system_id pci_use_crs_table[] __initconst = {
+ /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
+ {
+ .callback = set_use_crs,
+ .ident = "IBM System x3800",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
+ },
+ },
+ {}
+};
+
+void __init pci_acpi_crs_quirks(void)
+{
+ int year;
+
+ if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
+ pci_use_crs = false;
+
+ dmi_check_system(pci_use_crs_table);
+
+ /*
+ * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
+ * takes precedence over anything we figured out above.
+ */
+ if (pci_probe & PCI_ROOT_NO_CRS)
+ pci_use_crs = false;
+ else if (pci_probe & PCI_USE__CRS)
+ pci_use_crs = true;
+
+ printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
+ "if necessary, use \"pci=%s\" and report a bug\n",
+ pci_use_crs ? "Using" : "Ignoring",
+ pci_use_crs ? "nocrs" : "use_crs");
+}
+
static acpi_status
resource_to_addr(struct acpi_resource *resource,
struct acpi_resource_address64 *addr)
@@ -44,18 +90,28 @@ count_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
}
-static int
-bus_has_transparent_bridge(struct pci_bus *bus)
+static void
+align_resource(struct acpi_device *bridge, struct resource *res)
{
- struct pci_dev *dev;
+ int align = (res->flags & IORESOURCE_MEM) ? 16 : 4;
- list_for_each_entry(dev, &bus->devices, bus_list) {
- u16 class = dev->class >> 8;
-
- if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
- return true;
+ /*
+ * Host bridge windows are not BARs, but the decoders on the PCI side
+ * that claim this address space have starting alignment and length
+ * constraints, so fix any obvious BIOS goofs.
+ */
+ if (!IS_ALIGNED(res->start, align)) {
+ dev_printk(KERN_DEBUG, &bridge->dev,
+ "host bridge window %pR invalid; "
+ "aligning start to %d-byte boundary\n", res, align);
+ res->start &= ~(align - 1);
+ }
+ if (!IS_ALIGNED(res->end + 1, align)) {
+ dev_printk(KERN_DEBUG, &bridge->dev,
+ "host bridge window %pR invalid; "
+ "aligning end to %d-byte boundary\n", res, align);
+ res->end = ALIGN(res->end, align) - 1;
}
- return false;
}
static acpi_status
@@ -67,12 +123,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
acpi_status status;
unsigned long flags;
struct resource *root;
- int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
u64 start, end;
- if (bus_has_transparent_bridge(info->bus))
- max_root_bus_resources -= 3;
-
status = resource_to_addr(acpi_res, &addr);
if (!ACPI_SUCCESS(status))
return AE_OK;
@@ -90,14 +142,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
start = addr.minimum + addr.translation_offset;
end = start + addr.address_length - 1;
- if (info->res_num >= max_root_bus_resources) {
- printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
- "from %s for %s due to _CRS returning more than "
- "%d resource descriptors\n", (unsigned long) start,
- (unsigned long) end, root->name, info->name,
- max_root_bus_resources);
- return AE_OK;
- }
res = &info->res[info->res_num];
res->name = info->name;
@@ -105,14 +149,28 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
res->start = start;
res->end = end;
res->child = NULL;
+ align_resource(info->bridge, res);
+
+ if (!pci_use_crs) {
+ dev_printk(KERN_DEBUG, &info->bridge->dev,
+ "host bridge window %pR (ignored)\n", res);
+ return AE_OK;
+ }
if (insert_resource(root, res)) {
- printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx "
- "from %s for %s\n", (unsigned long) res->start,
- (unsigned long) res->end, root->name, info->name);
+ dev_err(&info->bridge->dev,
+ "can't allocate host bridge window %pR\n", res);
} else {
- info->bus->resource[info->res_num] = res;
+ pci_bus_add_resource(info->bus, res, 0);
info->res_num++;
+ if (addr.translation_offset)
+ dev_info(&info->bridge->dev, "host bridge window %pR "
+ "(PCI address [%#llx-%#llx])\n",
+ res, res->start - addr.translation_offset,
+ res->end - addr.translation_offset);
+ else
+ dev_info(&info->bridge->dev,
+ "host bridge window %pR\n", res);
}
return AE_OK;
}
@@ -124,6 +182,10 @@ get_current_resources(struct acpi_device *device, int busnum,
struct pci_root_info info;
size_t size;
+ if (pci_use_crs)
+ pci_bus_remove_resources(bus);
+
+ info.bridge = device;
info.bus = bus;
info.res_num = 0;
acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
@@ -163,8 +225,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
#endif
if (domain && !pci_domains_supported) {
- printk(KERN_WARNING "PCI: Multiple domains not supported "
- "(dom %d, bus %d)\n", domain, busnum);
+ printk(KERN_WARNING "pci_bus %04x:%02x: "
+ "ignored (multiple domains not supported)\n",
+ domain, busnum);
return NULL;
}
@@ -188,7 +251,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
*/
sd = kzalloc(sizeof(*sd), GFP_KERNEL);
if (!sd) {
- printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
+ printk(KERN_WARNING "pci_bus %04x:%02x: "
+ "ignored (out of memory)\n", domain, busnum);
return NULL;
}
@@ -209,9 +273,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
} else {
bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
if (bus) {
- if (pci_probe & PCI_USE__CRS)
- get_current_resources(device, busnum, domain,
- bus);
+ get_current_resources(device, busnum, domain, bus);
bus->subordinate = pci_scan_child_bus(bus);
}
}
@@ -236,17 +298,14 @@ int __init pci_acpi_init(void)
{
struct pci_dev *dev = NULL;
- if (pcibios_scanned)
- return 0;
-
if (acpi_noirq)
- return 0;
+ return -ENODEV;
printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
acpi_irq_penalty_init();
- pcibios_scanned++;
pcibios_enable_irq = acpi_pci_irq_enable;
pcibios_disable_irq = acpi_pci_irq_disable;
+ x86_init.pci.init_irq = x86_init_noop;
if (pci_routeirq) {
/*
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 572ee9782f2a..fc1e8fe07e5c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,180 +2,19 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
-#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
-#include <asm/mpspec.h>
-#include <linux/cpumask.h>
-#endif
+
+#include "bus_numa.h"
/*
* This discovers the pcibus <-> node mapping on AMD K8.
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_X86_64
-
-/*
- * sub bus (transparent) will use entres from 3 to store extra from root,
- * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
- */
-#define RES_NUM 16
-struct pci_root_info {
- char name[12];
- unsigned int res_num;
- struct resource res[RES_NUM];
- int bus_min;
- int bus_max;
- int node;
- int link;
-};
-
-/* 4 at this time, it may become to 32 */
-#define PCI_ROOT_NR 4
-static int pci_root_num;
-static struct pci_root_info pci_root_info[PCI_ROOT_NR];
-
-void x86_pci_root_bus_res_quirks(struct pci_bus *b)
-{
- int i;
- int j;
- struct pci_root_info *info;
-
- /* don't go for it if _CRS is used already */
- if (b->resource[0] != &ioport_resource ||
- b->resource[1] != &iomem_resource)
- return;
-
- /* if only one root bus, don't need to anything */
- if (pci_root_num < 2)
- return;
-
- for (i = 0; i < pci_root_num; i++) {
- if (pci_root_info[i].bus_min == b->number)
- break;
- }
-
- if (i == pci_root_num)
- return;
-
- printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
- b->number);
-
- info = &pci_root_info[i];
- for (j = 0; j < info->res_num; j++) {
- struct resource *res;
- struct resource *root;
-
- res = &info->res[j];
- b->resource[j] = res;
- if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
- else
- root = &iomem_resource;
- insert_resource(root, res);
- }
-}
-
-#define RANGE_NUM 16
-
-struct res_range {
- size_t start;
- size_t end;
-};
-
-static void __init update_range(struct res_range *range, size_t start,
- size_t end)
-{
- int i;
- int j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static void __init update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge)
-{
- int i;
- struct resource *res;
-
- if (!merge)
- goto addit;
-
- /* try to merge it with old one */
- for (i = 0; i < info->res_num; i++) {
- size_t final_start, final_end;
- size_t common_start, common_end;
-
- res = &info->res[i];
- if (res->flags != flags)
- continue;
-
- common_start = max((size_t)res->start, start);
- common_end = min((size_t)res->end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min((size_t)res->start, start);
- final_end = max((size_t)res->end, end);
-
- res->start = final_start;
- res->end = final_end;
- return;
- }
-
-addit:
-
- /* need to add that */
- if (info->res_num >= RES_NUM)
- return;
-
- res = &info->res[info->res_num];
- res->name = info->name;
- res->flags = flags;
- res->start = start;
- res->end = end;
- res->child = NULL;
- info->res_num++;
-}
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -218,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
}
+#define RANGE_NUM 16
+
/**
* early_fill_mp_bus_to_node()
* called before pcibios_scan_root and pci_scan_bus
@@ -230,7 +71,6 @@ static int __init early_fill_mp_bus_info(void)
int j;
unsigned bus;
unsigned slot;
- int found;
int node;
int link;
int def_node;
@@ -238,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
struct pci_root_info *info;
u32 reg;
struct resource *res;
- size_t start;
- size_t end;
- struct res_range range[RANGE_NUM];
+ u64 start;
+ u64 end;
+ struct range range[RANGE_NUM];
u64 val;
u32 address;
+ bool found;
if (!early_pci_allowed())
return -1;
- found = 0;
+ found = false;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
u32 id;
u16 device;
@@ -261,7 +102,7 @@ static int __init early_fill_mp_bus_info(void)
device = (id>>16) & 0xffff;
if (pci_probes[i].vendor == vendor &&
pci_probes[i].device == device) {
- found = 1;
+ found = true;
break;
}
}
@@ -304,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
def_link = (reg >> 8) & 0x03;
memset(range, 0, sizeof(range));
- range[0].end = 0xffff;
+ add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
/* io port resource */
for (i = 0; i < 4; i++) {
reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -328,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/* kernel only handle 16 bit only */
if (end > 0xffff)
end = 0xffff;
update_res(info, start, end, IORESOURCE_IO, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end + 1);
}
/* add left over io port range to def node/link, [0, 0xffff] */
/* find the position */
@@ -349,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, range[i].start, range[i].end - 1,
IORESOURCE_IO, 1);
}
}
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = (0xfdULL<<32) - 1;
+ end = cap_resource((0xfdULL<<32) - 1);
+ end++;
+ add_range(range, RANGE_NUM, 0, 0, end);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
if (end < (1ULL<<32))
- update_range(range, 0, end - 1);
+ subtract_range(range, RANGE_NUM, 0, end);
/* get mmconfig */
get_pci_mmcfg_amd_fam10h_range();
/* need to take out mmconf range */
if (fam10h_mmconf_end) {
printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
- update_range(range, fam10h_mmconf_start, fam10h_mmconf_end);
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
+ fam10h_mmconf_end + 1);
}
/* mmio resource */
@@ -401,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/*
* some sick allocation would have range overlap with fam10h
* mmconf range, so need to update start and end.
@@ -426,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
/* we got a hole */
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
- update_range(range, start, endx);
- printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
+ subtract_range(range, RANGE_NUM, start,
+ endx + 1);
+ printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
}
if (changed) {
if (start <= end) {
- printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end);
+ printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
} else {
printk(KERN_CONT "%s\n", endx?"":" ==> none");
continue;
@@ -441,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
}
}
- update_res(info, start, end, IORESOURCE_MEM, 1);
- update_range(range, start, end);
+ update_res(info, cap_resource(start), cap_resource(end),
+ IORESOURCE_MEM, 1);
+ subtract_range(range, RANGE_NUM, start, end + 1);
printk(KERN_CONT "\n");
}
@@ -456,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
- update_range(range, 1ULL<<32, end - 1);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end);
}
/*
@@ -476,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, cap_resource(range[i].start),
+ cap_resource(range[i].end - 1),
IORESOURCE_MEM, 1);
}
}
@@ -488,28 +335,18 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[i];
res_num = info->res_num;
busnum = info->bus_min;
- printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n",
+ printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
info->bus_min, info->bus_max, info->node, info->link);
for (j = 0; j < res_num; j++) {
res = &info->res[j];
- printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n",
- busnum, j,
- (res->flags & IORESOURCE_IO)?"io port":"mmio",
- res->start, res->end);
+ printk(KERN_DEBUG "bus: %02x index %x %pR\n",
+ busnum, j, res);
}
}
return 0;
}
-#else /* !CONFIG_X86_64 */
-
-static int __init early_fill_mp_bus_info(void) { return 0; }
-
-#endif /* !CONFIG_X86_64 */
-
-/* common 32/64 bit code */
-
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
static void enable_pci_io_ecs(void *unused)
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000000..64a122883896
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,101 @@
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/range.h>
+
+#include "bus_numa.h"
+
+int pci_root_num;
+struct pci_root_info pci_root_info[PCI_ROOT_NR];
+
+void x86_pci_root_bus_res_quirks(struct pci_bus *b)
+{
+ int i;
+ int j;
+ struct pci_root_info *info;
+
+ /* don't go for it if _CRS is used already */
+ if (b->resource[0] != &ioport_resource ||
+ b->resource[1] != &iomem_resource)
+ return;
+
+ if (!pci_root_num)
+ return;
+
+ for (i = 0; i < pci_root_num; i++) {
+ if (pci_root_info[i].bus_min == b->number)
+ break;
+ }
+
+ if (i == pci_root_num)
+ return;
+
+ printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
+ b->number);
+
+ pci_bus_remove_resources(b);
+ info = &pci_root_info[i];
+ for (j = 0; j < info->res_num; j++) {
+ struct resource *res;
+ struct resource *root;
+
+ res = &info->res[j];
+ pci_bus_add_resource(b, res, 0);
+ if (res->flags & IORESOURCE_IO)
+ root = &ioport_resource;
+ else
+ root = &iomem_resource;
+ insert_resource(root, res);
+ }
+}
+
+void __devinit update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge)
+{
+ int i;
+ struct resource *res;
+
+ if (start > end)
+ return;
+
+ if (start == MAX_RESOURCE)
+ return;
+
+ if (!merge)
+ goto addit;
+
+ /* try to merge it with old one */
+ for (i = 0; i < info->res_num; i++) {
+ resource_size_t final_start, final_end;
+ resource_size_t common_start, common_end;
+
+ res = &info->res[i];
+ if (res->flags != flags)
+ continue;
+
+ common_start = max(res->start, start);
+ common_end = min(res->end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(res->start, start);
+ final_end = max(res->end, end);
+
+ res->start = final_start;
+ res->end = final_end;
+ return;
+ }
+
+addit:
+
+ /* need to add that */
+ if (info->res_num >= RES_NUM)
+ return;
+
+ res = &info->res[info->res_num];
+ res->name = info->name;
+ res->flags = flags;
+ res->start = start;
+ res->end = end;
+ res->child = NULL;
+ info->res_num++;
+}
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000000..804a4b40c31a
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,25 @@
+#ifndef __BUS_NUMA_H
+#define __BUS_NUMA_H
+/*
+ * sub bus (transparent) will use entres from 3 to store extra from
+ * root, so need to make sure we have enough slot there.
+ */
+#define RES_NUM 16
+struct pci_root_info {
+ char name[12];
+ unsigned int res_num;
+ struct resource res[RES_NUM];
+ int bus_min;
+ int bus_max;
+ int node;
+ int link;
+};
+
+/* 4 at this time, it may become to 32 */
+#define PCI_ROOT_NR 4
+extern int pci_root_num;
+extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
+
+extern void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge);
+#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf26143..294e10cb11e1 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -72,12 +72,6 @@ struct pci_ops pci_root_ops = {
};
/*
- * legacy, numa, and acpi all want to call pcibios_scan_root
- * from their initcalls. This flag prevents that.
- */
-int pcibios_scanned;
-
-/*
* This interrupt-safe spinlock protects all accesses to PCI
* configuration space.
*/
@@ -410,8 +404,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
return bus;
}
-extern u8 pci_cache_line_size;
-
int __init pcibios_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -422,15 +414,19 @@ int __init pcibios_init(void)
}
/*
- * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
- * and P4. It's also good for 386/486s (which actually have 16)
+ * Set PCI cacheline size to that of the CPU if the CPU has reported it.
+ * (For older CPUs that don't support cpuid, we se it to 32 bytes
+ * It's also good for 386/486s (which actually have 16)
* as quite a few PCI devices do not support smaller values.
*/
- pci_cache_line_size = 32 >> 2;
- if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
- pci_cache_line_size = 64 >> 2; /* K7 & K8 */
- else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
- pci_cache_line_size = 128 >> 2; /* P4 */
+ if (c->x86_clflush_size > 0) {
+ pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
+ printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
+ pci_dfl_cache_line_size << 2);
+ } else {
+ pci_dfl_cache_line_size = 32 >> 2;
+ printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
+ }
pcibios_resource_survey();
@@ -518,6 +514,9 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "use_crs")) {
pci_probe |= PCI_USE__CRS;
return NULL;
+ } else if (!strcmp(str, "nocrs")) {
+ pci_probe |= PCI_ROOT_NO_CRS;
+ return NULL;
} else if (!strcmp(str, "earlydump")) {
pci_early_dump_regs = 1;
return NULL;
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index aaf26ae58cd5..d1067d539bee 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
u32 v;
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
v = inl(0xcfc);
- if (v != 0xffffffff)
- pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
return v;
}
@@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
u8 v;
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
v = inb(0xcfc + (offset&3));
- pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
return v;
}
@@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
u16 v;
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
v = inw(0xcfc + (offset&2));
- pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
return v;
}
void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
u32 val)
{
- pr_debug("%x writing to %x: %x\n", slot, offset, val);
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
outl(val, 0xcfc);
}
void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
{
- pr_debug("%x writing to %x: %x\n", slot, offset, val);
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
outb(val, 0xcfc + (offset&3));
}
void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
{
- pr_debug("%x writing to %x: %x\n", slot, offset, val);
outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
outw(val, 0xcfc + (offset&2));
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b22d13b0c71d..dece3eb9c906 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -60,22 +60,20 @@ skip_isa_ioresource_align(struct pci_dev *dev) {
* but we want to try to avoid allocating at 0x2900-0x2bff
* which might have be mirrored at 0x0100-0x03ff..
*/
-void
-pcibios_align_resource(void *data, struct resource *res,
+resource_size_t
+pcibios_align_resource(void *data, const struct resource *res,
resource_size_t size, resource_size_t align)
{
struct pci_dev *dev = data;
+ resource_size_t start = res->start;
if (res->flags & IORESOURCE_IO) {
- resource_size_t start = res->start;
-
if (skip_isa_ioresource_align(dev))
- return;
- if (start & 0x300) {
+ return start;
+ if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
- res->start = start;
- }
}
+ return start;
}
EXPORT_SYMBOL(pcibios_align_resource);
@@ -129,7 +127,9 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
continue;
if (!r->start ||
pci_claim_resource(dev, idx) < 0) {
- dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
+ dev_info(&dev->dev,
+ "can't reserve window %pR\n",
+ r);
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
@@ -144,16 +144,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
}
}
+struct pci_check_idx_range {
+ int start;
+ int end;
+};
+
static void __init pcibios_allocate_resources(int pass)
{
struct pci_dev *dev = NULL;
- int idx, disabled;
+ int idx, disabled, i;
u16 command;
struct resource *r;
+ struct pci_check_idx_range idx_range[] = {
+ { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
+#ifdef CONFIG_PCI_IOV
+ { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
+#endif
+ };
+
for_each_pci_dev(dev) {
pci_read_config_word(dev, PCI_COMMAND, &command);
- for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) {
+ for (i = 0; i < ARRAY_SIZE(idx_range); i++)
+ for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
r = &dev->resource[idx];
if (r->parent) /* Already allocated */
continue;
@@ -164,12 +177,12 @@ static void __init pcibios_allocate_resources(int pass)
else
disabled = !(command & PCI_COMMAND_MEMORY);
if (pass == disabled) {
- dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
- (unsigned long long) r->start,
- (unsigned long long) r->end,
- r->flags, disabled, pass);
+ dev_dbg(&dev->dev,
+ "BAR %d: reserving %pr (d=%d, p=%d)\n",
+ idx, r, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
- dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
+ dev_info(&dev->dev,
+ "can't reserve %pR\n", r);
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;
@@ -182,7 +195,7 @@ static void __init pcibios_allocate_resources(int pass)
/* Turn the ROM off, leave the resource region,
* but keep it unregistered. */
u32 reg;
- dev_dbg(&dev->dev, "disabling ROM\n");
+ dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
r->flags &= ~IORESOURCE_ROM_ENABLE;
pci_read_config_dword(dev,
dev->rom_base_reg, &reg);
@@ -242,10 +255,6 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
-void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
-{
-}
-
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
@@ -282,6 +291,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
return -EINVAL;
prot = pgprot_val(vma->vm_page_prot);
+
+ /*
+ * Return error if pat is not enabled and write_combine is requested.
+ * Caller can followup with UC MINUS request and add a WC mtrr if there
+ * is a free mtrr slot.
+ */
+ if (!pat_enabled && write_combine)
+ return -EINVAL;
+
if (pat_enabled && write_combine)
prot |= _PAGE_CACHE_WC;
else if (pat_enabled || boot_cpu_data.x86 > 3)
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8efed4a..adb62aaa7ecd 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,7 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <asm/pci_x86.h>
+#include <asm/x86_init.h>
/* arch_initcall has too random ordering, so call the initializers
in the right sequence from here. */
@@ -15,10 +16,9 @@ static __init int pci_arch_init(void)
if (!(pci_probe & PCI_PROBE_NOEARLY))
pci_mmcfg_early_init();
-#ifdef CONFIG_PCI_OLPC
- if (!pci_olpc_init())
- return 0; /* skip additional checks if it's an XO */
-#endif
+ if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
+ return 0;
+
#ifdef CONFIG_PCI_BIOS
pci_pcbios_init();
#endif
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0696d506c4ad..8b107521d24e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -53,7 +53,7 @@ struct irq_router_handler {
int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
};
-int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
+int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
/*
@@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
+ case PCI_DEVICE_ID_INTEL_CPT_LPC1:
+ case PCI_DEVICE_ID_INTEL_CPT_LPC2:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
@@ -1016,7 +1018,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 1;
}
-static void __init pcibios_fixup_irqs(void)
+void __init pcibios_fixup_irqs(void)
{
struct pci_dev *dev = NULL;
u8 pin;
@@ -1110,12 +1112,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
{ }
};
-int __init pcibios_irq_init(void)
+void __init pcibios_irq_init(void)
{
DBG(KERN_DEBUG "PCI: IRQ init\n");
- if (pcibios_enable_irq || raw_pci_ops == NULL)
- return 0;
+ if (raw_pci_ops == NULL)
+ return;
dmi_check_system(pciirq_dmi_table);
@@ -1142,9 +1144,7 @@ int __init pcibios_irq_init(void)
pirq_table = NULL;
}
- pcibios_enable_irq = pirq_enable_irq;
-
- pcibios_fixup_irqs();
+ x86_init.pci.fixup_irqs();
if (io_apic_assign_pci_irqs && pci_routeirq) {
struct pci_dev *dev = NULL;
@@ -1157,8 +1157,6 @@ int __init pcibios_irq_init(void)
for_each_pci_dev(dev)
pirq_enable_irq(dev);
}
-
- return 0;
}
static void pirq_penalize_isa_irq(int irq, int active)
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4061bb0f267d..0db5eaf54560 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void)
}
}
-static int __init pci_legacy_init(void)
+int __init pci_legacy_init(void)
{
if (!raw_pci_ops) {
printk("PCI: System does not support PCI\n");
return 0;
}
- if (pcibios_scanned++)
- return 0;
-
printk("PCI: Probing PCI hardware\n");
pci_root_bus = pcibios_scan_root(0);
if (pci_root_bus)
@@ -55,18 +52,15 @@ static int __init pci_legacy_init(void)
int __init pci_subsys_init(void)
{
-#ifdef CONFIG_X86_NUMAQ
- pci_numaq_init();
-#endif
-#ifdef CONFIG_ACPI
- pci_acpi_init();
-#endif
-#ifdef CONFIG_X86_VISWS
- pci_visws_init();
-#endif
- pci_legacy_init();
+ /*
+ * The init function returns an non zero value when
+ * pci_legacy_init should be invoked.
+ */
+ if (x86_init.pci.init())
+ pci_legacy_init();
+
pcibios_fixup_peer_bridges();
- pcibios_irq_init();
+ x86_init.pci.init_irq();
pcibios_init();
return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 602c172d3bd5..8f3f9a50b1e0 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,48 +15,98 @@
#include <linux/acpi.h>
#include <linux/sfi_acpi.h>
#include <linux/bitmap.h>
-#include <linux/sort.h>
+#include <linux/dmi.h>
#include <asm/e820.h>
#include <asm/pci_x86.h>
#include <asm/acpi.h>
#define PREFIX "PCI: "
-/* aperture is up to 256MB but BIOS may reserve less */
-#define MMCONFIG_APER_MIN (2 * 1024*1024)
-#define MMCONFIG_APER_MAX (256 * 1024*1024)
-
/* Indicate if the mmcfg resources have been placed into the resource table. */
static int __initdata pci_mmcfg_resources_inserted;
-static __init int extend_mmcfg(int num)
+LIST_HEAD(pci_mmcfg_list);
+
+static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
{
- struct acpi_mcfg_allocation *new;
- int new_num = pci_mmcfg_config_num + num;
+ if (cfg->res.parent)
+ release_resource(&cfg->res);
+ list_del(&cfg->list);
+ kfree(cfg);
+}
- new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL);
- if (!new)
- return -1;
+static __init void free_all_mmcfg(void)
+{
+ struct pci_mmcfg_region *cfg, *tmp;
+
+ pci_mmcfg_arch_free();
+ list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
+ pci_mmconfig_remove(cfg);
+}
- if (pci_mmcfg_config) {
- memcpy(new, pci_mmcfg_config,
- sizeof(pci_mmcfg_config[0]) * new_num);
- kfree(pci_mmcfg_config);
+static __init void list_add_sorted(struct pci_mmcfg_region *new)
+{
+ struct pci_mmcfg_region *cfg;
+
+ /* keep list sorted by segment and starting bus number */
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->segment > new->segment ||
+ (cfg->segment == new->segment &&
+ cfg->start_bus >= new->start_bus)) {
+ list_add_tail(&new->list, &cfg->list);
+ return;
+ }
}
- pci_mmcfg_config = new;
+ list_add_tail(&new->list, &pci_mmcfg_list);
+}
- return 0;
+static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
+ int end, u64 addr)
+{
+ struct pci_mmcfg_region *new;
+ int num_buses;
+ struct resource *res;
+
+ if (addr == 0)
+ return NULL;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->address = addr;
+ new->segment = segment;
+ new->start_bus = start;
+ new->end_bus = end;
+
+ list_add_sorted(new);
+
+ num_buses = end - start + 1;
+ res = &new->res;
+ res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
+ res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+ "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
+ res->name = new->name;
+
+ printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
+ "%pR (base %#lx)\n", segment, start, end, &new->res,
+ (unsigned long) addr);
+
+ return new;
}
-static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end)
+struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
{
- int i = pci_mmcfg_config_num;
+ struct pci_mmcfg_region *cfg;
- pci_mmcfg_config_num++;
- pci_mmcfg_config[i].address = addr;
- pci_mmcfg_config[i].pci_segment = segment;
- pci_mmcfg_config[i].start_bus_number = start;
- pci_mmcfg_config[i].end_bus_number = end;
+ list_for_each_entry(cfg, &pci_mmcfg_list, list)
+ if (cfg->segment == segment &&
+ cfg->start_bus <= bus && bus <= cfg->end_bus)
+ return cfg;
+
+ return NULL;
}
static const char __init *pci_mmcfg_e7520(void)
@@ -68,11 +118,9 @@ static const char __init *pci_mmcfg_e7520(void)
if (win == 0x0000 || win == 0xf000)
return NULL;
- if (extend_mmcfg(1) == -1)
+ if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
return NULL;
- fill_one_mmcfg(win << 16, 0, 0, 255);
-
return "Intel Corporation E7520 Memory Controller Hub";
}
@@ -114,11 +162,9 @@ static const char __init *pci_mmcfg_intel_945(void)
if ((pciexbar & mask) >= 0xf0000000U)
return NULL;
- if (extend_mmcfg(1) == -1)
+ if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
return NULL;
- fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
-
return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
}
@@ -127,7 +173,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
u32 low, high, address;
u64 base, msr;
int i;
- unsigned segnbits = 0, busnbits;
+ unsigned segnbits = 0, busnbits, end_bus;
if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
return NULL;
@@ -161,11 +207,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
busnbits = 8;
}
- if (extend_mmcfg(1 << segnbits) == -1)
- return NULL;
-
+ end_bus = (1 << busnbits) - 1;
for (i = 0; i < (1 << segnbits); i++)
- fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1);
+ if (pci_mmconfig_add(i, 0, end_bus,
+ base + (1<<28) * i) == NULL) {
+ free_all_mmcfg();
+ return NULL;
+ }
return "AMD Family 10h NB";
}
@@ -190,7 +238,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
/*
* do check if amd fam10h already took over
*/
- if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked)
+ if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
return NULL;
mcp55_checked = true;
@@ -213,16 +261,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
if (!(extcfg & extcfg_enable_mask))
continue;
- if (extend_mmcfg(1) == -1)
- continue;
-
size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
base = extcfg & extcfg_base_mask[size_index];
/* base could > 4G */
base <<= extcfg_base_lshift;
start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
end = start + extcfg_sizebus[size_index] - 1;
- fill_one_mmcfg(base, 0, start, end);
+ if (pci_mmconfig_add(0, start, end, base) == NULL)
+ continue;
mcp55_mmconf_found++;
}
@@ -253,45 +299,22 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
0x0369, pci_mmcfg_nvidia_mcp55 },
};
-static int __init cmp_mmcfg(const void *x1, const void *x2)
-{
- const typeof(pci_mmcfg_config[0]) *m1 = x1;
- const typeof(pci_mmcfg_config[0]) *m2 = x2;
- int start1, start2;
-
- start1 = m1->start_bus_number;
- start2 = m2->start_bus_number;
-
- return start1 - start2;
-}
-
static void __init pci_mmcfg_check_end_bus_number(void)
{
- int i;
- typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
-
- /* sort them at first */
- sort(pci_mmcfg_config, pci_mmcfg_config_num,
- sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
-
- /* last one*/
- if (pci_mmcfg_config_num > 0) {
- i = pci_mmcfg_config_num - 1;
- cfg = &pci_mmcfg_config[i];
- if (cfg->end_bus_number < cfg->start_bus_number)
- cfg->end_bus_number = 255;
- }
+ struct pci_mmcfg_region *cfg, *cfgx;
- /* don't overlap please */
- for (i = 0; i < pci_mmcfg_config_num - 1; i++) {
- cfg = &pci_mmcfg_config[i];
- cfgx = &pci_mmcfg_config[i+1];
+ /* Fixup overlaps */
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->end_bus < cfg->start_bus)
+ cfg->end_bus = 255;
- if (cfg->end_bus_number < cfg->start_bus_number)
- cfg->end_bus_number = 255;
+ /* Don't access the list head ! */
+ if (cfg->list.next == &pci_mmcfg_list)
+ break;
- if (cfg->end_bus_number >= cfgx->start_bus_number)
- cfg->end_bus_number = cfgx->start_bus_number - 1;
+ cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
+ if (cfg->end_bus >= cfgx->start_bus)
+ cfg->end_bus = cfgx->start_bus - 1;
}
}
@@ -306,8 +329,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
if (!raw_pci_ops)
return 0;
- pci_mmcfg_config_num = 0;
- pci_mmcfg_config = NULL;
+ free_all_mmcfg();
for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
bus = pci_mmcfg_probes[i].bus;
@@ -322,45 +344,22 @@ static int __init pci_mmcfg_check_hostbridge(void)
name = pci_mmcfg_probes[i].probe();
if (name)
- printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n",
+ printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
name);
}
/* some end_bus_number is crazy, fix it */
pci_mmcfg_check_end_bus_number();
- return pci_mmcfg_config_num != 0;
+ return !list_empty(&pci_mmcfg_list);
}
static void __init pci_mmcfg_insert_resources(void)
{
-#define PCI_MMCFG_RESOURCE_NAME_LEN 24
- int i;
- struct resource *res;
- char *names;
- unsigned num_buses;
-
- res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
- pci_mmcfg_config_num, GFP_KERNEL);
- if (!res) {
- printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
- return;
- }
+ struct pci_mmcfg_region *cfg;
- names = (void *)&res[pci_mmcfg_config_num];
- for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
- struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
- num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
- res->name = names;
- snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
- "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
- cfg->start_bus_number, cfg->end_bus_number);
- res->start = cfg->address + (cfg->start_bus_number << 20);
- res->end = res->start + (num_buses << 20) - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
- names += PCI_MMCFG_RESOURCE_NAME_LEN;
- }
+ list_for_each_entry(cfg, &pci_mmcfg_list, list)
+ insert_resource(&iomem_resource, &cfg->res);
/* Mark that the resources have been inserted. */
pci_mmcfg_resources_inserted = 1;
@@ -437,11 +436,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
static int __init is_mmconf_reserved(check_reserved_t is_reserved,
- u64 addr, u64 size, int i,
- typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
+ struct pci_mmcfg_region *cfg, int with_e820)
{
+ u64 addr = cfg->res.start;
+ u64 size = resource_size(&cfg->res);
u64 old_size = size;
- int valid = 0;
+ int valid = 0, num_buses;
while (!is_reserved(addr, addr + size, E820_RESERVED)) {
size >>= 1;
@@ -450,19 +450,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
}
if (size >= (16UL<<20) || size == old_size) {
- printk(KERN_NOTICE
- "PCI: MCFG area at %Lx reserved in %s\n",
- addr, with_e820?"E820":"ACPI motherboard resources");
+ printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
+ &cfg->res,
+ with_e820 ? "E820" : "ACPI motherboard resources");
valid = 1;
if (old_size != size) {
- /* update end_bus_number */
- cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1);
- printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx "
- "segment %hu buses %u - %u\n",
- i, (unsigned long)cfg->address, cfg->pci_segment,
- (unsigned int)cfg->start_bus_number,
- (unsigned int)cfg->end_bus_number);
+ /* update end_bus */
+ cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
+ num_buses = cfg->end_bus - cfg->start_bus + 1;
+ cfg->res.end = cfg->res.start +
+ PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
+ snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+ "PCI MMCONFIG %04x [bus %02x-%02x]",
+ cfg->segment, cfg->start_bus, cfg->end_bus);
+ printk(KERN_INFO PREFIX
+ "MMCONFIG for %04x [bus%02x-%02x] "
+ "at %pR (base %#lx) (size reduced!)\n",
+ cfg->segment, cfg->start_bus, cfg->end_bus,
+ &cfg->res, (unsigned long) cfg->address);
}
}
@@ -471,45 +477,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
static void __init pci_mmcfg_reject_broken(int early)
{
- typeof(pci_mmcfg_config[0]) *cfg;
- int i;
+ struct pci_mmcfg_region *cfg;
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
- return;
-
- for (i = 0; i < pci_mmcfg_config_num; i++) {
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
int valid = 0;
- u64 addr, size;
-
- cfg = &pci_mmcfg_config[i];
- addr = cfg->start_bus_number;
- addr <<= 20;
- addr += cfg->address;
- size = cfg->end_bus_number + 1 - cfg->start_bus_number;
- size <<= 20;
- printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
- "segment %hu buses %u - %u\n",
- i, (unsigned long)cfg->address, cfg->pci_segment,
- (unsigned int)cfg->start_bus_number,
- (unsigned int)cfg->end_bus_number);
if (!early && !acpi_disabled)
- valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
+ valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
if (valid)
continue;
if (!early)
- printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
- " reserved in ACPI motherboard resources\n",
- cfg->address);
+ printk(KERN_ERR FW_BUG PREFIX
+ "MMCONFIG at %pR not reserved in "
+ "ACPI motherboard resources\n", &cfg->res);
/* Don't try to do this check unless configuration
type 1 is available. how about type 2 ?*/
if (raw_pci_ops)
- valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1);
+ valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
if (!valid)
goto reject;
@@ -518,34 +505,41 @@ static void __init pci_mmcfg_reject_broken(int early)
return;
reject:
- printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
- pci_mmcfg_arch_free();
- kfree(pci_mmcfg_config);
- pci_mmcfg_config = NULL;
- pci_mmcfg_config_num = 0;
+ printk(KERN_INFO PREFIX "not using MMCONFIG\n");
+ free_all_mmcfg();
}
static int __initdata known_bridge;
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
+ struct acpi_mcfg_allocation *cfg)
+{
+ int year;
-/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
-struct acpi_mcfg_allocation *pci_mmcfg_config;
-int pci_mmcfg_config_num;
+ if (cfg->address < 0xFFFFFFFF)
+ return 0;
-static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
-{
if (!strcmp(mcfg->header.oem_id, "SGI"))
- acpi_mcfg_64bit_base_addr = TRUE;
+ return 0;
- return 0;
+ if (mcfg->header.revision >= 1) {
+ if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
+ year >= 2010)
+ return 0;
+ }
+
+ printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
+ "is above 4GB, ignored\n", cfg->pci_segment,
+ cfg->start_bus_number, cfg->end_bus_number, cfg->address);
+ return -EINVAL;
}
static int __init pci_parse_mcfg(struct acpi_table_header *header)
{
struct acpi_table_mcfg *mcfg;
+ struct acpi_mcfg_allocation *cfg_table, *cfg;
unsigned long i;
- int config_size;
+ int entries;
if (!header)
return -EINVAL;
@@ -553,38 +547,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
mcfg = (struct acpi_table_mcfg *)header;
/* how many config structures do we have */
- pci_mmcfg_config_num = 0;
+ free_all_mmcfg();
+ entries = 0;
i = header->length - sizeof(struct acpi_table_mcfg);
while (i >= sizeof(struct acpi_mcfg_allocation)) {
- ++pci_mmcfg_config_num;
+ entries++;
i -= sizeof(struct acpi_mcfg_allocation);
};
- if (pci_mmcfg_config_num == 0) {
+ if (entries == 0) {
printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
return -ENODEV;
}
- config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
- pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
- if (!pci_mmcfg_config) {
- printk(KERN_WARNING PREFIX
- "No memory for MCFG config tables\n");
- return -ENOMEM;
- }
-
- memcpy(pci_mmcfg_config, &mcfg[1], config_size);
-
- acpi_mcfg_oem_check(mcfg);
-
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
- !acpi_mcfg_64bit_base_addr) {
- printk(KERN_ERR PREFIX
- "MMCONFIG not in low 4GB of memory\n");
- kfree(pci_mmcfg_config);
- pci_mmcfg_config_num = 0;
+ cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
+ for (i = 0; i < entries; i++) {
+ cfg = &cfg_table[i];
+ if (acpi_mcfg_check_entry(mcfg, cfg)) {
+ free_all_mmcfg();
return -ENODEV;
}
+
+ if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
+ cfg->end_bus_number, cfg->address) == NULL) {
+ printk(KERN_WARNING PREFIX
+ "no memory for MCFG entries\n");
+ free_all_mmcfg();
+ return -ENOMEM;
+ }
}
return 0;
@@ -614,9 +603,7 @@ static void __init __pci_mmcfg_init(int early)
pci_mmcfg_reject_broken(early);
- if ((pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
+ if (list_empty(&pci_mmcfg_list))
return;
if (pci_mmcfg_arch_init())
@@ -648,9 +635,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
*/
if ((pci_mmcfg_resources_inserted == 1) ||
(pci_probe & PCI_PROBE_MMCONF) == 0 ||
- (pci_mmcfg_config_num == 0) ||
- (pci_mmcfg_config == NULL) ||
- (pci_mmcfg_config[0].address == 0))
+ list_empty(&pci_mmcfg_list))
return 1;
/*
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f10a7e94a84c..90d5fd476ed4 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu;
*/
static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
{
- struct acpi_mcfg_allocation *cfg;
- int cfg_num;
-
- for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
- cfg = &pci_mmcfg_config[cfg_num];
- if (cfg->pci_segment == seg &&
- (cfg->start_bus_number <= bus) &&
- (cfg->end_bus_number >= bus))
- return cfg->address;
- }
+ struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
- /* Fall back to type 0 */
+ if (cfg)
+ return cfg->address;
return 0;
}
@@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
*/
static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
{
- u32 dev_base = base | (bus << 20) | (devfn << 12);
+ u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
int cpu = smp_processor_id();
if (dev_base != mmcfg_last_accessed_device ||
cpu != mmcfg_last_accessed_cpu) {
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 94349f8b2f96..e783841bd1d7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -12,38 +12,15 @@
#include <asm/e820.h>
#include <asm/pci_x86.h>
-/* Static virtual mapping of the MMCONFIG aperture */
-struct mmcfg_virt {
- struct acpi_mcfg_allocation *cfg;
- char __iomem *virt;
-};
-static struct mmcfg_virt *pci_mmcfg_virt;
-
-static char __iomem *get_virt(unsigned int seg, unsigned bus)
-{
- struct acpi_mcfg_allocation *cfg;
- int cfg_num;
-
- for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
- cfg = pci_mmcfg_virt[cfg_num].cfg;
- if (cfg->pci_segment == seg &&
- (cfg->start_bus_number <= bus) &&
- (cfg->end_bus_number >= bus))
- return pci_mmcfg_virt[cfg_num].virt;
- }
-
- /* Fall back to type 0 */
- return NULL;
-}
+#define PREFIX "PCI: "
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
- char __iomem *addr;
+ struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
- addr = get_virt(seg, bus);
- if (!addr)
- return NULL;
- return addr + ((bus << 20) | (devfn << 12));
+ if (cfg && cfg->virt)
+ return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
+ return NULL;
}
static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
@@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = {
.write = pci_mmcfg_write,
};
-static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
+static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
{
void __iomem *addr;
u64 start, size;
+ int num_buses;
- start = cfg->start_bus_number;
- start <<= 20;
- start += cfg->address;
- size = cfg->end_bus_number + 1 - cfg->start_bus_number;
- size <<= 20;
+ start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
+ num_buses = cfg->end_bus - cfg->start_bus + 1;
+ size = PCI_MMCFG_BUS_OFFSET(num_buses);
addr = ioremap_nocache(start, size);
- if (addr) {
- printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
- start, start + size - 1);
- addr -= cfg->start_bus_number << 20;
- }
+ if (addr)
+ addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
return addr;
}
int __init pci_mmcfg_arch_init(void)
{
- int i;
- pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) *
- pci_mmcfg_config_num, GFP_KERNEL);
- if (pci_mmcfg_virt == NULL) {
- printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
- return 0;
- }
+ struct pci_mmcfg_region *cfg;
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
- pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]);
- if (!pci_mmcfg_virt[i].virt) {
- printk(KERN_ERR "PCI: Cannot map mmconfig aperture for "
- "segment %d\n",
- pci_mmcfg_config[i].pci_segment);
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ cfg->virt = mcfg_ioremap(cfg);
+ if (!cfg->virt) {
+ printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
+ &cfg->res);
pci_mmcfg_arch_free();
return 0;
}
@@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void)
void __init pci_mmcfg_arch_free(void)
{
- int i;
-
- if (pci_mmcfg_virt == NULL)
- return;
+ struct pci_mmcfg_region *cfg;
- for (i = 0; i < pci_mmcfg_config_num; ++i) {
- if (pci_mmcfg_virt[i].virt) {
- iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20));
- pci_mmcfg_virt[i].virt = NULL;
- pci_mmcfg_virt[i].cfg = NULL;
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->virt) {
+ iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
+ cfg->virt = NULL;
}
}
-
- kfree(pci_mmcfg_virt);
- pci_mmcfg_virt = NULL;
}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
new file mode 100644
index 000000000000..8bf2fcb88d04
--- /dev/null
+++ b/arch/x86/pci/mrst.c
@@ -0,0 +1,262 @@
+/*
+ * Moorestown PCI support
+ * Copyright (c) 2008 Intel Corporation
+ * Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Moorestown has an interesting PCI implementation:
+ * - configuration space is memory mapped (as defined by MCFG)
+ * - Lincroft devices also have a real, type 1 configuration space
+ * - Early Lincroft silicon has a type 1 access bug that will cause
+ * a hang if non-existent devices are accessed
+ * - some devices have the "fixed BAR" capability, which means
+ * they can't be relocated or modified; check for that during
+ * BAR sizing
+ *
+ * So, we use the MCFG space for all reads and writes, but also send
+ * Lincroft writes to type 1 space. But only read/write if the device
+ * actually exists, otherwise return all 1s for reads and bit bucket
+ * the writes.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+
+#include <asm/acpi.h>
+#include <asm/segment.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/pci_x86.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+
+#define PCIE_CAP_OFFSET 0x100
+
+/* Fixed BAR fields */
+#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
+#define PCI_FIXED_BAR_0_SIZE 0x04
+#define PCI_FIXED_BAR_1_SIZE 0x08
+#define PCI_FIXED_BAR_2_SIZE 0x0c
+#define PCI_FIXED_BAR_3_SIZE 0x10
+#define PCI_FIXED_BAR_4_SIZE 0x14
+#define PCI_FIXED_BAR_5_SIZE 0x1c
+
+/**
+ * fixed_bar_cap - return the offset of the fixed BAR cap if found
+ * @bus: PCI bus
+ * @devfn: device in question
+ *
+ * Look for the fixed BAR cap on @bus and @devfn, returning its offset
+ * if found or 0 otherwise.
+ */
+static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
+{
+ int pos;
+ u32 pcie_cap = 0, cap_data;
+
+ pos = PCIE_CAP_OFFSET;
+
+ if (!raw_pci_ext_ops)
+ return 0;
+
+ while (pos) {
+ if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, pos, 4, &pcie_cap))
+ return 0;
+
+ if (pcie_cap == 0xffffffff)
+ return 0;
+
+ if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
+ raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, pos + 4, 4, &cap_data);
+ if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
+ return pos;
+ }
+
+ pos = pcie_cap >> 20;
+ }
+
+ return 0;
+}
+
+static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 val, int offset)
+{
+ u32 size;
+ unsigned int domain, busnum;
+ int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
+
+ domain = pci_domain_nr(bus);
+ busnum = bus->number;
+
+ if (val == ~0 && len == 4) {
+ unsigned long decode;
+
+ raw_pci_ext_ops->read(domain, busnum, devfn,
+ offset + 8 + (bar * 4), 4, &size);
+
+ /* Turn the size into a decode pattern for the sizing code */
+ if (size) {
+ decode = size - 1;
+ decode |= decode >> 1;
+ decode |= decode >> 2;
+ decode |= decode >> 4;
+ decode |= decode >> 8;
+ decode |= decode >> 16;
+ decode++;
+ decode = ~(decode - 1);
+ } else {
+ decode = ~0;
+ }
+
+ /*
+ * If val is all ones, the core code is trying to size the reg,
+ * so update the mmconfig space with the real size.
+ *
+ * Note: this assumes the fixed size we got is a power of two.
+ */
+ return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
+ decode);
+ }
+
+ /* This is some other kind of BAR write, so just do it. */
+ return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
+}
+
+/**
+ * type1_access_ok - check whether to use type 1
+ * @bus: bus number
+ * @devfn: device & function in question
+ *
+ * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
+ * all, the we can go ahead with any reads & writes. If it's on a Lincroft,
+ * but doesn't exist, avoid the access altogether to keep the chip from
+ * hanging.
+ */
+static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
+{
+ /* This is a workaround for A0 LNC bug where PCI status register does
+ * not have new CAP bit set. can not be written by SW either.
+ *
+ * PCI header type in real LNC indicates a single function device, this
+ * will prevent probing other devices under the same function in PCI
+ * shim. Therefore, use the header type in shim instead.
+ */
+ if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
+ return 0;
+ if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
+ return 1;
+ return 0; /* langwell on others */
+}
+
+static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
+ int size, u32 *value)
+{
+ if (type1_access_ok(bus->number, devfn, where))
+ return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+ return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
+ int size, u32 value)
+{
+ int offset;
+
+ /* On MRST, there is no PCI ROM BAR, this will cause a subsequent read
+ * to ROM BAR return 0 then being ignored.
+ */
+ if (where == PCI_ROM_ADDRESS)
+ return 0;
+
+ /*
+ * Devices with fixed BARs need special handling:
+ * - BAR sizing code will save, write ~0, read size, restore
+ * - so writes to fixed BARs need special handling
+ * - other writes to fixed BAR devices should go through mmconfig
+ */
+ offset = fixed_bar_cap(bus, devfn);
+ if (offset &&
+ (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
+ return pci_device_update_fixed(bus, devfn, where, size, value,
+ offset);
+ }
+
+ /*
+ * On Moorestown update both real & mmconfig space
+ * Note: early Lincroft silicon can't handle type 1 accesses to
+ * non-existent devices, so just eat the write in that case.
+ */
+ if (type1_access_ok(bus->number, devfn, where))
+ return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+ return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
+ where, size, value);
+}
+
+static int mrst_pci_irq_enable(struct pci_dev *dev)
+{
+ u8 pin;
+ struct io_apic_irq_attr irq_attr;
+
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+
+ /* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
+ * IOAPIC RTE entries, so we just enable RTE for the device.
+ */
+ irq_attr.ioapic = mp_find_ioapic(dev->irq);
+ irq_attr.ioapic_pin = dev->irq;
+ irq_attr.trigger = 1; /* level */
+ irq_attr.polarity = 1; /* active low */
+ io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
+
+ return 0;
+}
+
+struct pci_ops pci_mrst_ops = {
+ .read = pci_read,
+ .write = pci_write,
+};
+
+/**
+ * pci_mrst_init - installs pci_mrst_ops
+ *
+ * Moorestown has an interesting PCI implementation (see above).
+ * Called when the early platform detection installs it.
+ */
+int __init pci_mrst_init(void)
+{
+ printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
+ pci_mmcfg_late_init();
+ pcibios_enable_irq = mrst_pci_irq_enable;
+ pci_root_ops = pci_mrst_ops;
+ /* Continue with standard init */
+ return 1;
+}
+
+/*
+ * Langwell devices reside at fixed offsets, don't try to move them.
+ */
+static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
+{
+ unsigned long offset;
+ u32 size;
+ int i;
+
+ /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
+ offset = fixed_bar_cap(dev->bus, dev->devfn);
+ if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
+ PCI_DEVFN(2, 2) == dev->devfn)
+ return;
+
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
+ dev->resource[i].end = dev->resource[i].start + size - 1;
+ dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8eb295e116f6..8223738ad806 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -8,9 +8,7 @@
#include <asm/apic.h>
#include <asm/mpspec.h>
#include <asm/pci_x86.h>
-
-#define XQUAD_PORTIO_BASE 0xfe400000
-#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+#include <asm/numaq.h>
#define BUS2QUAD(global) (mp_bus_id_to_node[global])
@@ -18,8 +16,6 @@
#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
-#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
-
#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
(0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
@@ -152,14 +148,8 @@ int __init pci_numaq_init(void)
{
int quad;
- if (!found_numaq)
- return 0;
-
raw_pci_ops = &pci_direct_conf1_mq;
- if (pcibios_scanned++)
- return 0;
-
pci_root_bus = pcibios_scan_root(0);
if (pci_root_bus)
pci_bus_add_devices(pci_root_bus);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b889d824f7c6..b34815408f58 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = {
int __init pci_olpc_init(void)
{
- if (!machine_is_olpc() || olpc_has_vsa())
- return -ENODEV;
-
printk(KERN_INFO "PCI: Using configuration type OLPC\n");
raw_pci_ops = &pci_olpc_conf;
is_lx = is_geode_lx();
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index bcead7a46871..03008f72eb04 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
int __init pci_visws_init(void)
{
- if (!is_visws_box())
- return -1;
-
pcibios_enable_irq = &pci_visws_enable_irq;
pcibios_disable_irq = &pci_visws_disable_irq;
@@ -90,5 +87,6 @@ int __init pci_visws_init(void)
pci_scan_bus_with_sysdata(pci_bus1);
pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
pcibios_resource_survey();
- return 0;
+ /* Request bus scan */
+ return 1;
}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
#include <asm/mce.h>
#include <asm/xcr.h>
#include <asm/suspend.h>
+#include <asm/debugreg.h>
#ifdef CONFIG_X86_32
static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
#endif
load_TR_desc(); /* This does ltr */
load_LDT(&current->active_mm->context); /* This does lldt */
-
- /*
- * Now maybe reload the debug registers
- */
- if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
- set_debugreg(current->thread.debugreg0, 0);
- set_debugreg(current->thread.debugreg1, 1);
- set_debugreg(current->thread.debugreg2, 2);
- set_debugreg(current->thread.debugreg3, 3);
- /* no 4 and 5 */
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(current->thread.debugreg7, 7);
-#else
- /* CONFIG_X86_64 */
- loaddebug(&current->thread, 0);
- loaddebug(&current->thread, 1);
- loaddebug(&current->thread, 2);
- loaddebug(&current->thread, 3);
- /* no 4 and 5 */
- loaddebug(&current->thread, 6);
- loaddebug(&current->thread, 7);
-#endif
- }
-
}
/**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000000..f82082677337
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
+PHONY += posttest
+
+ifeq ($(KBUILD_VERBOSE),1)
+ posttest_verbose = -v
+else
+ posttest_verbose =
+endif
+
+ifeq ($(CONFIG_64BIT),y)
+ posttest_64bit = -y
+else
+ posttest_64bit = -n
+endif
+
+distill_awk = $(srctree)/arch/x86/tools/distill.awk
+chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
+
+quiet_cmd_posttest = TEST $@
+ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
+
+posttest: $(obj)/test_get_len vmlinux
+ $(call cmd,posttest)
+
+hostprogs-y := test_get_len
+
+# -I needed for generated C source and C source which in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000000..fd1ab80be0de
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,33 @@
+# GNU objdump version checker
+#
+# Usage:
+# objdump -v | awk -f chkobjdump.awk
+BEGIN {
+ # objdump version 2.19 or later is OK for the test.
+ od_ver = 2;
+ od_sver = 19;
+}
+
+/^GNU objdump/ {
+ verstr = ""
+ for (i = 3; i <= NF; i++)
+ if (match($(i), "^[0-9]")) {
+ verstr = $(i);
+ break;
+ }
+ if (verstr == "") {
+ printf("Warning: Failed to find objdump version number.\n");
+ exit 0;
+ }
+ split(verstr, ver, ".");
+ if (ver[1] > od_ver ||
+ (ver[1] == od_ver && ver[2] >= od_sver)) {
+ exit 1;
+ } else {
+ printf("Warning: objdump version %s is older than %d.%d\n",
+ verstr, od_ver, od_sver);
+ print("Warning: Skipping posttest.");
+ # Logic is inverted, because we just skip test without error.
+ exit 0;
+ }
+}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 000000000000..c13c0ee48ab4
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Remove bad(or prefix only) instructions
+
+BEGIN {
+ prev_addr = ""
+ prev_hex = ""
+ prev_mnemonic = ""
+ bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b "
+ fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+ <[^>]*>:/ {
+ # Symbol entry
+ printf("%s%s\n", $2, $1)
+}
+
+/^ *[0-9a-f]+:/ {
+ if (split($0, field, "\t") < 3) {
+ # This is a continuation of the same insn.
+ prev_hex = prev_hex field[2]
+ } else {
+ # Skip bad instructions
+ if (match(prev_mnemonic, bad_expr))
+ prev_addr = ""
+ # Split fwait from other f* instructions
+ if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+ printf "%s\t%s\n", prev_addr, fwait_str
+ sub(fwait_expr, "", prev_hex)
+ }
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+ prev_addr = field[1]
+ prev_hex = field[2]
+ prev_mnemonic = field[3]
+ }
+}
+
+END {
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000000..eaf11f52fc0b
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,378 @@
+#!/bin/awk -f
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Awk implementation sanity check
+function check_awk_implement() {
+ if (sprintf("%x", 0) != "0")
+ return "Your awk has a printf-format problem."
+ return ""
+}
+
+# Clear working vars
+function clear_vars() {
+ delete table
+ delete lptable2
+ delete lptable1
+ delete lptable3
+ eid = -1 # escape id
+ gid = -1 # group id
+ aid = -1 # AVX id
+ tname = ""
+}
+
+BEGIN {
+ # Implementation error checking
+ awkchecked = check_awk_implement()
+ if (awkchecked != "") {
+ print "Error: " awkchecked > "/dev/stderr"
+ print "Please try to use gawk." > "/dev/stderr"
+ exit 1
+ }
+
+ # Setup generating tables
+ print "/* x86 opcode map generated from x86-opcode-map.txt */"
+ print "/* Do not change this code. */\n"
+ ggid = 1
+ geid = 1
+ gaid = 0
+ delete etable
+ delete gtable
+ delete atable
+
+ opnd_expr = "^[A-Za-z/]"
+ ext_expr = "^\\("
+ sep_expr = "^\\|$"
+ group_expr = "^Grp[0-9A-Za-z]+"
+
+ imm_expr = "^[IJAO][a-z]"
+ imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+ imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+ imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+ imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+ imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+ imm_flag["Ob"] = "INAT_MOFFSET"
+ imm_flag["Ov"] = "INAT_MOFFSET"
+
+ modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
+ force64_expr = "\\([df]64\\)"
+ rex_expr = "^REX(\\.[XRWB]+)*"
+ fpu_expr = "^ESC" # TODO
+
+ lprefix1_expr = "\\(66\\)"
+ lprefix2_expr = "\\(F3\\)"
+ lprefix3_expr = "\\(F2\\)"
+ max_lprefix = 4
+
+ vexok_expr = "\\(VEX\\)"
+ vexonly_expr = "\\(oVEX\\)"
+
+ prefix_expr = "\\(Prefix\\)"
+ prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+ prefix_num["REPNE"] = "INAT_PFX_REPNE"
+ prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+ prefix_num["LOCK"] = "INAT_PFX_LOCK"
+ prefix_num["SEG=CS"] = "INAT_PFX_CS"
+ prefix_num["SEG=DS"] = "INAT_PFX_DS"
+ prefix_num["SEG=ES"] = "INAT_PFX_ES"
+ prefix_num["SEG=FS"] = "INAT_PFX_FS"
+ prefix_num["SEG=GS"] = "INAT_PFX_GS"
+ prefix_num["SEG=SS"] = "INAT_PFX_SS"
+ prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+ prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
+ prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+
+ clear_vars()
+}
+
+function semantic_error(msg) {
+ print "Semantic error at " NR ": " msg > "/dev/stderr"
+ exit 1
+}
+
+function debug(msg) {
+ print "DEBUG: " msg
+}
+
+function array_size(arr, i,c) {
+ c = 0
+ for (i in arr)
+ c++
+ return c
+}
+
+/^Table:/ {
+ print "/* " $0 " */"
+ if (tname != "")
+ semantic_error("Hit Table: before EndTable:.");
+}
+
+/^Referrer:/ {
+ if (NF != 1) {
+ # escape opcode table
+ ref = ""
+ for (i = 2; i <= NF; i++)
+ ref = ref $i
+ eid = escape[ref]
+ tname = sprintf("inat_escape_table_%d", eid)
+ }
+}
+
+/^AVXcode:/ {
+ if (NF != 1) {
+ # AVX/escape opcode table
+ aid = $2
+ if (gaid <= aid)
+ gaid = aid + 1
+ if (tname == "") # AVX only opcode table
+ tname = sprintf("inat_avx_table_%d", $2)
+ }
+ if (aid == -1 && eid == -1) # primary opcode table
+ tname = "inat_primary_table"
+}
+
+/^GrpTable:/ {
+ print "/* " $0 " */"
+ if (!($2 in group))
+ semantic_error("No group: " $2 )
+ gid = group[$2]
+ tname = "inat_group_table_" gid
+}
+
+function print_table(tbl,name,fmt,n)
+{
+ print "const insn_attr_t " name " = {"
+ for (i = 0; i < n; i++) {
+ id = sprintf(fmt, i)
+ if (tbl[id])
+ print " [" id "] = " tbl[id] ","
+ }
+ print "};"
+}
+
+/^EndTable/ {
+ if (gid != -1) {
+ # print group tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,3] = tname "_3"
+ }
+ } else {
+ # print primary/escaped tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,0] = tname
+ if (aid >= 0)
+ atable[aid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,1] = tname "_1"
+ if (aid >= 0)
+ atable[aid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,2] = tname "_2"
+ if (aid >= 0)
+ atable[aid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,3] = tname "_3"
+ if (aid >= 0)
+ atable[aid,3] = tname "_3"
+ }
+ }
+ print ""
+ clear_vars()
+}
+
+function add_flags(old,new) {
+ if (old && new)
+ return old " | " new
+ else if (old)
+ return old
+ else
+ return new
+}
+
+# convert operands to flags.
+function convert_operands(count,opnd, i,j,imm,mod)
+{
+ imm = null
+ mod = null
+ for (j = 1; j <= count; j++) {
+ i = opnd[j]
+ if (match(i, imm_expr) == 1) {
+ if (!imm_flag[i])
+ semantic_error("Unknown imm opnd: " i)
+ if (imm) {
+ if (i != "Ib")
+ semantic_error("Second IMM error")
+ imm = add_flags(imm, "INAT_SCNDIMM")
+ } else
+ imm = imm_flag[i]
+ } else if (match(i, modrm_expr))
+ mod = "INAT_MODRM"
+ }
+ return add_flags(imm, mod)
+}
+
+/^[0-9a-f]+\:/ {
+ if (NR == 1)
+ next
+ # get index
+ idx = "0x" substr($1, 1, index($1,":") - 1)
+ if (idx in table)
+ semantic_error("Redefine " idx " in " tname)
+
+ # check if escaped opcode
+ if ("escape" == $2) {
+ if ($3 != "#")
+ semantic_error("No escaped name")
+ ref = ""
+ for (i = 4; i <= NF; i++)
+ ref = ref $i
+ if (ref in escape)
+ semantic_error("Redefine escape (" ref ")")
+ escape[ref] = geid
+ geid++
+ table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
+ next
+ }
+
+ variant = null
+ # converts
+ i = 2
+ while (i <= NF) {
+ opcode = $(i++)
+ delete opnds
+ ext = null
+ flags = null
+ opnd = null
+ # parse one opcode
+ if (match($i, opnd_expr)) {
+ opnd = $i
+ count = split($(i++), opnds, ",")
+ flags = convert_operands(count, opnds)
+ }
+ if (match($i, ext_expr))
+ ext = $(i++)
+ if (match($i, sep_expr))
+ i++
+ else if (i < NF)
+ semantic_error($i " is not a separator")
+
+ # check if group opcode
+ if (match(opcode, group_expr)) {
+ if (!(opcode in group)) {
+ group[opcode] = ggid
+ ggid++
+ }
+ flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
+ }
+ # check force(or default) 64bit
+ if (match(ext, force64_expr))
+ flags = add_flags(flags, "INAT_FORCE64")
+
+ # check REX prefix
+ if (match(opcode, rex_expr))
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
+
+ # check coprocessor escape : TODO
+ if (match(opcode, fpu_expr))
+ flags = add_flags(flags, "INAT_MODRM")
+
+ # check VEX only code
+ if (match(ext, vexonly_expr))
+ flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
+
+ # check VEX only code
+ if (match(ext, vexok_expr))
+ flags = add_flags(flags, "INAT_VEXOK")
+
+ # check prefixes
+ if (match(ext, prefix_expr)) {
+ if (!prefix_num[opcode])
+ semantic_error("Unknown prefix: " opcode)
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
+ }
+ if (length(flags) == 0)
+ continue
+ # check if last prefix
+ if (match(ext, lprefix1_expr)) {
+ lptable1[idx] = add_flags(lptable1[idx],flags)
+ variant = "INAT_VARIANT"
+ } else if (match(ext, lprefix2_expr)) {
+ lptable2[idx] = add_flags(lptable2[idx],flags)
+ variant = "INAT_VARIANT"
+ } else if (match(ext, lprefix3_expr)) {
+ lptable3[idx] = add_flags(lptable3[idx],flags)
+ variant = "INAT_VARIANT"
+ } else {
+ table[idx] = add_flags(table[idx],flags)
+ }
+ }
+ if (variant)
+ table[idx] = add_flags(table[idx],variant)
+}
+
+END {
+ if (awkchecked != "")
+ exit 1
+ # print escape opcode map's array
+ print "/* Escape opcode map array */"
+ print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < geid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (etable[i,j])
+ print " ["i"]["j"] = "etable[i,j]","
+ print "};\n"
+ # print group opcode map's array
+ print "/* Group opcode map array */"
+ print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < ggid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (gtable[i,j])
+ print " ["i"]["j"] = "gtable[i,j]","
+ print "};\n"
+ # print AVX opcode map's array
+ print "/* AVX opcode map array */"
+ print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < gaid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (atable[i,j])
+ print " ["i"]["j"] = "atable[i,j]","
+ print "};"
+}
+
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 000000000000..13403fc95a96
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#define unlikely(cond) (cond)
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular. See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+ */
+
+const char *prog;
+static int verbose;
+static int x86_64;
+
+static void usage(void)
+{
+ fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
+ " %s [-y|-n] [-v]\n", prog);
+ fprintf(stderr, "\t-y 64bit mode\n");
+ fprintf(stderr, "\t-n 32bit mode\n");
+ fprintf(stderr, "\t-v verbose mode\n");
+ exit(1);
+}
+
+static void malformed_line(const char *line, int line_nr)
+{
+ fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
+ exit(3);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+ struct insn_field *field)
+{
+ fprintf(fp, "%s.%s = {\n", indent, name);
+ fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+ indent, field->value, field->bytes[0], field->bytes[1],
+ field->bytes[2], field->bytes[3]);
+ fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+ field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+ fprintf(fp, "Instruction = {\n");
+ dump_field(fp, "prefixes", "\t", &insn->prefixes);
+ dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
+ dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
+ dump_field(fp, "opcode", "\t", &insn->opcode);
+ dump_field(fp, "modrm", "\t", &insn->modrm);
+ dump_field(fp, "sib", "\t", &insn->sib);
+ dump_field(fp, "displacement", "\t", &insn->displacement);
+ dump_field(fp, "immediate1", "\t", &insn->immediate1);
+ dump_field(fp, "immediate2", "\t", &insn->immediate2);
+ fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+ insn->attr, insn->opnd_bytes, insn->addr_bytes);
+ fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+ insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ prog = argv[0];
+ while ((c = getopt(argc, argv, "ynv")) != -1) {
+ switch (c) {
+ case 'y':
+ x86_64 = 1;
+ break;
+ case 'n':
+ x86_64 = 0;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage();
+ }
+ }
+}
+
+#define BUFSIZE 256
+
+int main(int argc, char **argv)
+{
+ char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
+ unsigned char insn_buf[16];
+ struct insn insn;
+ int insns = 0;
+ int warnings = 0;
+
+ parse_args(argc, argv);
+
+ while (fgets(line, BUFSIZE, stdin)) {
+ char copy[BUFSIZE], *s, *tab1, *tab2;
+ int nb = 0;
+ unsigned int b;
+
+ if (line[0] == '<') {
+ /* Symbol line */
+ strcpy(sym, line);
+ continue;
+ }
+
+ insns++;
+ memset(insn_buf, 0, 16);
+ strcpy(copy, line);
+ tab1 = strchr(copy, '\t');
+ if (!tab1)
+ malformed_line(line, insns);
+ s = tab1 + 1;
+ s += strspn(s, " ");
+ tab2 = strchr(s, '\t');
+ if (!tab2)
+ malformed_line(line, insns);
+ *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
+ while (s < tab2) {
+ if (sscanf(s, "%x", &b) == 1) {
+ insn_buf[nb++] = (unsigned char) b;
+ s += 3;
+ } else
+ break;
+ }
+ /* Decode an instruction */
+ insn_init(&insn, insn_buf, x86_64);
+ insn_get_length(&insn);
+ if (insn.length != nb) {
+ warnings++;
+ fprintf(stderr, "Warning: %s found difference at %s\n",
+ prog, sym);
+ fprintf(stderr, "Warning: %s", line);
+ fprintf(stderr, "Warning: objdump says %d bytes, but "
+ "insn_get_length() says %d\n", nb,
+ insn.length);
+ if (verbose)
+ dump_insn(stderr, &insn);
+ }
+ }
+ if (warnings)
+ fprintf(stderr, "Warning: decoded and checked %d"
+ " instructions with %d warnings\n", insns, warnings);
+ else
+ fprintf(stderr, "Succeed: decoded and checked %d"
+ " instructions\n", insns);
+ return 0;
+}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
static ctl_table abi_root_table2[] = {
{
- .ctl_name = CTL_ABI,
.procname = "abi",
.mode = 0555,
.child = abi_table2
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index b53225d2cac3..e133ce25e290 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file)
return 0;
}
-static struct file_operations u32_array_fops = {
+static const struct file_operations u32_array_fops = {
.owner = THIS_MODULE,
.open = u32_array_open,
.release= xen_array_release,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3439616d69f1..b607239c1ba8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -27,7 +27,9 @@
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
+#include <linux/pci.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
@@ -48,6 +50,7 @@
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
@@ -138,24 +141,23 @@ static void xen_vcpu_setup(int cpu)
*/
void xen_vcpu_restore(void)
{
- if (have_vcpu_info_placement) {
- int cpu;
+ int cpu;
- for_each_online_cpu(cpu) {
- bool other_cpu = (cpu != smp_processor_id());
+ for_each_online_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
- if (other_cpu &&
- HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
- BUG();
+ if (other_cpu &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+ BUG();
- xen_vcpu_setup(cpu);
+ xen_setup_runstate_info(cpu);
- if (other_cpu &&
- HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
- BUG();
- }
+ if (have_vcpu_info_placement)
+ xen_vcpu_setup(cpu);
- BUG_ON(!have_vcpu_info_placement);
+ if (other_cpu &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+ BUG();
}
}
@@ -178,6 +180,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
+ unsigned maskebx = ~0;
unsigned maskecx = ~0;
unsigned maskedx = ~0;
@@ -185,9 +188,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
- if (*ax == 1) {
+ switch (*ax) {
+ case 1:
maskecx = cpuid_leaf1_ecx_mask;
maskedx = cpuid_leaf1_edx_mask;
+ break;
+
+ case 0xb:
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
}
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -197,6 +207,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
"=d" (*dx)
: "0" (*ax), "2" (*cx));
+ *bx &= maskebx;
*cx &= maskecx;
*dx &= maskedx;
}
@@ -1075,6 +1086,8 @@ asmlinkage void __init xen_start_kernel(void)
* Set up some pagetable state before starting to set any ptes.
*/
+ xen_init_mmu_ops();
+
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (!xen_initial_domain())
@@ -1082,10 +1095,14 @@ asmlinkage void __init xen_start_kernel(void)
__supported_pte_mask |= _PAGE_IOMAP;
-#ifdef CONFIG_X86_64
+ /*
+ * Prevent page tables from being allocated in highmem, even
+ * if CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
/* Work out if we support NX */
- check_efer();
-#endif
+ x86_configure_nx();
xen_setup_features();
@@ -1099,7 +1116,6 @@ asmlinkage void __init xen_start_kernel(void)
*/
xen_setup_stackprotector();
- xen_init_mmu_ops();
xen_init_irq_ops();
xen_init_cpuid_mask();
@@ -1142,9 +1158,13 @@ asmlinkage void __init xen_start_kernel(void)
/* keep using Xen gdt for now; no urgent need to change it */
+#ifdef CONFIG_X86_32
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
+#else
+ pv_info.kernel_rpl = 0;
+#endif
/* set the limit of our address space */
xen_reserve_top();
@@ -1168,10 +1188,16 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
+ } else {
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
}
+
xen_raw_console_write("about to get started...\n");
+ xen_setup_runstate_info(0);
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3bf7b1d250ce..f9eb7de74f42 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -185,7 +185,7 @@ static inline unsigned p2m_index(unsigned long pfn)
}
/* Build the parallel p2m_top_mfn structures */
-static void __init xen_build_mfn_list_list(void)
+void xen_build_mfn_list_list(void)
{
unsigned pfn, idx;
@@ -1427,23 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
- pgprot_t prot = PAGE_KERNEL;
-
- if (PagePinned(page))
- prot = PAGE_KERNEL_RO;
-
- if (0 && PageHighMem(page))
- printk("mapping highpte %lx type %d prot %s\n",
- page_to_pfn(page), type,
- (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
- return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
@@ -1902,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
-#ifdef CONFIG_HIGHPTE
- .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
#ifdef CONFIG_X86_64
.set_pte = xen_set_pte,
#else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index fe03eeed7b48..deafb65ef44e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,10 +35,10 @@
cpumask_var_t xen_cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
-static DEFINE_PER_CPU(int, callfuncsingle_irq);
-static DEFINE_PER_CPU(int, debug_irq) = -1;
+static DEFINE_PER_CPU(int, xen_resched_irq);
+static DEFINE_PER_CPU(int, xen_callfunc_irq);
+static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
+static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -73,7 +73,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();
- cpu_set(cpu, cpu_online_map);
+ set_cpu_online(cpu, true);
percpu_write(cpu_state, CPU_ONLINE);
wmb();
@@ -103,7 +103,7 @@ static int xen_smp_intr_init(unsigned int cpu)
NULL);
if (rc < 0)
goto fail;
- per_cpu(resched_irq, cpu) = rc;
+ per_cpu(xen_resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
@@ -114,7 +114,7 @@ static int xen_smp_intr_init(unsigned int cpu)
NULL);
if (rc < 0)
goto fail;
- per_cpu(callfunc_irq, cpu) = rc;
+ per_cpu(xen_callfunc_irq, cpu) = rc;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
@@ -122,7 +122,7 @@ static int xen_smp_intr_init(unsigned int cpu)
debug_name, NULL);
if (rc < 0)
goto fail;
- per_cpu(debug_irq, cpu) = rc;
+ per_cpu(xen_debug_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
@@ -133,19 +133,20 @@ static int xen_smp_intr_init(unsigned int cpu)
NULL);
if (rc < 0)
goto fail;
- per_cpu(callfuncsingle_irq, cpu) = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu) = rc;
return 0;
fail:
- if (per_cpu(resched_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- if (per_cpu(callfunc_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
- if (per_cpu(debug_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
- if (per_cpu(callfuncsingle_irq, cpu) >= 0)
- unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+ if (per_cpu(xen_resched_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+ if (per_cpu(xen_callfunc_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+ if (per_cpu(xen_debug_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+ if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
+ NULL);
return rc;
}
@@ -295,6 +296,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
@@ -348,10 +350,10 @@ static void xen_cpu_die(unsigned int cpu)
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
- unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
- unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
@@ -359,7 +361,7 @@ static void xen_cpu_die(unsigned int cpu)
alternatives_smp_switch(0);
}
-static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */
+static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 36a5141108df..24ded31b5aec 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -120,14 +120,14 @@ struct xen_spinlock {
unsigned short spinners; /* count of waiting cpus */
};
-static int xen_spin_is_locked(struct raw_spinlock *lock)
+static int xen_spin_is_locked(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
return xl->lock != 0;
}
-static int xen_spin_is_contended(struct raw_spinlock *lock)
+static int xen_spin_is_contended(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
@@ -136,7 +136,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock)
return xl->spinners != 0;
}
-static int xen_spin_trylock(struct raw_spinlock *lock)
+static int xen_spin_trylock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
@@ -181,7 +181,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
__get_cpu_var(lock_spinners) = prev;
}
-static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
+static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
@@ -254,7 +254,7 @@ out:
return ret;
}
-static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
+static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
unsigned timeout;
@@ -291,12 +291,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
spin_time_accum_total(start_spin);
}
-static void xen_spin_lock(struct raw_spinlock *lock)
+static void xen_spin_lock(struct arch_spinlock *lock)
{
__xen_spin_lock(lock, false);
}
-static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
{
__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
}
@@ -317,7 +317,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
}
}
-static void xen_spin_unlock(struct raw_spinlock *lock)
+static void xen_spin_unlock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 95be7b434724..987267f79bf5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
#include <linux/types.h>
+#include <linux/clockchips.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
@@ -27,6 +28,8 @@ void xen_pre_suspend(void)
void xen_post_suspend(int suspend_cancelled)
{
+ xen_build_mfn_list_list();
+
xen_setup_shared_info();
if (suspend_cancelled) {
@@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled)
}
+static void xen_vcpu_notify_restore(void *data)
+{
+ unsigned long reason = (unsigned long)data;
+
+ /* Boot processor notified via generic timekeeping_resume() */
+ if ( smp_processor_id() == 0)
+ return;
+
+ clockevents_notify(reason, NULL);
+}
+
void xen_arch_resume(void)
{
- /* nothing */
+ smp_call_function(xen_vcpu_notify_restore,
+ (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0a5aa44299a5..0d3f07cd1b5f 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -31,14 +31,14 @@
#define NS_PER_TICK (1000000000LL / HZ)
/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen and blocked time */
-static DEFINE_PER_CPU(u64, residual_stolen);
-static DEFINE_PER_CPU(u64, residual_blocked);
+static DEFINE_PER_CPU(u64, xen_residual_stolen);
+static DEFINE_PER_CPU(u64, xen_residual_blocked);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
@@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
BUG_ON(preemptible());
- state = &__get_cpu_var(runstate);
+ state = &__get_cpu_var(xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
@@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
- return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+ return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}
-static void setup_runstate_info(int cpu)
+void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
- area.addr.v = &per_cpu(runstate, cpu);
+ area.addr.v = &per_cpu(xen_runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
@@ -122,7 +122,7 @@ static void do_stolen_accounting(void)
WARN_ON(state.state != RUNSTATE_running);
- snap = &__get_cpu_var(runstate_snapshot);
+ snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
@@ -133,24 +133,24 @@ static void do_stolen_accounting(void)
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
- stolen = runnable + offline + __get_cpu_var(residual_stolen);
+ stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
- __get_cpu_var(residual_stolen) = stolen;
+ __get_cpu_var(xen_residual_stolen) = stolen;
account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. */
- blocked += __get_cpu_var(residual_blocked);
+ blocked += __get_cpu_var(xen_residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
- __get_cpu_var(residual_blocked) = blocked;
+ __get_cpu_var(xen_residual_blocked) = blocked;
account_idle_ticks(ticks);
}
@@ -434,7 +434,7 @@ void xen_setup_timer(int cpu)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
- IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
name, NULL);
evt = &per_cpu(xen_clock_events, cpu);
@@ -442,8 +442,6 @@ void xen_setup_timer(int cpu)
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
-
- setup_runstate_info(cpu);
}
void xen_teardown_timer(int cpu)
@@ -494,6 +492,7 @@ __init void xen_time_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC);
+ xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b82..22a2093b5862 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax), %eax
movl __per_cpu_offset(,%eax,4), %eax
- mov per_cpu__xen_vcpu(%eax), %eax
+ mov xen_vcpu(%eax), %eax
#else
- movl per_cpu__xen_vcpu, %eax
+ movl xen_vcpu, %eax
#endif
/* check IF state we're restoring */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 02f496a8dbaa..53adefda4275 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -96,7 +96,7 @@ ENTRY(xen_sysret32)
pushq $__USER32_CS
pushq %rcx
- pushq $VGCF_in_syscall
+ pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
@@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
- pushq $VGCF_in_syscall
+ pushq $0
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 355fa6b99c9c..f9153a300bce 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
+void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
@@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
+void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);