summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig176
-rw-r--r--arch/x86/Kconfig.debug14
-rw-r--r--arch/x86/Makefile10
-rw-r--r--arch/x86/Makefile_32.cpu13
-rw-r--r--arch/x86/boot/Makefile8
-rw-r--r--arch/x86/boot/boot.h28
-rw-r--r--arch/x86/boot/cmdline.c6
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/cmdline.c21
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c5
-rw-r--r--arch/x86/boot/compressed/head_32.S13
-rw-r--r--arch/x86/boot/compressed/head_64.S13
-rw-r--r--arch/x86/boot/compressed/misc.c85
-rw-r--r--arch/x86/boot/compressed/misc.h39
-rw-r--r--arch/x86/boot/compressed/string.c2
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S6
-rw-r--r--arch/x86/boot/ctype.h21
-rw-r--r--arch/x86/boot/early_serial_console.c151
-rw-r--r--arch/x86/boot/main.c9
-rw-r--r--arch/x86/boot/printf.c4
-rw-r--r--arch/x86/boot/string.c63
-rw-r--r--arch/x86/boot/tty.c37
-rw-r--r--arch/x86/configs/i386_defconfig2241
-rw-r--r--arch/x86/configs/x86_64_defconfig2208
-rw-r--r--arch/x86/ia32/ia32_aout.c22
-rw-r--r--arch/x86/ia32/ia32entry.S25
-rw-r--r--arch/x86/ia32/sys_ia32.c23
-rw-r--r--arch/x86/include/asm/Kbuild29
-rw-r--r--arch/x86/include/asm/acpi.h5
-rw-r--r--arch/x86/include/asm/alternative.h18
-rw-r--r--arch/x86/include/asm/amd_iommu.h6
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h8
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h23
-rw-r--r--arch/x86/include/asm/amd_nb.h (renamed from arch/x86/include/asm/k8.h)21
-rw-r--r--arch/x86/include/asm/apb_timer.h2
-rw-r--r--arch/x86/include/asm/apic.h4
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/bitops.h4
-rw-r--r--arch/x86/include/asm/bootparam.h11
-rw-r--r--arch/x86/include/asm/calgary.h4
-rw-r--r--arch/x86/include/asm/calling.h52
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h198
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h83
-rw-r--r--arch/x86/include/asm/compat.h2
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h61
-rw-r--r--arch/x86/include/asm/dma-mapping.h8
-rw-r--r--arch/x86/include/asm/dwarf2.h20
-rw-r--r--arch/x86/include/asm/e820.h20
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h23
-rw-r--r--arch/x86/include/asm/fixmap.h15
-rw-r--r--arch/x86/include/asm/gart.h20
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/highmem.h11
-rw-r--r--arch/x86/include/asm/hpet.h11
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h19
-rw-r--r--arch/x86/include/asm/hypervisor.h1
-rw-r--r--arch/x86/include/asm/i387.h203
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h20
-rw-r--r--arch/x86/include/asm/io.h15
-rw-r--r--arch/x86/include/asm/io_apic.h7
-rw-r--r--arch/x86/include/asm/iomap.h6
-rw-r--r--arch/x86/include/asm/iommu_table.h100
-rw-r--r--arch/x86/include/asm/irq.h12
-rw-r--r--arch/x86/include/asm/irq_remapping.h35
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/irqflags.h32
-rw-r--r--arch/x86/include/asm/jump_label.h37
-rw-r--r--arch/x86/include/asm/kdebug.h6
-rw-r--r--arch/x86/include/asm/kgdb.h20
-rw-r--r--arch/x86/include/asm/kvm.h22
-rw-r--r--arch/x86/include/asm/kvm_emulate.h61
-rw-r--r--arch/x86/include/asm/kvm_host.h175
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/local64.h1
-rw-r--r--arch/x86/include/asm/mce.h4
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/module.h7
-rw-r--r--arch/x86/include/asm/mrst.h34
-rw-r--r--arch/x86/include/asm/msr-index.h28
-rw-r--r--arch/x86/include/asm/msr.h4
-rw-r--r--arch/x86/include/asm/mwait.h15
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/olpc.h2
-rw-r--r--arch/x86/include/asm/olpc_ofw.h35
-rw-r--r--arch/x86/include/asm/page.h7
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h21
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/pci.h39
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/percpu.h14
-rw-r--r--arch/x86/include/asm/perf_event.h37
-rw-r--r--arch/x86/include/asm/perf_event_p4.h151
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_32.h15
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/processor.h50
-rw-r--r--arch/x86/include/asm/pvclock.h38
-rw-r--r--arch/x86/include/asm/required-features.h2
-rw-r--r--arch/x86/include/asm/rwsem.h21
-rw-r--r--arch/x86/include/asm/scatterlist.h1
-rw-r--r--arch/x86/include/asm/segment.h32
-rw-r--r--arch/x86/include/asm/setup.h7
-rw-r--r--arch/x86/include/asm/smp.h9
-rw-r--r--arch/x86/include/asm/stacktrace.h49
-rw-r--r--arch/x86/include/asm/swiotlb.h13
-rw-r--r--arch/x86/include/asm/sys_ia32.h15
-rw-r--r--arch/x86/include/asm/syscalls.h5
-rw-r--r--arch/x86/include/asm/system.h7
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/trampoline.h2
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/unistd_32.h5
-rw-r--r--arch/x86/include/asm/unistd_64.h6
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h151
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h21
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/x86_init.h9
-rw-r--r--arch/x86/include/asm/xen/hypercall.h23
-rw-r--r--arch/x86/include/asm/xen/page.h20
-rw-r--r--arch/x86/include/asm/xen/pci.h65
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h14
-rw-r--r--arch/x86/include/asm/xsave.h40
-rw-r--r--arch/x86/kernel/Makefile21
-rw-r--r--arch/x86/kernel/acpi/boot.c60
-rw-r--r--arch/x86/kernel/acpi/cstate.c13
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S2
-rw-r--r--arch/x86/kernel/acpi/sleep.c17
-rw-r--r--arch/x86/kernel/alternative.c33
-rw-r--r--arch/x86/kernel/amd_iommu.c14
-rw-r--r--arch/x86/kernel/amd_iommu_init.c178
-rw-r--r--arch/x86/kernel/amd_nb.c (renamed from arch/x86/kernel/k8.c)56
-rw-r--r--arch/x86/kernel/apb_timer.c97
-rw-r--r--arch/x86/kernel/aperture_64.c35
-rw-r--r--arch/x86/kernel/apic/Makefile7
-rw-r--r--arch/x86/kernel/apic/apic.c95
-rw-r--r--arch/x86/kernel/apic/es7000_32.c1
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c107
-rw-r--r--arch/x86/kernel/apic/io_apic.c888
-rw-r--r--arch/x86/kernel/apic/nmi.c9
-rw-r--r--arch/x86/kernel/apic/numaq_32.c3
-rw-r--r--arch/x86/kernel/apic/probe_64.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c34
-rw-r--r--arch/x86/kernel/apm_32.c7
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/check.c16
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c154
-rw-r--r--arch/x86/kernel/cpu/common.c54
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c10
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h26
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c20
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c8
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c7
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c123
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c40
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c216
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c131
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c56
-rw-r--r--arch/x86/kernel/cpu/perf_event.c439
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c102
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c229
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c433
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c9
-rw-r--r--arch/x86/kernel/cpu/scattered.c70
-rw-r--r--arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c)58
-rw-r--r--arch/x86/kernel/cpu/vmware.c9
-rw-r--r--arch/x86/kernel/crash.c3
-rw-r--r--arch/x86/kernel/crash_dump_32.c2
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/dumpstack.c1
-rw-r--r--arch/x86/kernel/dumpstack.h56
-rw-r--r--arch/x86/kernel/dumpstack_32.c8
-rw-r--r--arch/x86/kernel/dumpstack_64.c9
-rw-r--r--arch/x86/kernel/e820.c191
-rw-r--r--arch/x86/kernel/early-quirks.c20
-rw-r--r--arch/x86/kernel/early_printk.c13
-rw-r--r--arch/x86/kernel/early_printk_mrst.c319
-rw-r--r--arch/x86/kernel/entry_32.S326
-rw-r--r--arch/x86/kernel/entry_64.S147
-rw-r--r--arch/x86/kernel/ftrace.c63
-rw-r--r--arch/x86/kernel/head.c3
-rw-r--r--arch/x86/kernel/head32.c13
-rw-r--r--arch/x86/kernel/head64.c9
-rw-r--r--arch/x86/kernel/head_32.S55
-rw-r--r--arch/x86/kernel/head_64.S5
-rw-r--r--arch/x86/kernel/hpet.c83
-rw-r--r--arch/x86/kernel/hw_breakpoint.c79
-rw-r--r--arch/x86/kernel/i387.c81
-rw-r--r--arch/x86/kernel/i8259.c63
-rw-r--r--arch/x86/kernel/irq.c32
-rw-r--r--arch/x86/kernel/irq_32.c25
-rw-r--r--arch/x86/kernel/irq_work.c30
-rw-r--r--arch/x86/kernel/irqinit.c23
-rw-r--r--arch/x86/kernel/jump_label.c50
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c205
-rw-r--r--arch/x86/kernel/kprobes.c72
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c4
-rw-r--r--arch/x86/kernel/microcode_core.c3
-rw-r--r--arch/x86/kernel/microcode_intel.c2
-rw-r--r--arch/x86/kernel/module.c6
-rw-r--r--arch/x86/kernel/mpparse.c21
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c41
-rw-r--r--arch/x86/kernel/pci-gart_64.c33
-rw-r--r--arch/x86/kernel/pci-iommu_table.c89
-rw-r--r--arch/x86/kernel/pci-swiotlb.c44
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process.c59
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c7
-rw-r--r--arch/x86/kernel/ptrace.c17
-rw-r--r--arch/x86/kernel/pvclock.c3
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c14
-rw-r--r--arch/x86/kernel/setup.c237
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/smp.c15
-rw-r--r--arch/x86/kernel/smpboot.c181
-rw-r--r--arch/x86/kernel/stacktrace.c31
-rw-r--r--arch/x86/kernel/sys_i386_32.c8
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/trampoline.c11
-rw-r--r--arch/x86/kernel/traps.c43
-rw-r--r--arch/x86/kernel/tsc.c109
-rw-r--r--arch/x86/kernel/verify_cpu_64.S3
-rw-r--r--arch/x86/kernel/vm86_32.c10
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
-rw-r--r--arch/x86/kernel/vmlinux.lds.S30
-rw-r--r--arch/x86/kernel/vsyscall_64.c17
-rw-r--r--arch/x86/kernel/x86_init.c7
-rw-r--r--arch/x86/kernel/xsave.c195
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/emulate.c2770
-rw-r--r--arch/x86/kvm/i8254.c158
-rw-r--r--arch/x86/kvm/i8254.h4
-rw-r--r--arch/x86/kvm/i8259.c72
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h17
-rw-r--r--arch/x86/kvm/lapic.c33
-rw-r--r--arch/x86/kvm/mmu.c1639
-rw-r--r--arch/x86/kvm/mmu.h9
-rw-r--r--arch/x86/kvm/mmu_audit.c299
-rw-r--r--arch/x86/kvm/mmutrace.h21
-rw-r--r--arch/x86/kvm/paging_tmpl.h434
-rw-r--r--arch/x86/kvm/svm.c441
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/vmx.c500
-rw-r--r--arch/x86/kvm/x86.c1915
-rw-r--r--arch/x86/kvm/x86.h15
-rw-r--r--arch/x86/lguest/boot.c31
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S238
-rw-r--r--arch/x86/lib/clear_page_64.S2
-rw-r--r--arch/x86/lib/cmpxchg.c (renamed from arch/x86/kernel/cpu/cmpxchg.c)18
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/memcpy_32.c199
-rw-r--r--arch/x86/lib/memcpy_64.S160
-rw-r--r--arch/x86/lib/memmove_64.c189
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c32
-rw-r--r--arch/x86/mm/fault.c114
-rw-r--r--arch/x86/mm/highmem_32.c76
-rw-r--r--arch/x86/mm/init.c10
-rw-r--r--arch/x86/mm/init_32.c166
-rw-r--r--arch/x86/mm/init_64.c119
-rw-r--r--arch/x86/mm/iomap_32.c45
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/k8topology_64.c12
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/kmmio.c16
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c7
-rw-r--r--arch/x86/mm/numa_32.c30
-rw-r--r--arch/x86/mm/numa_64.c91
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pf_in.c30
-rw-r--r--arch/x86/mm/pgtable.c24
-rw-r--r--arch/x86/mm/srat_32.c3
-rw-r--r--arch/x86/mm/srat_64.c17
-rw-r--r--arch/x86/mm/testmmiotrace.c22
-rw-r--r--arch/x86/mm/tlb.c52
-rw-r--r--arch/x86/oprofile/backtrace.c70
-rw-r--r--arch/x86/oprofile/nmi_int.c55
-rw-r--r--arch/x86/oprofile/op_model_amd.c261
-rw-r--r--arch/x86/pci/Makefile1
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/common.c37
-rw-r--r--arch/x86/pci/i386.c19
-rw-r--r--arch/x86/pci/irq.c17
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c4
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/xen.c414
-rw-r--r--arch/x86/platform/Makefile8
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c (renamed from arch/x86/kernel/efi.c)5
-rw-r--r--arch/x86/platform/efi/efi_32.c (renamed from arch/x86/kernel/efi_32.c)0
-rw-r--r--arch/x86/platform/efi/efi_64.c (renamed from arch/x86/kernel/efi_64.c)0
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S (renamed from arch/x86/kernel/efi_stub_32.S)0
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S (renamed from arch/x86/kernel/efi_stub_64.S)0
-rw-r--r--arch/x86/platform/mrst/Makefile1
-rw-r--r--arch/x86/platform/mrst/mrst.c (renamed from arch/x86/kernel/mrst.c)105
-rw-r--r--arch/x86/platform/olpc/Makefile3
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c140
-rw-r--r--arch/x86/platform/olpc/olpc.c (renamed from arch/x86/kernel/olpc.c)107
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c112
-rw-r--r--arch/x86/platform/scx200/Makefile2
-rw-r--r--arch/x86/platform/scx200/scx200_32.c (renamed from arch/x86/kernel/scx200_32.c)0
-rw-r--r--arch/x86/platform/sfi/Makefile1
-rw-r--r--arch/x86/platform/sfi/sfi.c (renamed from arch/x86/kernel/sfi.c)4
-rw-r--r--arch/x86/platform/uv/Makefile1
-rw-r--r--arch/x86/platform/uv/bios_uv.c (renamed from arch/x86/kernel/bios_uv.c)0
-rw-r--r--arch/x86/platform/uv/tlb_uv.c (renamed from arch/x86/kernel/tlb_uv.c)766
-rw-r--r--arch/x86/platform/uv/uv_irq.c (renamed from arch/x86/kernel/uv_irq.c)55
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c (renamed from arch/x86/kernel/uv_sysfs.c)0
-rw-r--r--arch/x86/platform/uv/uv_time.c (renamed from arch/x86/kernel/uv_time.c)0
-rw-r--r--arch/x86/platform/visws/Makefile1
-rw-r--r--arch/x86/platform/visws/visws_quirks.c (renamed from arch/x86/kernel/visws_quirks.c)140
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--arch/x86/vdso/Makefile3
-rwxr-xr-xarch/x86/vdso/checkundef.sh10
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c3
-rw-r--r--arch/x86/xen/Kconfig26
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/debugfs.c1
-rw-r--r--arch/x86/xen/enlighten.c226
-rw-r--r--arch/x86/xen/mmu.c900
-rw-r--r--arch/x86/xen/mmu.h2
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c67
-rw-r--r--arch/x86/xen/platform-pci-unplug.c143
-rw-r--r--arch/x86/xen/setup.c190
-rw-r--r--arch/x86/xen/smp.c34
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/suspend.c12
-rw-r--r--arch/x86/xen/time.c97
-rw-r--r--arch/x86/xen/xen-ops.h16
370 files changed, 16703 insertions, 14534 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb36..0e103236b754 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -14,3 +14,4 @@ obj-y += crypto/
obj-y += vdso/
obj-$(CONFIG_IA32_EMULATION) += ia32/
+obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593b4a66..e8327686d3c5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,6 +1,3 @@
-# x86 configuration
-mainmenu "Linux Kernel Configuration for x86"
-
# Select 32 or 64 bit
config 64BIT
bool "64-bit kernel" if ARCH = "x86"
@@ -25,14 +22,17 @@ config X86
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PERF_EVENTS if (!M386 && !M486)
+ select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
+ select HAVE_MEMBLOCK
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
select HAVE_KRETPROBES
select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_C_RECORDMCOUNT
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
@@ -55,9 +55,16 @@ config X86
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
+ select HAVE_PERF_EVENTS_NMI
select ANON_INODES
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_TEXT_POKE_SMP
+ select HAVE_GENERIC_HARDIRQS
+ select HAVE_SPARSE_IRQ
+ select GENERIC_IRQ_PROBE
+ select GENERIC_PENDING_IRQ if SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -72,9 +79,6 @@ config ARCH_DEFCONFIG
default "arch/x86/configs/i386_defconfig" if X86_32
default "arch/x86/configs/x86_64_defconfig" if X86_64
-config GENERIC_TIME
- def_bool y
-
config GENERIC_CMOS_UPDATE
def_bool y
@@ -195,27 +199,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
-config HAVE_EARLY_RES
- def_bool y
-
config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
- def_bool y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
-config GENERIC_IRQ_PROBE
- def_bool y
-
-config GENERIC_PENDING_IRQ
- def_bool y
- depends on GENERIC_HARDIRQS && SMP
-
config USE_GENERIC_SMP_HELPERS
def_bool y
depends on SMP
@@ -247,6 +234,11 @@ config ARCH_HWEIGHT_CFLAGS
config KTIME_SCALAR
def_bool X86_32
+
+config ARCH_CPU_PROBE_RELEASE
+ def_bool y
+ depends on HOTPLUG_CPU
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -293,23 +285,6 @@ config X86_X2APIC
If you don't know what to do here, say N.
-config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on PCI_MSI || HT_IRQ
- ---help---
- This enables support for sparse irqs. This is useful for distro
- kernels that want to define a high CONFIG_NR_CPUS value but still
- want to have low kernel memory footprint on smaller machines.
-
- ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
- out the irq_desc[] array in a more NUMA-friendly way. )
-
- If you don't know what to do here, say N.
-
-config NUMA_IRQ_DESC
- def_bool y
- depends on SPARSE_IRQ && NUMA
-
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
@@ -369,6 +344,7 @@ endif
config X86_VSMP
bool "ScaleMP vSMP"
+ select PARAVIRT_GUEST
select PARAVIRT
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
@@ -514,25 +490,6 @@ if PARAVIRT_GUEST
source "arch/x86/xen/Kconfig"
-config VMI
- bool "VMI Guest support (DEPRECATED)"
- select PARAVIRT
- depends on X86_32
- ---help---
- VMI provides a paravirtualized interface to the VMware ESX server
- (it could be used by other hypervisors in theory too, but is not
- at the moment), by linking the kernel to a GPL-ed ROM module
- provided by the hypervisor.
-
- As of September 2009, VMware has started a phased retirement
- of this feature from VMware's products. Please see
- feature-removal-schedule.txt for details. If you are
- planning to enable this option, please note that you cannot
- live migrate a VMI enabled VM to a future VMware product,
- which doesn't support VMI. So if you expect your kernel to
- seamlessly migrate to newer VMware products, keep this
- disabled.
-
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
@@ -587,16 +544,7 @@ config PARAVIRT_DEBUG
a paravirt_op is missing when it is called.
config NO_BOOTMEM
- default y
- bool "Disable Bootmem code"
- ---help---
- Use early_res directly instead of bootmem before slab is ready.
- - allocator (buddy) [generic]
- - early allocator (bootmem) [generic]
- - very early allocator (reserve_early*()) [x86]
- - very very early allocator (early brk model) [x86]
- So reduce one layer between early allocator to final allocator
-
+ def_bool y
config MEMTEST
bool "Memtest"
@@ -667,7 +615,7 @@ config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- depends on X86_64 && PCI && K8_NB
+ depends on X86_64 && PCI && AMD_NB
---help---
Support for full DMA access of devices with 32bit memory access only
on systems with more than 3GB. This is usually needed for USB,
@@ -751,11 +699,11 @@ config IOMMU_API
def_bool (AMD_IOMMU || DMAR)
config MAXSMP
- bool "Configure Maximum number of SMP Processors and NUMA Nodes"
+ bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
select CPUMASK_OFFSTACK
---help---
- Configure maximum number of CPUS and NUMA Nodes for this architecture.
+ Enable maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
config NR_CPUS
@@ -792,6 +740,17 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config IRQ_TIME_ACCOUNTING
+ bool "Fine granularity task level IRQ time accounting"
+ default n
+ ---help---
+ Select this option to enable fine granularity task irq time
+ accounting. This is done by reading a timestamp on each
+ transitions between softirq and hardirq state, so there can be a
+ small performance impact.
+
+ If in doubt, say N here.
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
@@ -1145,6 +1104,9 @@ config X86_PAE
config ARCH_PHYS_ADDR_T_64BIT
def_bool X86_64 || X86_PAE
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool X86_64 || HIGHMEM64G
+
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
default y
@@ -1323,25 +1285,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW_64K
- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
- default y
+config X86_RESERVE_LOW
+ int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+ default 64
+ range 4 640
---help---
- Reserve the first 64K of physical RAM on BIOSes that are known
- to potentially corrupt that memory range. A numbers of BIOSes are
- known to utilize this area during suspend/resume, so it must not
- be used by the kernel.
+ Specify the amount of low memory to reserve for the BIOS.
+
+ The first page contains BIOS data structures that the kernel
+ must not use, so that page must always be reserved.
+
+ By default we reserve the first 64K of physical RAM, as a
+ number of BIOSes are known to corrupt that memory range
+ during events such as suspend/resume or monitor cable
+ insertion, so it must not be used by the kernel.
- Set this to N if you are absolutely sure that you trust the BIOS
- to get all its memory reservations and usages right.
+ You can set this to 4 if you are absolutely sure that you
+ trust the BIOS to get all its memory reservations and usages
+ right. If you know your BIOS have problems beyond the
+ default 64K area, you can set this to 640 to avoid using the
+ entire low memory range.
- If you have doubts about the BIOS (e.g. suspend/resume does not
- work or there's kernel crashes after certain hardware hotplug
- events) and it's not AMI or Phoenix, then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
- corruption patterns.
+ If you have doubts about the BIOS (e.g. suspend/resume does
+ not work or there's kernel crashes after certain hardware
+ hotplug events) then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+ typical corruption patterns.
- Say Y if unsure.
+ Leave this to the default value of 64 if you are unsure.
config MATH_EMULATION
bool
@@ -1897,7 +1868,7 @@ config PCI_GODIRECT
bool "Direct"
config PCI_GOOLPC
- bool "OLPC"
+ bool "OLPC XO-1"
depends on OLPC
config PCI_GOANY
@@ -1922,6 +1893,11 @@ config PCI_OLPC
def_bool y
depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+config PCI_XEN
+ def_bool y
+ depends on PCI && XEN
+ select SWIOTLB_XEN
+
config PCI_DOMAINS
def_bool y
depends on PCI
@@ -2046,7 +2022,7 @@ config SCx200
config SCx200HR_TIMER
tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
- depends on SCx200 && GENERIC_TIME
+ depends on SCx200
default y
---help---
This driver provides a clocksource built upon the on-chip
@@ -2058,13 +2034,29 @@ config SCx200HR_TIMER
config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
+ select OLPC_OPENFIRMWARE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
+config OLPC_XO1
+ tristate "OLPC XO-1 support"
+ depends on OLPC && PCI
+ ---help---
+ Add support for non-essential features of the OLPC XO-1 laptop.
+
+config OLPC_OPENFIRMWARE
+ bool "Support for OLPC's Open Firmware"
+ depends on !X86_64 && !X86_PAE
+ default n
+ help
+ This option adds support for the implementation of Open Firmware
+ that is used on the OLPC XO-1 Children's Machine.
+ If unsure, say N here.
+
endif # X86_32
-config K8_NB
+config AMD_NB
def_bool y
depends on CPU_SUP_AMD && PCI
@@ -2113,6 +2105,10 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
+config HAVE_TEXT_POKE_SMP
+ bool
+ select STOP_MACHINE if SMP
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63e..b59ee765414e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally N here,
unless you want to debug such a crash.
+config EARLY_PRINTK_MRST
+ bool "Early printk for MRST platform support"
+ depends on EARLY_PRINTK && X86_MRST
+
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
@@ -121,16 +125,6 @@ config DEBUG_NX_TEST
and the software setup of this feature.
If in doubt, say "N"
-config 4KSTACKS
- bool "Use 4Kb for kernel stacks instead of 8Kb"
- depends on X86_32
- ---help---
- If you say Y here the kernel will use a 4Kb stacksize for the
- kernel stack attached to each process/thread. This facilitates
- running more threads on a system and also reduces the pressure
- on the VM subsystem for higher order allocations. This option
- will also use IRQ stacks to compensate for the reduced stackspace.
-
config DOUBLEFAULT
default y
bool "Enable doublefault exception handler" if EMBEDDED
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8aa1b59b9074..b02e509072a7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -74,7 +74,7 @@ endif
ifdef CONFIG_CC_STACKPROTECTOR
cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
- ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y)
+ ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
stackp-y := -fstack-protector
KBUILD_CFLAGS += $(stackp-y)
else
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 1255d953c65d..f2ee1abb1df9 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -51,7 +51,18 @@ cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
# tracer assumptions. For i686, generic, core2 this is set by the
# compiler anyway
-cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
+ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+# Work around to a bug with asm goto with first implementations of it
+# in gcc causing gcc to mess up the push and pop of the stack in some
+# uses of asm goto.
+ifeq ($(CONFIG_JUMP_LABEL), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args)
# Bug fix for binutils: this option is required in order to keep
# binutils from generating NOPL instructions against our will.
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index ec749c2bfdd7..f7cb086b4add 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
-setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
-setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
-setup-y += version.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
+setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
+setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
+setup-y += video-mode.o version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
# The link order of the video-*.o modules can matter. In particular,
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 98239d2658f2..c7093bd9f2d3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -28,6 +28,7 @@
#include "bitops.h"
#include <asm/cpufeature.h>
#include <asm/processor-flags.h>
+#include "ctype.h"
/* Useful macros */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -37,6 +38,8 @@
extern struct setup_header hdr;
extern struct boot_params boot_params;
+#define cpu_relax() asm volatile("rep; nop")
+
/* Basic port I/O */
static inline void outb(u8 v, u16 port)
{
@@ -198,11 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len)
return diff;
}
-static inline int isdigit(int ch)
-{
- return (ch >= '0') && (ch <= '9');
-}
-
/* Heap -- available for dynamic lists. */
extern char _end[];
extern char *HEAP;
@@ -287,8 +285,18 @@ struct biosregs {
void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
/* cmdline.c */
-int cmdline_find_option(const char *option, char *buffer, int bufsize);
-int cmdline_find_option_bool(const char *option);
+int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+}
+
+static inline int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+}
+
/* cpu.c, cpucheck.c */
struct cpu_features {
@@ -300,6 +308,10 @@ extern struct cpu_features cpu;
int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
int validate_cpu(void);
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
/* edd.c */
void query_edd(void);
@@ -329,8 +341,10 @@ void initregs(struct biosregs *regs);
/* string.c */
int strcmp(const char *str1, const char *str2);
+int strncmp(const char *cs, const char *ct, size_t count);
size_t strnlen(const char *s, size_t maxlen);
unsigned int atou(const char *s);
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
/* tty.c */
void puts(const char *);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index a1d35634bce0..6b3b6f708c04 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,9 +27,8 @@ static inline int myisspace(u8 c)
* Returns the length of the argument (regardless of if it was
* truncated to fit in the buffer), or -1 on not found.
*/
-int cmdline_find_option(const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int len = -1;
@@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
* Returns the position of that option (starts counting with 1)
* or 0 on not found
*/
-int cmdline_find_option_bool(const char *option)
+int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
{
- u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
addr_t cptr;
char c;
int pos = 0, wstart = 0;
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fbb47daf2459..0c229551eead 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000000..cb62f786990d
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,21 @@
+#include "misc.h"
+
+static unsigned long fs;
+static inline void set_fs(unsigned long seg)
+{
+ fs = seg << 4; /* shift it back */
+}
+typedef unsigned long addr_t;
+static inline char rdfs8(addr_t addr)
+{
+ return *((char *)(fs + addr));
+}
+#include "../cmdline.c"
+int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+}
+int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000000..261e81fb9582
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
+#include "misc.h"
+
+int early_serial_base;
+
+#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70ffae2..67a655a39ce4 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -124,6 +124,19 @@ relocated:
rep stosl
/*
+ * Adjust our own GOT
+ */
+ leal _got(%ebx), %edx
+ leal _egot(%ebx), %ecx
+1:
+ cmpl %ecx, %edx
+ jae 2f
+ addl %ebx, (%edx)
+ addl $4, %edx
+ jmp 1b
+2:
+
+/*
* Do the decompression, and jump to the new kernel..
*/
leal z_extract_offset_negative(%ebx), %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc9c06a..52f85a196fa0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -280,6 +280,19 @@ relocated:
rep stosq
/*
+ * Adjust our own GOT
+ */
+ leaq _got(%rip), %rdx
+ leaq _egot(%rip), %rcx
+1:
+ cmpq %rcx, %rdx
+ jae 2f
+ addq %rbx, (%rdx)
+ addq $8, %rdx
+ jmp 1b
+2:
+
+/*
* Do the decompression, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 51e240779a44..23f315c9f215 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -9,23 +9,7 @@
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-/*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
- */
-#undef CONFIG_PARAVIRT
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
-
-#include <linux/linkage.h>
-#include <linux/screen_info.h>
-#include <linux/elf.h>
-#include <linux/io.h>
-#include <asm/page.h>
-#include <asm/boot.h>
-#include <asm/bootparam.h>
+#include "misc.h"
/* WARNING!!
* This code is compiled with -fPIC and it is relocated dynamically
@@ -123,15 +107,13 @@ static void error(char *m);
/*
* This is set up by the setup-routine at boot-time
*/
-static struct boot_params *real_mode; /* Pointer to real-mode data */
+struct boot_params *real_mode; /* Pointer to real-mode data */
static int quiet;
+static int debug;
void *memset(void *s, int c, size_t n);
void *memcpy(void *dest, const void *src, size_t n);
-static void __putstr(int, const char *);
-#define putstr(__x) __putstr(0, __x)
-
#ifdef CONFIG_X86_64
#define memptr long
#else
@@ -170,7 +152,21 @@ static void scroll(void)
vidmem[i] = ' ';
}
-static void __putstr(int error, const char *s)
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+static void serial_putchar(int ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+void __putstr(int error, const char *s)
{
int x, y, pos;
char c;
@@ -179,6 +175,14 @@ static void __putstr(int error, const char *s)
if (!error)
return;
#endif
+ if (early_serial_base) {
+ const char *str = s;
+ while (*str) {
+ if (*str == '\n')
+ serial_putchar('\r');
+ serial_putchar(*str++);
+ }
+ }
if (real_mode->screen_info.orig_video_mode == 0 &&
lines == 0 && cols == 0)
@@ -225,18 +229,35 @@ void *memset(void *s, int c, size_t n)
ss[i] = c;
return s;
}
-
+#ifdef CONFIG_X86_32
void *memcpy(void *dest, const void *src, size_t n)
{
- int i;
- const char *s = src;
- char *d = dest;
+ int d0, d1, d2;
+ asm volatile(
+ "rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
+ : "memory");
- for (i = 0; i < n; i++)
- d[i] = s[i];
return dest;
}
+#else
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep ; movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
+ return dest;
+}
+#endif
static void error(char *x)
{
@@ -305,8 +326,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
{
real_mode = rmode;
- if (real_mode->hdr.loadflags & QUIET_FLAG)
+ if (cmdline_find_option_bool("quiet"))
quiet = 1;
+ if (cmdline_find_option_bool("debug"))
+ debug = 1;
if (real_mode->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
@@ -319,6 +342,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
lines = real_mode->screen_info.orig_video_lines;
cols = real_mode->screen_info.orig_video_cols;
+ console_init();
+ if (debug)
+ putstr("early console in decompress_kernel\n");
+
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000000..3f19c81a6203
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,39 @@
+#ifndef BOOT_COMPRESSED_MISC_H
+#define BOOT_COMPRESSED_MISC_H
+
+/*
+ * we have to be careful, because no indirections are allowed here, and
+ * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening
+ */
+#undef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_32
+#define _ASM_X86_DESC_H 1
+#endif
+
+#include <linux/linkage.h>
+#include <linux/screen_info.h>
+#include <linux/elf.h>
+#include <linux/io.h>
+#include <asm/page.h>
+#include <asm/boot.h>
+#include <asm/bootparam.h>
+
+#define BOOT_BOOT_H
+#include "../ctype.h"
+
+/* misc.c */
+extern struct boot_params *real_mode; /* Pointer to real-mode data */
+void __putstr(int error, const char *s);
+#define putstr(__x) __putstr(0, __x)
+#define puts(__x) __putstr(0, __x)
+
+/* cmdline.c */
+int cmdline_find_option(const char *option, char *buffer, int bufsize);
+int cmdline_find_option_bool(const char *option);
+
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
+#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000000..19b3e693cd72
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,2 @@
+#include "misc.h"
+#include "../string.c"
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 5ddabceee124..34d047c98284 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -41,6 +41,12 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+ .got : {
+ _got = .;
+ KEEP(*(.got.plt))
+ KEEP(*(.got))
+ _egot = .;
+ }
.data : {
_data = . ;
*(.data)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000000..25e13403193c
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
+#ifndef BOOT_ISDIGIT_H
+
+#define BOOT_ISDIGIT_H
+
+static inline int isdigit(int ch)
+{
+ return (ch >= '0') && (ch <= '9');
+}
+
+static inline int isxdigit(int ch)
+{
+ if (isdigit(ch))
+ return true;
+
+ if ((ch >= 'a') && (ch <= 'f'))
+ return true;
+
+ return (ch >= 'A') && (ch <= 'F');
+}
+
+#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000000..5df2869c874b
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,151 @@
+#include "boot.h"
+
+#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+#define DEFAULT_BAUD 9600
+
+static void early_serial_init(int port, int baud)
+{
+ unsigned char c;
+ unsigned divisor;
+
+ outb(0x3, port + LCR); /* 8n1 */
+ outb(0, port + IER); /* no interrupt */
+ outb(0, port + FCR); /* no fifo */
+ outb(0x3, port + MCR); /* DTR + RTS */
+
+ divisor = 115200 / baud;
+ c = inb(port + LCR);
+ outb(c | DLAB, port + LCR);
+ outb(divisor & 0xff, port + DLL);
+ outb((divisor >> 8) & 0xff, port + DLH);
+ outb(c & ~DLAB, port + LCR);
+
+ early_serial_base = port;
+}
+
+static void parse_earlyprintk(void)
+{
+ int baud = DEFAULT_BAUD;
+ char arg[32];
+ int pos = 0;
+ int port = 0;
+
+ if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
+ char *e;
+
+ if (!strncmp(arg, "serial", 6)) {
+ port = DEFAULT_SERIAL_PORT;
+ pos += 6;
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ /*
+ * make sure we have
+ * "serial,0x3f8,115200"
+ * "serial,ttyS0,115200"
+ * "ttyS0,115200"
+ */
+ if (pos == 7 && !strncmp(arg + pos, "0x", 2)) {
+ port = simple_strtoull(arg + pos, &e, 16);
+ if (port == 0 || arg + pos == e)
+ port = DEFAULT_SERIAL_PORT;
+ else
+ pos = e - arg;
+ } else if (!strncmp(arg + pos, "ttyS", 4)) {
+ static const int bases[] = { 0x3f8, 0x2f8 };
+ int idx = 0;
+
+ if (!strncmp(arg + pos, "ttyS", 4))
+ pos += 4;
+
+ if (arg[pos++] == '1')
+ idx = 1;
+
+ port = bases[idx];
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ baud = simple_strtoull(arg + pos, &e, 0);
+ if (baud == 0 || arg + pos == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+#define BASE_BAUD (1843200/16)
+static unsigned int probe_baud(int port)
+{
+ unsigned char lcr, dll, dlh;
+ unsigned int quot;
+
+ lcr = inb(port + LCR);
+ outb(lcr | DLAB, port + LCR);
+ dll = inb(port + DLL);
+ dlh = inb(port + DLH);
+ outb(lcr, port + LCR);
+ quot = (dlh << 8) | dll;
+
+ return BASE_BAUD / quot;
+}
+
+static void parse_console_uart8250(void)
+{
+ char optstr[64], *options;
+ int baud = DEFAULT_BAUD;
+ int port = 0;
+
+ /*
+ * console=uart8250,io,0x3f8,115200n8
+ * need to make sure it is last one console !
+ */
+ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
+ return;
+
+ options = optstr;
+
+ if (!strncmp(options, "uart8250,io,", 12))
+ port = simple_strtoull(options + 12, &options, 0);
+ else if (!strncmp(options, "uart,io,", 8))
+ port = simple_strtoull(options + 8, &options, 0);
+ else
+ return;
+
+ if (options && (options[0] == ','))
+ baud = simple_strtoull(options + 1, &options, 0);
+ else
+ baud = probe_baud(port);
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+void console_init(void)
+{
+ parse_earlyprintk();
+
+ if (!early_serial_base)
+ parse_console_uart8250();
+}
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 140172b895bd..40358c8905be 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -130,6 +130,11 @@ void main(void)
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
+ /* Initialize the early-boot console */
+ console_init();
+ if (cmdline_find_option_bool("debug"))
+ puts("early console in setup code\n");
+
/* End of heap check */
init_heap();
@@ -168,10 +173,6 @@ void main(void)
/* Set the video mode */
set_video();
- /* Parse command line for 'quiet' and pass it to decompressor. */
- if (cmdline_find_option_bool("quiet"))
- boot_params.hdr.loadflags |= QUIET_FLAG;
-
/* Do the last things and invoke protected mode */
go_to_protected_mode();
}
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 50e47cdbdddd..cdac91ca55d3 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -34,7 +34,7 @@ static int skip_atoi(const char **s)
#define SMALL 32 /* Must be 32 == 0x20 */
#define SPECIAL 64 /* 0x */
-#define do_div(n,base) ({ \
+#define __do_div(n, base) ({ \
int __res; \
__res = ((unsigned long) n) % (unsigned) base; \
n = ((unsigned long) n) / (unsigned) base; \
@@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision,
tmp[i++] = '0';
else
while (num != 0)
- tmp[i++] = (digits[do_div(num, base)] | locase);
+ tmp[i++] = (digits[__do_div(num, base)] | locase);
if (i > precision)
precision = i;
size -= precision;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index f94b7a0c2abf..3cbc4058dd26 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2)
return 0;
}
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+ unsigned char c1, c2;
+
+ while (count) {
+ c1 = *cs++;
+ c2 = *ct++;
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (!c1)
+ break;
+ count--;
+ }
+ return 0;
+}
+
size_t strnlen(const char *s, size_t maxlen)
{
const char *es = s;
@@ -48,3 +64,50 @@ unsigned int atou(const char *s)
i = i * 10 + (*s++ - '0');
return i;
}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+static unsigned int simple_guess_base(const char *cp)
+{
+ if (cp[0] == '0') {
+ if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
+ return 16;
+ else
+ return 8;
+ } else {
+ return 10;
+ }
+}
+
+/**
+ * simple_strtoull - convert a string to an unsigned long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
+{
+ unsigned long long result = 0;
+
+ if (!base)
+ base = simple_guess_base(cp);
+
+ if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
+ cp += 2;
+
+ while (isxdigit(*cp)) {
+ unsigned int value;
+
+ value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
+ if (value >= base)
+ break;
+ result = result * base + value;
+ cp++;
+ }
+ if (endp)
+ *endp = (char *)cp;
+
+ return result;
+}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 01ec69c901c7..def2451f46ae 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -10,23 +10,36 @@
* ----------------------------------------------------------------------- */
/*
- * Very simple screen I/O
- * XXX: Probably should add very simple serial I/O?
+ * Very simple screen and serial I/O
*/
#include "boot.h"
+int early_serial_base;
+
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
/*
* These functions are in .inittext so they can be used to signal
* error during initialization.
*/
-void __attribute__((section(".inittext"))) putchar(int ch)
+static void __attribute__((section(".inittext"))) serial_putchar(int ch)
{
- struct biosregs ireg;
+ unsigned timeout = 0xffff;
- if (ch == '\n')
- putchar('\r'); /* \n -> \r\n */
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+static void __attribute__((section(".inittext"))) bios_putchar(int ch)
+{
+ struct biosregs ireg;
initregs(&ireg);
ireg.bx = 0x0007;
@@ -36,6 +49,17 @@ void __attribute__((section(".inittext"))) putchar(int ch)
intcall(0x10, &ireg, NULL);
}
+void __attribute__((section(".inittext"))) putchar(int ch)
+{
+ if (ch == '\n')
+ putchar('\r'); /* \n -> \r\n */
+
+ bios_putchar(ch);
+
+ if (early_serial_base != 0)
+ serial_putchar(ch);
+}
+
void __attribute__((section(".inittext"))) puts(const char *str)
{
while (*str)
@@ -112,3 +136,4 @@ int getchar_timeout(void)
return 0; /* Timeout! */
}
+
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index d28fad19654a..6f9872658dd2 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,524 +1,84 @@
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:21:55 2009
-#
-# CONFIG_64BIT is not set
-CONFIG_X86_32=y
-# CONFIG_X86_64 is not set
-CONFIG_X86=y
-CONFIG_OUTPUT_FORMAT="elf32-i386"
-CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
-CONFIG_GENERIC_TIME=y
-CONFIG_GENERIC_CMOS_UPDATE=y
-CONFIG_CLOCKSOURCE_WATCHDOG=y
-CONFIG_GENERIC_CLOCKEVENTS=y
-CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
-CONFIG_LOCKDEP_SUPPORT=y
-CONFIG_STACKTRACE_SUPPORT=y
-CONFIG_HAVE_LATENCYTOP_SUPPORT=y
-CONFIG_FAST_CMPXCHG_LOCAL=y
-CONFIG_MMU=y
-CONFIG_ZONE_DMA=y
-CONFIG_GENERIC_ISA_DMA=y
-CONFIG_GENERIC_IOMAP=y
-CONFIG_GENERIC_BUG=y
-CONFIG_GENERIC_HWEIGHT=y
-CONFIG_ARCH_MAY_HAVE_PC_FDC=y
-# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
-CONFIG_RWSEM_XCHGADD_ALGORITHM=y
-CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
-CONFIG_GENERIC_CALIBRATE_DELAY=y
-# CONFIG_GENERIC_TIME_VSYSCALL is not set
-CONFIG_ARCH_HAS_CPU_RELAX=y
-CONFIG_ARCH_HAS_DEFAULT_IDLE=y
-CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
-CONFIG_HAVE_SETUP_PER_CPU_AREA=y
-CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
-# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
-CONFIG_ARCH_HIBERNATION_POSSIBLE=y
-CONFIG_ARCH_SUSPEND_POSSIBLE=y
-# CONFIG_ZONE_DMA32 is not set
-CONFIG_ARCH_POPULATES_NODE_MAP=y
-# CONFIG_AUDIT_ARCH is not set
-CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
-CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
-CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_GENERIC_IRQ_PROBE=y
-CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-CONFIG_X86_32_SMP=y
-CONFIG_X86_HT=y
-CONFIG_X86_TRAMPOLINE=y
-CONFIG_X86_32_LAZY_GS=y
-CONFIG_KTIME_SCALAR=y
-CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
-
-#
-# General setup
-#
CONFIG_EXPERIMENTAL=y
-CONFIG_LOCK_KERNEL=y
-CONFIG_INIT_ENV_ARG_LIMIT=32
-CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_HAVE_KERNEL_GZIP=y
-CONFIG_HAVE_KERNEL_BZIP2=y
-CONFIG_HAVE_KERNEL_LZMA=y
-CONFIG_KERNEL_GZIP=y
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
-CONFIG_SWAP=y
CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_AUDIT_TREE=y
-
-#
-# RCU Subsystem
-#
-# CONFIG_CLASSIC_RCU is not set
-CONFIG_TREE_RCU=y
-# CONFIG_PREEMPT_RCU is not set
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_FANOUT=32
-# CONFIG_RCU_FANOUT_EXACT is not set
-# CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
-# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=18
-CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-# CONFIG_RT_GROUP_SCHED is not set
-# CONFIG_USER_SCHED is not set
-CONFIG_CGROUP_SCHED=y
CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
-# CONFIG_CGROUP_DEVICE is not set
CONFIG_CPUSETS=y
-CONFIG_PROC_PID_CPUSET=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_RESOURCE_COUNTERS=y
-# CONFIG_CGROUP_MEM_RES_CTLR is not set
-# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_RELAY=y
-CONFIG_NAMESPACES=y
+CONFIG_CGROUP_SCHED=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
-CONFIG_INITRAMFS_SOURCE=""
-CONFIG_RD_GZIP=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL=y
-CONFIG_ANON_INODES=y
-# CONFIG_EMBEDDED is not set
-CONFIG_UID16=y
-CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
-CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
-# CONFIG_STRIP_ASM_SYMS is not set
-CONFIG_HOTPLUG=y
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_PCSPKR_PLATFORM=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_TIMERFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_AIO=y
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_PCI_QUIRKS=y
-CONFIG_SLUB_DEBUG=y
# CONFIG_COMPAT_BRK is not set
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
-# CONFIG_SLOB is not set
CONFIG_PROFILING=y
-CONFIG_TRACEPOINTS=y
-CONFIG_MARKERS=y
-# CONFIG_OPROFILE is not set
-CONFIG_HAVE_OPROFILE=y
CONFIG_KPROBES=y
-CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
-CONFIG_KRETPROBES=y
-CONFIG_HAVE_IOREMAP_PROT=y
-CONFIG_HAVE_KPROBES=y
-CONFIG_HAVE_KRETPROBES=y
-CONFIG_HAVE_ARCH_TRACEHOOK=y
-CONFIG_HAVE_DMA_API_DEBUG=y
-# CONFIG_SLOW_WORK is not set
-CONFIG_HAVE_GENERIC_DMA_COHERENT=y
-CONFIG_SLABINFO=y
-CONFIG_RT_MUTEXES=y
-CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
-# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_MODVERSIONS is not set
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_STOP_MACHINE=y
-CONFIG_BLOCK=y
-# CONFIG_LBD is not set
-CONFIG_BLK_DEV_BSG=y
-# CONFIG_BLK_DEV_INTEGRITY is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-# CONFIG_DEFAULT_AS is not set
-# CONFIG_DEFAULT_DEADLINE is not set
-CONFIG_DEFAULT_CFQ=y
-# CONFIG_DEFAULT_NOOP is not set
-CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_FREEZER=y
-
-#
-# Processor type and features
-#
-CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
-CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-CONFIG_X86_MPPARSE=y
-# CONFIG_X86_BIGSMP is not set
-CONFIG_X86_EXTENDED_PLATFORM=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_RDC321X is not set
-# CONFIG_X86_32_NON_STANDARD is not set
-CONFIG_SCHED_OMIT_FRAME_POINTER=y
-# CONFIG_PARAVIRT_GUEST is not set
-# CONFIG_MEMTEST is not set
-# CONFIG_M386 is not set
-# CONFIG_M486 is not set
-# CONFIG_M586 is not set
-# CONFIG_M586TSC is not set
-# CONFIG_M586MMX is not set
-CONFIG_M686=y
-# CONFIG_MPENTIUMII is not set
-# CONFIG_MPENTIUMIII is not set
-# CONFIG_MPENTIUMM is not set
-# CONFIG_MPENTIUM4 is not set
-# CONFIG_MK6 is not set
-# CONFIG_MK7 is not set
-# CONFIG_MK8 is not set
-# CONFIG_MCRUSOE is not set
-# CONFIG_MEFFICEON is not set
-# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP3D is not set
-# CONFIG_MGEODEGX1 is not set
-# CONFIG_MGEODE_LX is not set
-# CONFIG_MCYRIXIII is not set
-# CONFIG_MVIAC3_2 is not set
-# CONFIG_MVIAC7 is not set
-# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
-# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
-CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
-CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=5
-CONFIG_X86_XADD=y
-# CONFIG_X86_PPRO_FENCE is not set
-CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_INVLPG=y
-CONFIG_X86_BSWAP=y
-CONFIG_X86_POPAD_OK=y
-CONFIG_X86_INTEL_USERCOPY=y
-CONFIG_X86_USE_PPRO_CHECKSUM=y
-CONFIG_X86_TSC=y
-CONFIG_X86_CMOV=y
-CONFIG_X86_MINIMUM_CPU_FAMILY=4
-CONFIG_X86_DEBUGCTLMSR=y
-CONFIG_CPU_SUP_INTEL=y
-CONFIG_CPU_SUP_CYRIX_32=y
-CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR=y
-CONFIG_CPU_SUP_TRANSMETA_32=y
-CONFIG_CPU_SUP_UMC_32=y
-CONFIG_X86_DS=y
-CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
-CONFIG_HPET_EMULATE_RTC=y
-CONFIG_DMI=y
-# CONFIG_IOMMU_HELPER is not set
-# CONFIG_IOMMU_API is not set
-CONFIG_NR_CPUS=64
CONFIG_SCHED_SMT=y
-CONFIG_SCHED_MC=y
-# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
-# CONFIG_PREEMPT is not set
-CONFIG_X86_LOCAL_APIC=y
-CONFIG_X86_IO_APIC=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCE=y
-CONFIG_X86_MCE_NONFATAL=y
-CONFIG_X86_MCE_P4THERMAL=y
-CONFIG_VM86=y
-# CONFIG_TOSHIBA is not set
-# CONFIG_I8K is not set
CONFIG_X86_REBOOTFIXUPS=y
CONFIG_MICROCODE=y
-CONFIG_MICROCODE_INTEL=y
CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
-# CONFIG_X86_CPU_DEBUG is not set
-# CONFIG_NOHIGHMEM is not set
-CONFIG_HIGHMEM4G=y
-# CONFIG_HIGHMEM64G is not set
-CONFIG_PAGE_OFFSET=0xC0000000
-CONFIG_HIGHMEM=y
-# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
-CONFIG_ARCH_FLATMEM_ENABLE=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_ARCH_SELECT_MEMORY_MODEL=y
-CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_SPARSEMEM_STATIC=y
-CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
-# CONFIG_PHYS_ADDR_T_64BIT is not set
-CONFIG_ZONE_DMA_FLAG=1
-CONFIG_BOUNCE=y
-CONFIG_VIRT_TO_BUS=y
-CONFIG_UNEVICTABLE_LRU=y
-CONFIG_HAVE_MLOCK=y
-CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_HIGHPTE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
-CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
-CONFIG_X86_RESERVE_LOW_64K=y
-# CONFIG_MATH_EMULATION is not set
-CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-CONFIG_X86_PAT=y
CONFIG_EFI=y
-CONFIG_SECCOMP=y
-# CONFIG_CC_STACKPROTECTOR is not set
-# CONFIG_HZ_100 is not set
-# CONFIG_HZ_250 is not set
-# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
-CONFIG_HZ=1000
-CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
-# CONFIG_KEXEC_JUMP is not set
-CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
-CONFIG_X86_NEED_RELOCS=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
-CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
-# CONFIG_CMDLINE_BOOL is not set
-CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
-
-#
-# Power management and ACPI options
-#
CONFIG_PM=y
CONFIG_PM_DEBUG=y
-# CONFIG_PM_VERBOSE is not set
-CONFIG_CAN_PM_TRACE=y
-CONFIG_PM_TRACE=y
CONFIG_PM_TRACE_RTC=y
-CONFIG_PM_SLEEP_SMP=y
-CONFIG_PM_SLEEP=y
-CONFIG_SUSPEND=y
-# CONFIG_PM_TEST_SUSPEND is not set
-CONFIG_SUSPEND_FREEZER=y
CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION=""
-CONFIG_ACPI=y
-CONFIG_ACPI_SLEEP=y
CONFIG_ACPI_PROCFS=y
-CONFIG_ACPI_PROCFS_POWER=y
-CONFIG_ACPI_SYSFS_POWER=y
-CONFIG_ACPI_PROC_EVENT=y
-CONFIG_ACPI_AC=y
-CONFIG_ACPI_BATTERY=y
-CONFIG_ACPI_BUTTON=y
-CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=y
-CONFIG_ACPI_HOTPLUG_CPU=y
-CONFIG_ACPI_THERMAL=y
-# CONFIG_ACPI_CUSTOM_DSDT is not set
-CONFIG_ACPI_BLACKLIST_YEAR=0
-# CONFIG_ACPI_DEBUG is not set
-# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_X86_PM_TIMER=y
-CONFIG_ACPI_CONTAINER=y
-# CONFIG_ACPI_SBS is not set
-# CONFIG_APM is not set
-
-#
-# CPU Frequency scaling
-#
CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_TABLE=y
CONFIG_CPU_FREQ_DEBUG=y
# CONFIG_CPU_FREQ_STAT is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
-# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
-
-#
-# CPUFreq processor drivers
-#
CONFIG_X86_ACPI_CPUFREQ=y
-# CONFIG_X86_POWERNOW_K6 is not set
-# CONFIG_X86_POWERNOW_K7 is not set
-# CONFIG_X86_POWERNOW_K8 is not set
-# CONFIG_X86_GX_SUSPMOD is not set
-# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
-# CONFIG_X86_SPEEDSTEP_ICH is not set
-# CONFIG_X86_SPEEDSTEP_SMI is not set
-# CONFIG_X86_P4_CLOCKMOD is not set
-# CONFIG_X86_CPUFREQ_NFORCE2 is not set
-# CONFIG_X86_LONGRUN is not set
-# CONFIG_X86_LONGHAUL is not set
-# CONFIG_X86_E_POWERSAVER is not set
-
-#
-# shared options
-#
-# CONFIG_X86_SPEEDSTEP_LIB is not set
-CONFIG_CPU_IDLE=y
-CONFIG_CPU_IDLE_GOV_LADDER=y
-CONFIG_CPU_IDLE_GOV_MENU=y
-
-#
-# Bus options (PCI etc.)
-#
-CONFIG_PCI=y
-# CONFIG_PCI_GOBIOS is not set
-# CONFIG_PCI_GOMMCONFIG is not set
-# CONFIG_PCI_GODIRECT is not set
-# CONFIG_PCI_GOOLPC is not set
-CONFIG_PCI_GOANY=y
-CONFIG_PCI_BIOS=y
-CONFIG_PCI_DIRECT=y
-CONFIG_PCI_MMCONFIG=y
-CONFIG_PCI_DOMAINS=y
-# CONFIG_DMAR is not set
CONFIG_PCIEPORTBUS=y
-# CONFIG_HOTPLUG_PCI_PCIE is not set
-CONFIG_PCIEAER=y
-# CONFIG_PCIEASPM is not set
-CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
-# CONFIG_PCI_LEGACY is not set
-# CONFIG_PCI_DEBUG is not set
-# CONFIG_PCI_STUB is not set
-CONFIG_HT_IRQ=y
-# CONFIG_PCI_IOV is not set
-CONFIG_ISA_DMA_API=y
-# CONFIG_ISA is not set
-# CONFIG_MCA is not set
-# CONFIG_SCx200 is not set
-# CONFIG_OLPC is not set
-CONFIG_K8_NB=y
CONFIG_PCCARD=y
-# CONFIG_PCMCIA_DEBUG is not set
-CONFIG_PCMCIA=y
-CONFIG_PCMCIA_LOAD_CIS=y
-CONFIG_PCMCIA_IOCTL=y
-CONFIG_CARDBUS=y
-
-#
-# PC-card bridges
-#
CONFIG_YENTA=y
-CONFIG_YENTA_O2=y
-CONFIG_YENTA_RICOH=y
-CONFIG_YENTA_TI=y
-CONFIG_YENTA_ENE_TUNE=y
-CONFIG_YENTA_TOSHIBA=y
-# CONFIG_PD6729 is not set
-# CONFIG_I82092 is not set
-CONFIG_PCCARD_NONSTATIC=y
CONFIG_HOTPLUG_PCI=y
-# CONFIG_HOTPLUG_PCI_FAKE is not set
-# CONFIG_HOTPLUG_PCI_IBM is not set
-# CONFIG_HOTPLUG_PCI_ACPI is not set
-# CONFIG_HOTPLUG_PCI_CPCI is not set
-# CONFIG_HOTPLUG_PCI_SHPC is not set
-
-#
-# Executable file formats / Emulations
-#
-CONFIG_BINFMT_ELF=y
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
-CONFIG_HAVE_AOUT=y
-# CONFIG_BINFMT_AOUT is not set
CONFIG_BINFMT_MISC=y
-CONFIG_HAVE_ATOMIC_IOMAP=y
CONFIG_NET=y
-
-#
-# Networking options
-#
CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
-CONFIG_XFRM=y
CONFIG_XFRM_USER=y
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_XFRM_STATISTICS is not set
-# CONFIG_NET_KEY is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_ASK_IP_FIB_HASH=y
-# CONFIG_IP_FIB_TRIE is not set
-CONFIG_IP_FIB_HASH=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
@@ -526,118 +86,46 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
-# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-CONFIG_INET_TUNNEL=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-CONFIG_INET_LRO=y
# CONFIG_INET_DIAG is not set
CONFIG_TCP_CONG_ADVANCED=y
# CONFIG_TCP_CONG_BIC is not set
-CONFIG_TCP_CONG_CUBIC=y
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
-# CONFIG_TCP_CONG_HSTCP is not set
-# CONFIG_TCP_CONG_HYBLA is not set
-# CONFIG_TCP_CONG_VEGAS is not set
-# CONFIG_TCP_CONG_SCALABLE is not set
-# CONFIG_TCP_CONG_LP is not set
-# CONFIG_TCP_CONG_VENO is not set
-# CONFIG_TCP_CONG_YEAH is not set
-# CONFIG_TCP_CONG_ILLINOIS is not set
-# CONFIG_DEFAULT_BIC is not set
-CONFIG_DEFAULT_CUBIC=y
-# CONFIG_DEFAULT_HTCP is not set
-# CONFIG_DEFAULT_VEGAS is not set
-# CONFIG_DEFAULT_WESTWOOD is not set
-# CONFIG_DEFAULT_RENO is not set
-CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
CONFIG_IPV6=y
-# CONFIG_IPV6_PRIVACY is not set
-# CONFIG_IPV6_ROUTER_PREF is not set
-# CONFIG_IPV6_OPTIMISTIC_DAD is not set
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
-# CONFIG_INET6_IPCOMP is not set
-# CONFIG_IPV6_MIP6 is not set
-# CONFIG_INET6_XFRM_TUNNEL is not set
-# CONFIG_INET6_TUNNEL is not set
-CONFIG_INET6_XFRM_MODE_TRANSPORT=y
-CONFIG_INET6_XFRM_MODE_TUNNEL=y
-CONFIG_INET6_XFRM_MODE_BEET=y
-# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
-CONFIG_IPV6_SIT=y
-CONFIG_IPV6_NDISC_NODETYPE=y
-# CONFIG_IPV6_TUNNEL is not set
-# CONFIG_IPV6_MULTIPLE_TABLES is not set
-# CONFIG_IPV6_MROUTE is not set
CONFIG_NETLABEL=y
-CONFIG_NETWORK_SECMARK=y
CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_DEBUG is not set
# CONFIG_NETFILTER_ADVANCED is not set
-
-#
-# Core Netfilter Configuration
-#
-CONFIG_NETFILTER_NETLINK=y
-CONFIG_NETFILTER_NETLINK_LOG=y
CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_FTP=y
CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
-CONFIG_NETFILTER_XTABLES=y
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
-# CONFIG_IP_VS is not set
-
-#
-# IP: Netfilter Configuration
-#
-CONFIG_NF_DEFRAG_IPV4=y
CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_FILTER=y
CONFIG_IP_NF_TARGET_REJECT=y
CONFIG_IP_NF_TARGET_LOG=y
CONFIG_IP_NF_TARGET_ULOG=y
CONFIG_NF_NAT=y
-CONFIG_NF_NAT_NEEDED=y
CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_NF_NAT_FTP=y
-CONFIG_NF_NAT_IRC=y
-# CONFIG_NF_NAT_TFTP is not set
-# CONFIG_NF_NAT_AMANDA is not set
-# CONFIG_NF_NAT_PPTP is not set
-# CONFIG_NF_NAT_H323 is not set
-CONFIG_NF_NAT_SIP=y
CONFIG_IP_NF_MANGLE=y
-
-#
-# IPv6: Netfilter Configuration
-#
CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
@@ -645,1228 +133,115 @@ CONFIG_IP6_NF_TARGET_LOG=y
CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-CONFIG_LLC=y
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
-
-#
-# Queueing/Scheduling
-#
-# CONFIG_NET_SCH_CBQ is not set
-# CONFIG_NET_SCH_HTB is not set
-# CONFIG_NET_SCH_HFSC is not set
-# CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_MULTIQ is not set
-# CONFIG_NET_SCH_RED is not set
-# CONFIG_NET_SCH_SFQ is not set
-# CONFIG_NET_SCH_TEQL is not set
-# CONFIG_NET_SCH_TBF is not set
-# CONFIG_NET_SCH_GRED is not set
-# CONFIG_NET_SCH_DSMARK is not set
-# CONFIG_NET_SCH_NETEM is not set
-# CONFIG_NET_SCH_DRR is not set
-# CONFIG_NET_SCH_INGRESS is not set
-
-#
-# Classification
-#
-CONFIG_NET_CLS=y
-# CONFIG_NET_CLS_BASIC is not set
-# CONFIG_NET_CLS_TCINDEX is not set
-# CONFIG_NET_CLS_ROUTE4 is not set
-# CONFIG_NET_CLS_FW is not set
-# CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
-# CONFIG_NET_CLS_FLOW is not set
-# CONFIG_NET_CLS_CGROUP is not set
CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_STACK=32
-# CONFIG_NET_EMATCH_CMP is not set
-# CONFIG_NET_EMATCH_NBYTE is not set
-# CONFIG_NET_EMATCH_U32 is not set
-# CONFIG_NET_EMATCH_META is not set
-# CONFIG_NET_EMATCH_TEXT is not set
CONFIG_NET_CLS_ACT=y
-# CONFIG_NET_ACT_POLICE is not set
-# CONFIG_NET_ACT_GACT is not set
-# CONFIG_NET_ACT_MIRRED is not set
-# CONFIG_NET_ACT_IPT is not set
-# CONFIG_NET_ACT_NAT is not set
-# CONFIG_NET_ACT_PEDIT is not set
-# CONFIG_NET_ACT_SIMP is not set
-# CONFIG_NET_ACT_SKBEDIT is not set
-CONFIG_NET_SCH_FIFO=y
-# CONFIG_DCB is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_NET_TCPPROBE is not set
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
-
-#
-# Packet Radio protocols
-#
-# CONFIG_AX25 is not set
-# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-CONFIG_FIB_RULES=y
-CONFIG_WIRELESS=y
CONFIG_CFG80211=y
-# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_WIRELESS_OLD_REGULATORY=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_WIRELESS_EXT_SYSFS=y
-# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
-
-#
-# Rate control algorithm selection
-#
-CONFIG_MAC80211_RC_MINSTREL=y
-# CONFIG_MAC80211_RC_DEFAULT_PID is not set
-CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
-CONFIG_MAC80211_RC_DEFAULT="minstrel"
-# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
-# CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_WIMAX is not set
CONFIG_RFKILL=y
-# CONFIG_RFKILL_INPUT is not set
-CONFIG_RFKILL_LEDS=y
-# CONFIG_NET_9P is not set
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-CONFIG_FW_LOADER=y
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE=""
-# CONFIG_DEBUG_DRIVER is not set
CONFIG_DEBUG_DEVRES=y
-# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
-CONFIG_PROC_EVENTS=y
-# CONFIG_MTD is not set
-# CONFIG_PARPORT is not set
-CONFIG_PNP=y
-CONFIG_PNP_DEBUG_MESSAGES=y
-
-#
-# Protocols
-#
-CONFIG_PNPACPI=y
-CONFIG_BLK_DEV=y
-# CONFIG_BLK_DEV_FD is not set
-# CONFIG_BLK_CPQ_DA is not set
-# CONFIG_BLK_CPQ_CISS_DA is not set
-# CONFIG_BLK_DEV_DAC960 is not set
-# CONFIG_BLK_DEV_UMEM is not set
-# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
-# CONFIG_BLK_DEV_CRYPTOLOOP is not set
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_SX8 is not set
-# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
-# CONFIG_BLK_DEV_XIP is not set
-# CONFIG_CDROM_PKTCDVD is not set
-# CONFIG_ATA_OVER_ETH is not set
-# CONFIG_BLK_DEV_HD is not set
-CONFIG_MISC_DEVICES=y
-# CONFIG_IBM_ASM is not set
-# CONFIG_PHANTOM is not set
-# CONFIG_SGI_IOC4 is not set
-# CONFIG_TIFM_CORE is not set
-# CONFIG_ICS932S401 is not set
-# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_HP_ILO is not set
-# CONFIG_ISL29003 is not set
-# CONFIG_C2PORT is not set
-
-#
-# EEPROM support
-#
-# CONFIG_EEPROM_AT24 is not set
-# CONFIG_EEPROM_LEGACY is not set
-# CONFIG_EEPROM_93CX6 is not set
-CONFIG_HAVE_IDE=y
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-# CONFIG_RAID_ATTRS is not set
-CONFIG_SCSI=y
-CONFIG_SCSI_DMA=y
-# CONFIG_SCSI_TGT is not set
-# CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
-
-#
-# SCSI support type (disk, tape, CD-ROM)
-#
CONFIG_BLK_DEV_SD=y
-# CONFIG_CHR_DEV_ST is not set
-# CONFIG_CHR_DEV_OSST is not set
CONFIG_BLK_DEV_SR=y
CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=y
-# CONFIG_CHR_DEV_SCH is not set
-
-#
-# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
-#
-# CONFIG_SCSI_MULTI_LUN is not set
CONFIG_SCSI_CONSTANTS=y
-# CONFIG_SCSI_LOGGING is not set
-# CONFIG_SCSI_SCAN_ASYNC is not set
-CONFIG_SCSI_WAIT_SCAN=m
-
-#
-# SCSI Transports
-#
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
-# CONFIG_SCSI_SAS_ATTRS is not set
-# CONFIG_SCSI_SAS_LIBSAS is not set
-# CONFIG_SCSI_SRP_ATTRS is not set
# CONFIG_SCSI_LOWLEVEL is not set
-# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
-# CONFIG_SCSI_DH is not set
-# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
-# CONFIG_ATA_NONSTANDARD is not set
-CONFIG_ATA_ACPI=y
-CONFIG_SATA_PMP=y
CONFIG_SATA_AHCI=y
-# CONFIG_SATA_SIL24 is not set
-CONFIG_ATA_SFF=y
-# CONFIG_SATA_SVW is not set
CONFIG_ATA_PIIX=y
-# CONFIG_SATA_MV is not set
-# CONFIG_SATA_NV is not set
-# CONFIG_PDC_ADMA is not set
-# CONFIG_SATA_QSTOR is not set
-# CONFIG_SATA_PROMISE is not set
-# CONFIG_SATA_SX4 is not set
-# CONFIG_SATA_SIL is not set
-# CONFIG_SATA_SIS is not set
-# CONFIG_SATA_ULI is not set
-# CONFIG_SATA_VIA is not set
-# CONFIG_SATA_VITESSE is not set
-# CONFIG_SATA_INIC162X is not set
-# CONFIG_PATA_ACPI is not set
-# CONFIG_PATA_ALI is not set
CONFIG_PATA_AMD=y
-# CONFIG_PATA_ARTOP is not set
-# CONFIG_PATA_ATIIXP is not set
-# CONFIG_PATA_CMD640_PCI is not set
-# CONFIG_PATA_CMD64X is not set
-# CONFIG_PATA_CS5520 is not set
-# CONFIG_PATA_CS5530 is not set
-# CONFIG_PATA_CS5535 is not set
-# CONFIG_PATA_CS5536 is not set
-# CONFIG_PATA_CYPRESS is not set
-# CONFIG_PATA_EFAR is not set
-CONFIG_ATA_GENERIC=y
-# CONFIG_PATA_HPT366 is not set
-# CONFIG_PATA_HPT37X is not set
-# CONFIG_PATA_HPT3X2N is not set
-# CONFIG_PATA_HPT3X3 is not set
-# CONFIG_PATA_IT821X is not set
-# CONFIG_PATA_IT8213 is not set
-# CONFIG_PATA_JMICRON is not set
-# CONFIG_PATA_TRIFLEX is not set
-# CONFIG_PATA_MARVELL is not set
-CONFIG_PATA_MPIIX=y
CONFIG_PATA_OLDPIIX=y
-# CONFIG_PATA_NETCELL is not set
-# CONFIG_PATA_NINJA32 is not set
-# CONFIG_PATA_NS87410 is not set
-# CONFIG_PATA_NS87415 is not set
-# CONFIG_PATA_OPTI is not set
-# CONFIG_PATA_OPTIDMA is not set
-# CONFIG_PATA_PCMCIA is not set
-# CONFIG_PATA_PDC_OLD is not set
-# CONFIG_PATA_RADISYS is not set
-# CONFIG_PATA_RZ1000 is not set
-# CONFIG_PATA_SC1200 is not set
-# CONFIG_PATA_SERVERWORKS is not set
-# CONFIG_PATA_PDC2027X is not set
-# CONFIG_PATA_SIL680 is not set
-# CONFIG_PATA_SIS is not set
-# CONFIG_PATA_VIA is not set
-# CONFIG_PATA_WINBOND is not set
CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
-CONFIG_MD_AUTODETECT=y
-# CONFIG_MD_LINEAR is not set
-# CONFIG_MD_RAID0 is not set
-# CONFIG_MD_RAID1 is not set
-# CONFIG_MD_RAID10 is not set
-# CONFIG_MD_RAID456 is not set
-# CONFIG_MD_MULTIPATH is not set
-# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=y
-# CONFIG_DM_DEBUG is not set
-# CONFIG_DM_CRYPT is not set
-# CONFIG_DM_SNAPSHOT is not set
CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
-# CONFIG_DM_MULTIPATH is not set
-# CONFIG_DM_DELAY is not set
-# CONFIG_DM_UEVENT is not set
-# CONFIG_FUSION is not set
-
-#
-# IEEE 1394 (FireWire) support
-#
-
-#
-# Enable only one of the two stacks, unless you know what you are doing
-#
-# CONFIG_FIREWIRE is not set
-# CONFIG_IEEE1394 is not set
-# CONFIG_I2O is not set
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
-CONFIG_COMPAT_NET_DEV_OPS=y
-# CONFIG_IFB is not set
-# CONFIG_DUMMY is not set
-# CONFIG_BONDING is not set
-# CONFIG_MACVLAN is not set
-# CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
-# CONFIG_VETH is not set
-# CONFIG_NET_SB1000 is not set
-# CONFIG_ARCNET is not set
-CONFIG_PHYLIB=y
-
-#
-# MII PHY device drivers
-#
-# CONFIG_MARVELL_PHY is not set
-# CONFIG_DAVICOM_PHY is not set
-# CONFIG_QSEMI_PHY is not set
-# CONFIG_LXT_PHY is not set
-# CONFIG_CICADA_PHY is not set
-# CONFIG_VITESSE_PHY is not set
-# CONFIG_SMSC_PHY is not set
-# CONFIG_BROADCOM_PHY is not set
-# CONFIG_ICPLUS_PHY is not set
-# CONFIG_REALTEK_PHY is not set
-# CONFIG_NATIONAL_PHY is not set
-# CONFIG_STE10XP is not set
-# CONFIG_LSI_ET1011C_PHY is not set
-# CONFIG_FIXED_PHY is not set
-# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
-CONFIG_MII=y
-# CONFIG_HAPPYMEAL is not set
-# CONFIG_SUNGEM is not set
-# CONFIG_CASSINI is not set
CONFIG_NET_VENDOR_3COM=y
-# CONFIG_VORTEX is not set
-# CONFIG_TYPHOON is not set
-# CONFIG_ETHOC is not set
-# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
-# CONFIG_DE2104X is not set
-# CONFIG_TULIP is not set
-# CONFIG_DE4X5 is not set
-# CONFIG_WINBOND_840 is not set
-# CONFIG_DM9102 is not set
-# CONFIG_ULI526X is not set
-# CONFIG_PCMCIA_XIRCOM is not set
-# CONFIG_HP100 is not set
-# CONFIG_IBM_NEW_EMAC_ZMII is not set
-# CONFIG_IBM_NEW_EMAC_RGMII is not set
-# CONFIG_IBM_NEW_EMAC_TAH is not set
-# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
-# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
-# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
-# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
-# CONFIG_PCNET32 is not set
-# CONFIG_AMD8111_ETH is not set
-# CONFIG_ADAPTEC_STARFIRE is not set
-# CONFIG_B44 is not set
CONFIG_FORCEDETH=y
-# CONFIG_FORCEDETH_NAPI is not set
CONFIG_E100=y
-# CONFIG_FEALNX is not set
-# CONFIG_NATSEMI is not set
CONFIG_NE2K_PCI=y
-# CONFIG_8139CP is not set
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
-# CONFIG_8139TOO_TUNE_TWISTER is not set
-# CONFIG_8139TOO_8129 is not set
-# CONFIG_8139_OLD_RX_RESET is not set
-# CONFIG_R6040 is not set
-# CONFIG_SIS900 is not set
-# CONFIG_EPIC100 is not set
-# CONFIG_SMSC9420 is not set
-# CONFIG_SUNDANCE is not set
-# CONFIG_TLAN is not set
-# CONFIG_VIA_RHINE is not set
-# CONFIG_SC92031 is not set
-# CONFIG_ATL2 is not set
-CONFIG_NETDEV_1000=y
-# CONFIG_ACENIC is not set
-# CONFIG_DL2K is not set
CONFIG_E1000=y
CONFIG_E1000E=y
-# CONFIG_IP1000 is not set
-# CONFIG_IGB is not set
-# CONFIG_IGBVF is not set
-# CONFIG_NS83820 is not set
-# CONFIG_HAMACHI is not set
-# CONFIG_YELLOWFIN is not set
CONFIG_R8169=y
-# CONFIG_SIS190 is not set
-# CONFIG_SKGE is not set
CONFIG_SKY2=y
-# CONFIG_SKY2_DEBUG is not set
-# CONFIG_VIA_VELOCITY is not set
CONFIG_TIGON3=y
CONFIG_BNX2=y
-# CONFIG_QLA3XXX is not set
-# CONFIG_ATL1 is not set
-# CONFIG_ATL1E is not set
-# CONFIG_ATL1C is not set
-# CONFIG_JME is not set
-CONFIG_NETDEV_10000=y
-# CONFIG_CHELSIO_T1 is not set
-CONFIG_CHELSIO_T3_DEPENDS=y
-# CONFIG_CHELSIO_T3 is not set
-# CONFIG_ENIC is not set
-# CONFIG_IXGBE is not set
-# CONFIG_IXGB is not set
-# CONFIG_S2IO is not set
-# CONFIG_VXGE is not set
-# CONFIG_MYRI10GE is not set
-# CONFIG_NETXEN_NIC is not set
-# CONFIG_NIU is not set
-# CONFIG_MLX4_EN is not set
-# CONFIG_MLX4_CORE is not set
-# CONFIG_TEHUTI is not set
-# CONFIG_BNX2X is not set
-# CONFIG_QLGE is not set
-# CONFIG_SFC is not set
-# CONFIG_BE2NET is not set
CONFIG_TR=y
-# CONFIG_IBMOL is not set
-# CONFIG_IBMLS is not set
-# CONFIG_3C359 is not set
-# CONFIG_TMS380TR is not set
-
-#
-# Wireless LAN
-#
-# CONFIG_WLAN_PRE80211 is not set
-CONFIG_WLAN_80211=y
-# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_LIBERTAS is not set
-# CONFIG_LIBERTAS_THINFIRM is not set
-# CONFIG_AIRO is not set
-# CONFIG_ATMEL is not set
-# CONFIG_AT76C50X_USB is not set
-# CONFIG_AIRO_CS is not set
-# CONFIG_PCMCIA_WL3501 is not set
-# CONFIG_PRISM54 is not set
-# CONFIG_USB_ZD1201 is not set
-# CONFIG_USB_NET_RNDIS_WLAN is not set
-# CONFIG_RTL8180 is not set
-# CONFIG_RTL8187 is not set
-# CONFIG_ADM8211 is not set
-# CONFIG_MAC80211_HWSIM is not set
-# CONFIG_MWL8K is not set
-# CONFIG_P54_COMMON is not set
-CONFIG_ATH5K=y
-# CONFIG_ATH5K_DEBUG is not set
-# CONFIG_ATH9K is not set
-# CONFIG_AR9170_USB is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
-# CONFIG_IWLWIFI is not set
-# CONFIG_HOSTAP is not set
-# CONFIG_B43 is not set
-# CONFIG_B43LEGACY is not set
-# CONFIG_ZD1211RW is not set
-# CONFIG_RT2X00 is not set
-# CONFIG_HERMES is not set
-
-#
-# Enable WiMAX (Networking options) to see the WiMAX drivers
-#
-
-#
-# USB Network Adapters
-#
-# CONFIG_USB_CATC is not set
-# CONFIG_USB_KAWETH is not set
-# CONFIG_USB_PEGASUS is not set
-# CONFIG_USB_RTL8150 is not set
-# CONFIG_USB_USBNET is not set
-# CONFIG_USB_HSO is not set
CONFIG_NET_PCMCIA=y
-# CONFIG_PCMCIA_3C589 is not set
-# CONFIG_PCMCIA_3C574 is not set
-# CONFIG_PCMCIA_FMVJ18X is not set
-# CONFIG_PCMCIA_PCNET is not set
-# CONFIG_PCMCIA_NMCLAN is not set
-# CONFIG_PCMCIA_SMC91C92 is not set
-# CONFIG_PCMCIA_XIRC2PS is not set
-# CONFIG_PCMCIA_AXNET is not set
-# CONFIG_PCMCIA_IBMTR is not set
-# CONFIG_WAN is not set
CONFIG_FDDI=y
-# CONFIG_DEFXX is not set
-# CONFIG_SKFP is not set
-# CONFIG_HIPPI is not set
-# CONFIG_PPP is not set
-# CONFIG_SLIP is not set
-# CONFIG_NET_FC is not set
CONFIG_NETCONSOLE=y
-# CONFIG_NETCONSOLE_DYNAMIC is not set
-CONFIG_NETPOLL=y
-# CONFIG_NETPOLL_TRAP is not set
-CONFIG_NET_POLL_CONTROLLER=y
-# CONFIG_ISDN is not set
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-CONFIG_INPUT_FF_MEMLESS=y
CONFIG_INPUT_POLLDEV=y
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
-CONFIG_INPUT_KEYBOARD=y
-CONFIG_KEYBOARD_ATKBD=y
-# CONFIG_KEYBOARD_SUNKBD is not set
-# CONFIG_KEYBOARD_LKKBD is not set
-# CONFIG_KEYBOARD_XTKBD is not set
-# CONFIG_KEYBOARD_NEWTON is not set
-# CONFIG_KEYBOARD_STOWAWAY is not set
-CONFIG_INPUT_MOUSE=y
-CONFIG_MOUSE_PS2=y
-CONFIG_MOUSE_PS2_ALPS=y
-CONFIG_MOUSE_PS2_LOGIPS2PP=y
-CONFIG_MOUSE_PS2_SYNAPTICS=y
-CONFIG_MOUSE_PS2_LIFEBOOK=y
-CONFIG_MOUSE_PS2_TRACKPOINT=y
-# CONFIG_MOUSE_PS2_ELANTECH is not set
-# CONFIG_MOUSE_PS2_TOUCHKIT is not set
-# CONFIG_MOUSE_SERIAL is not set
-# CONFIG_MOUSE_APPLETOUCH is not set
-# CONFIG_MOUSE_BCM5974 is not set
-# CONFIG_MOUSE_VSXXXAA is not set
CONFIG_INPUT_JOYSTICK=y
-# CONFIG_JOYSTICK_ANALOG is not set
-# CONFIG_JOYSTICK_A3D is not set
-# CONFIG_JOYSTICK_ADI is not set
-# CONFIG_JOYSTICK_COBRA is not set
-# CONFIG_JOYSTICK_GF2K is not set
-# CONFIG_JOYSTICK_GRIP is not set
-# CONFIG_JOYSTICK_GRIP_MP is not set
-# CONFIG_JOYSTICK_GUILLEMOT is not set
-# CONFIG_JOYSTICK_INTERACT is not set
-# CONFIG_JOYSTICK_SIDEWINDER is not set
-# CONFIG_JOYSTICK_TMDC is not set
-# CONFIG_JOYSTICK_IFORCE is not set
-# CONFIG_JOYSTICK_WARRIOR is not set
-# CONFIG_JOYSTICK_MAGELLAN is not set
-# CONFIG_JOYSTICK_SPACEORB is not set
-# CONFIG_JOYSTICK_SPACEBALL is not set
-# CONFIG_JOYSTICK_STINGER is not set
-# CONFIG_JOYSTICK_TWIDJOY is not set
-# CONFIG_JOYSTICK_ZHENHUA is not set
-# CONFIG_JOYSTICK_JOYDUMP is not set
-# CONFIG_JOYSTICK_XPAD is not set
CONFIG_INPUT_TABLET=y
-# CONFIG_TABLET_USB_ACECAD is not set
-# CONFIG_TABLET_USB_AIPTEK is not set
-# CONFIG_TABLET_USB_GTCO is not set
-# CONFIG_TABLET_USB_KBTAB is not set
-# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
-# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
-# CONFIG_TOUCHSCREEN_AD7879 is not set
-# CONFIG_TOUCHSCREEN_FUJITSU is not set
-# CONFIG_TOUCHSCREEN_GUNZE is not set
-# CONFIG_TOUCHSCREEN_ELO is not set
-# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
-# CONFIG_TOUCHSCREEN_MTOUCH is not set
-# CONFIG_TOUCHSCREEN_INEXIO is not set
-# CONFIG_TOUCHSCREEN_MK712 is not set
-# CONFIG_TOUCHSCREEN_PENMOUNT is not set
-# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
-# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
-# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
-# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
-# CONFIG_INPUT_PCSPKR is not set
-# CONFIG_INPUT_APANEL is not set
-# CONFIG_INPUT_WISTRON_BTNS is not set
-# CONFIG_INPUT_ATLAS_BTNS is not set
-# CONFIG_INPUT_ATI_REMOTE is not set
-# CONFIG_INPUT_ATI_REMOTE2 is not set
-# CONFIG_INPUT_KEYSPAN_REMOTE is not set
-# CONFIG_INPUT_POWERMATE is not set
-# CONFIG_INPUT_YEALINK is not set
-# CONFIG_INPUT_CM109 is not set
-# CONFIG_INPUT_UINPUT is not set
-
-#
-# Hardware I/O ports
-#
-CONFIG_SERIO=y
-CONFIG_SERIO_I8042=y
-CONFIG_SERIO_SERPORT=y
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-CONFIG_SERIO_LIBPS2=y
-# CONFIG_SERIO_RAW is not set
-# CONFIG_GAMEPORT is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_CONSOLE_TRANSLATIONS=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
CONFIG_VT_HW_CONSOLE_BINDING=y
-CONFIG_DEVKMEM=y
CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_COMPUTONE is not set
-# CONFIG_ROCKETPORT is not set
-# CONFIG_CYCLADES is not set
-# CONFIG_DIGIEPCA is not set
-# CONFIG_MOXA_INTELLIO is not set
-# CONFIG_MOXA_SMARTIO is not set
-# CONFIG_ISI is not set
-# CONFIG_SYNCLINK is not set
-# CONFIG_SYNCLINKMP is not set
-# CONFIG_SYNCLINK_GT is not set
-# CONFIG_N_HDLC is not set
-# CONFIG_RISCOM8 is not set
-# CONFIG_SPECIALIX is not set
-# CONFIG_SX is not set
-# CONFIG_RIO is not set
-# CONFIG_STALDRV is not set
-# CONFIG_NOZOMI is not set
-
-#
-# Serial drivers
-#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_FIX_EARLYCON_MEM=y
-CONFIG_SERIAL_8250_PCI=y
-CONFIG_SERIAL_8250_PNP=y
-# CONFIG_SERIAL_8250_CS is not set
CONFIG_SERIAL_8250_NR_UARTS=32
-CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
-
-#
-# Non-8250 serial port support
-#
-CONFIG_SERIAL_CORE=y
-CONFIG_SERIAL_CORE_CONSOLE=y
-# CONFIG_SERIAL_JSM is not set
-CONFIG_UNIX98_PTYS=y
-# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
-# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_TIMERIOMEM is not set
-CONFIG_HW_RANDOM_INTEL=y
-CONFIG_HW_RANDOM_AMD=y
-CONFIG_HW_RANDOM_GEODE=y
-CONFIG_HW_RANDOM_VIA=y
CONFIG_NVRAM=y
-# CONFIG_R3964 is not set
-# CONFIG_APPLICOM is not set
-# CONFIG_SONYPI is not set
-
-#
-# PCMCIA character devices
-#
-# CONFIG_SYNCLINK_CS is not set
-# CONFIG_CARDMAN_4000 is not set
-# CONFIG_CARDMAN_4040 is not set
-# CONFIG_IPWIRELESS is not set
-# CONFIG_MWAVE is not set
-# CONFIG_PC8736x_GPIO is not set
-# CONFIG_NSC_GPIO is not set
-# CONFIG_CS5535_GPIO is not set
-# CONFIG_RAW_DRIVER is not set
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
-# CONFIG_HANGCHECK_TIMER is not set
-# CONFIG_TCG_TPM is not set
-# CONFIG_TELCLOCK is not set
-CONFIG_DEVPORT=y
-CONFIG_I2C=y
-CONFIG_I2C_BOARDINFO=y
-# CONFIG_I2C_CHARDEV is not set
-CONFIG_I2C_HELPER_AUTO=y
-CONFIG_I2C_ALGOBIT=y
-
-#
-# I2C Hardware Bus support
-#
-
-#
-# PC SMBus host controller drivers
-#
-# CONFIG_I2C_ALI1535 is not set
-# CONFIG_I2C_ALI1563 is not set
-# CONFIG_I2C_ALI15X3 is not set
-# CONFIG_I2C_AMD756 is not set
-# CONFIG_I2C_AMD8111 is not set
CONFIG_I2C_I801=y
-# CONFIG_I2C_ISCH is not set
-# CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_SIS5595 is not set
-# CONFIG_I2C_SIS630 is not set
-# CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_VIA is not set
-# CONFIG_I2C_VIAPRO is not set
-
-#
-# I2C system bus drivers (mostly embedded / system-on-chip)
-#
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_SIMTEC is not set
-
-#
-# External I2C/SMBus adapter drivers
-#
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_TINY_USB is not set
-
-#
-# Graphics adapter I2C/DDC channel drivers
-#
-# CONFIG_I2C_VOODOO3 is not set
-
-#
-# Other I2C/SMBus bus drivers
-#
-# CONFIG_I2C_PCA_PLATFORM is not set
-# CONFIG_I2C_STUB is not set
-# CONFIG_SCx200_ACB is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_DS1682 is not set
-# CONFIG_SENSORS_PCF8574 is not set
-# CONFIG_PCF8575 is not set
-# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_MAX6875 is not set
-# CONFIG_SENSORS_TSL2550 is not set
-# CONFIG_I2C_DEBUG_CORE is not set
-# CONFIG_I2C_DEBUG_ALGO is not set
-# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
-# CONFIG_SPI is not set
-CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
-# CONFIG_GPIOLIB is not set
-# CONFIG_W1 is not set
-CONFIG_POWER_SUPPLY=y
-# CONFIG_POWER_SUPPLY_DEBUG is not set
-# CONFIG_PDA_POWER is not set
-# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_BATTERY_BQ27x00 is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_ABITUGURU3 is not set
-# CONFIG_SENSORS_AD7414 is not set
-# CONFIG_SENSORS_AD7418 is not set
-# CONFIG_SENSORS_ADM1021 is not set
-# CONFIG_SENSORS_ADM1025 is not set
-# CONFIG_SENSORS_ADM1026 is not set
-# CONFIG_SENSORS_ADM1029 is not set
-# CONFIG_SENSORS_ADM1031 is not set
-# CONFIG_SENSORS_ADM9240 is not set
-# CONFIG_SENSORS_ADT7462 is not set
-# CONFIG_SENSORS_ADT7470 is not set
-# CONFIG_SENSORS_ADT7473 is not set
-# CONFIG_SENSORS_ADT7475 is not set
-# CONFIG_SENSORS_K8TEMP is not set
-# CONFIG_SENSORS_ASB100 is not set
-# CONFIG_SENSORS_ATK0110 is not set
-# CONFIG_SENSORS_ATXP1 is not set
-# CONFIG_SENSORS_DS1621 is not set
-# CONFIG_SENSORS_I5K_AMB is not set
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_F71882FG is not set
-# CONFIG_SENSORS_F75375S is not set
-# CONFIG_SENSORS_FSCHER is not set
-# CONFIG_SENSORS_FSCPOS is not set
-# CONFIG_SENSORS_FSCHMD is not set
-# CONFIG_SENSORS_G760A is not set
-# CONFIG_SENSORS_GL518SM is not set
-# CONFIG_SENSORS_GL520SM is not set
-# CONFIG_SENSORS_CORETEMP is not set
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_LM63 is not set
-# CONFIG_SENSORS_LM75 is not set
-# CONFIG_SENSORS_LM77 is not set
-# CONFIG_SENSORS_LM78 is not set
-# CONFIG_SENSORS_LM80 is not set
-# CONFIG_SENSORS_LM83 is not set
-# CONFIG_SENSORS_LM85 is not set
-# CONFIG_SENSORS_LM87 is not set
-# CONFIG_SENSORS_LM90 is not set
-# CONFIG_SENSORS_LM92 is not set
-# CONFIG_SENSORS_LM93 is not set
-# CONFIG_SENSORS_LTC4215 is not set
-# CONFIG_SENSORS_LTC4245 is not set
-# CONFIG_SENSORS_LM95241 is not set
-# CONFIG_SENSORS_MAX1619 is not set
-# CONFIG_SENSORS_MAX6650 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_SENSORS_SIS5595 is not set
-# CONFIG_SENSORS_DME1737 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47M192 is not set
-# CONFIG_SENSORS_SMSC47B397 is not set
-# CONFIG_SENSORS_ADS7828 is not set
-# CONFIG_SENSORS_THMC50 is not set
-# CONFIG_SENSORS_VIA686A is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_VT8231 is not set
-# CONFIG_SENSORS_W83781D is not set
-# CONFIG_SENSORS_W83791D is not set
-# CONFIG_SENSORS_W83792D is not set
-# CONFIG_SENSORS_W83793 is not set
-# CONFIG_SENSORS_W83L785TS is not set
-# CONFIG_SENSORS_W83L786NG is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_SENSORS_HDAPS is not set
-# CONFIG_SENSORS_LIS3LV02D is not set
-# CONFIG_SENSORS_APPLESMC is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
-CONFIG_THERMAL=y
-# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
-# CONFIG_WATCHDOG_NOWAYOUT is not set
-
-#
-# Watchdog Device Drivers
-#
-# CONFIG_SOFT_WATCHDOG is not set
-# CONFIG_ACQUIRE_WDT is not set
-# CONFIG_ADVANTECH_WDT is not set
-# CONFIG_ALIM1535_WDT is not set
-# CONFIG_ALIM7101_WDT is not set
-# CONFIG_SC520_WDT is not set
-# CONFIG_EUROTECH_WDT is not set
-# CONFIG_IB700_WDT is not set
-# CONFIG_IBMASR is not set
-# CONFIG_WAFER_WDT is not set
-# CONFIG_I6300ESB_WDT is not set
-# CONFIG_ITCO_WDT is not set
-# CONFIG_IT8712F_WDT is not set
-# CONFIG_IT87_WDT is not set
-# CONFIG_HP_WATCHDOG is not set
-# CONFIG_SC1200_WDT is not set
-# CONFIG_PC87413_WDT is not set
-# CONFIG_60XX_WDT is not set
-# CONFIG_SBC8360_WDT is not set
-# CONFIG_SBC7240_WDT is not set
-# CONFIG_CPU5_WDT is not set
-# CONFIG_SMSC_SCH311X_WDT is not set
-# CONFIG_SMSC37B787_WDT is not set
-# CONFIG_W83627HF_WDT is not set
-# CONFIG_W83697HF_WDT is not set
-# CONFIG_W83697UG_WDT is not set
-# CONFIG_W83877F_WDT is not set
-# CONFIG_W83977F_WDT is not set
-# CONFIG_MACHZ_WDT is not set
-# CONFIG_SBC_EPX_C3_WATCHDOG is not set
-
-#
-# PCI-based Watchdog Cards
-#
-# CONFIG_PCIPCWATCHDOG is not set
-# CONFIG_WDTPCI is not set
-
-#
-# USB-based Watchdog Cards
-#
-# CONFIG_USBPCWATCHDOG is not set
-CONFIG_SSB_POSSIBLE=y
-
-#
-# Sonics Silicon Backplane
-#
-# CONFIG_SSB is not set
-
-#
-# Multifunction device drivers
-#
-# CONFIG_MFD_CORE is not set
-# CONFIG_MFD_SM501 is not set
-# CONFIG_HTC_PASIC3 is not set
-# CONFIG_TWL4030_CORE is not set
-# CONFIG_MFD_TMIO is not set
-# CONFIG_PMIC_DA903X is not set
-# CONFIG_MFD_WM8400 is not set
-# CONFIG_MFD_WM8350_I2C is not set
-# CONFIG_MFD_PCF50633 is not set
-# CONFIG_REGULATOR is not set
-
-#
-# Multimedia devices
-#
-
-#
-# Multimedia core support
-#
-# CONFIG_VIDEO_DEV is not set
-# CONFIG_DVB_CORE is not set
-# CONFIG_VIDEO_MEDIA is not set
-
-#
-# Multimedia drivers
-#
-CONFIG_DAB=y
-# CONFIG_USB_DABUSB is not set
-
-#
-# Graphics support
-#
CONFIG_AGP=y
-# CONFIG_AGP_ALI is not set
-# CONFIG_AGP_ATI is not set
-# CONFIG_AGP_AMD is not set
CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
-# CONFIG_AGP_NVIDIA is not set
-# CONFIG_AGP_SIS is not set
-# CONFIG_AGP_SWORKS is not set
-# CONFIG_AGP_VIA is not set
-# CONFIG_AGP_EFFICEON is not set
CONFIG_DRM=y
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
-# CONFIG_DRM_RADEON is not set
-# CONFIG_DRM_I810 is not set
-# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
-# CONFIG_DRM_I915_KMS is not set
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-CONFIG_FB=y
-# CONFIG_FIRMWARE_EDID is not set
-# CONFIG_FB_DDC is not set
-# CONFIG_FB_BOOT_VESA_SUPPORT is not set
-CONFIG_FB_CFB_FILLRECT=y
-CONFIG_FB_CFB_COPYAREA=y
-CONFIG_FB_CFB_IMAGEBLIT=y
-# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
-# CONFIG_FB_SYS_FILLRECT is not set
-# CONFIG_FB_SYS_COPYAREA is not set
-# CONFIG_FB_SYS_IMAGEBLIT is not set
-# CONFIG_FB_FOREIGN_ENDIAN is not set
-# CONFIG_FB_SYS_FOPS is not set
-# CONFIG_FB_SVGALIB is not set
-# CONFIG_FB_MACMODES is not set
-# CONFIG_FB_BACKLIGHT is not set
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
-
-#
-# Frame buffer hardware drivers
-#
-# CONFIG_FB_CIRRUS is not set
-# CONFIG_FB_PM2 is not set
-# CONFIG_FB_CYBER2000 is not set
-# CONFIG_FB_ARC is not set
-# CONFIG_FB_ASILIANT is not set
-# CONFIG_FB_IMSTT is not set
-# CONFIG_FB_VGA16 is not set
-# CONFIG_FB_UVESA is not set
-# CONFIG_FB_VESA is not set
CONFIG_FB_EFI=y
-# CONFIG_FB_N411 is not set
-# CONFIG_FB_HGA is not set
-# CONFIG_FB_S1D13XXX is not set
-# CONFIG_FB_NVIDIA is not set
-# CONFIG_FB_RIVA is not set
-# CONFIG_FB_I810 is not set
-# CONFIG_FB_LE80578 is not set
-# CONFIG_FB_INTEL is not set
-# CONFIG_FB_MATROX is not set
-# CONFIG_FB_RADEON is not set
-# CONFIG_FB_ATY128 is not set
-# CONFIG_FB_ATY is not set
-# CONFIG_FB_S3 is not set
-# CONFIG_FB_SAVAGE is not set
-# CONFIG_FB_SIS is not set
-# CONFIG_FB_VIA is not set
-# CONFIG_FB_NEOMAGIC is not set
-# CONFIG_FB_KYRO is not set
-# CONFIG_FB_3DFX is not set
-# CONFIG_FB_VOODOO1 is not set
-# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_TRIDENT is not set
-# CONFIG_FB_ARK is not set
-# CONFIG_FB_PM3 is not set
-# CONFIG_FB_CARMINE is not set
-# CONFIG_FB_GEODE is not set
-# CONFIG_FB_VIRTUAL is not set
-# CONFIG_FB_METRONOME is not set
-# CONFIG_FB_MB862XX is not set
-# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_BACKLIGHT_GENERIC=y
-# CONFIG_BACKLIGHT_PROGEAR is not set
-# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
-# CONFIG_BACKLIGHT_SAHARA is not set
-
-#
-# Display device support
-#
-# CONFIG_DISPLAY_SUPPORT is not set
-
-#
-# Console display driver support
-#
-CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
-CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_FRAMEBUFFER_CONSOLE is not set
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_LOGO_LINUX_CLUT224=y
CONFIG_SOUND=y
-CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
-CONFIG_SND_TIMER=y
-CONFIG_SND_PCM=y
-CONFIG_SND_HWDEP=y
-CONFIG_SND_JACK=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
-CONFIG_SND_OSSEMUL=y
CONFIG_SND_MIXER_OSS=y
CONFIG_SND_PCM_OSS=y
-CONFIG_SND_PCM_OSS_PLUGINS=y
CONFIG_SND_SEQUENCER_OSS=y
CONFIG_SND_HRTIMER=y
-CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
-CONFIG_SND_DYNAMIC_MINORS=y
-CONFIG_SND_SUPPORT_OLD_API=y
-CONFIG_SND_VERBOSE_PROCFS=y
-# CONFIG_SND_VERBOSE_PRINTK is not set
-# CONFIG_SND_DEBUG is not set
-CONFIG_SND_VMASTER=y
-CONFIG_SND_DRIVERS=y
-# CONFIG_SND_PCSP is not set
-# CONFIG_SND_DUMMY is not set
-# CONFIG_SND_VIRMIDI is not set
-# CONFIG_SND_MTPAV is not set
-# CONFIG_SND_SERIAL_U16550 is not set
-# CONFIG_SND_MPU401 is not set
-CONFIG_SND_PCI=y
-# CONFIG_SND_AD1889 is not set
-# CONFIG_SND_ALS300 is not set
-# CONFIG_SND_ALS4000 is not set
-# CONFIG_SND_ALI5451 is not set
-# CONFIG_SND_ATIIXP is not set
-# CONFIG_SND_ATIIXP_MODEM is not set
-# CONFIG_SND_AU8810 is not set
-# CONFIG_SND_AU8820 is not set
-# CONFIG_SND_AU8830 is not set
-# CONFIG_SND_AW2 is not set
-# CONFIG_SND_AZT3328 is not set
-# CONFIG_SND_BT87X is not set
-# CONFIG_SND_CA0106 is not set
-# CONFIG_SND_CMIPCI is not set
-# CONFIG_SND_OXYGEN is not set
-# CONFIG_SND_CS4281 is not set
-# CONFIG_SND_CS46XX is not set
-# CONFIG_SND_CS5530 is not set
-# CONFIG_SND_CS5535AUDIO is not set
-# CONFIG_SND_DARLA20 is not set
-# CONFIG_SND_GINA20 is not set
-# CONFIG_SND_LAYLA20 is not set
-# CONFIG_SND_DARLA24 is not set
-# CONFIG_SND_GINA24 is not set
-# CONFIG_SND_LAYLA24 is not set
-# CONFIG_SND_MONA is not set
-# CONFIG_SND_MIA is not set
-# CONFIG_SND_ECHO3G is not set
-# CONFIG_SND_INDIGO is not set
-# CONFIG_SND_INDIGOIO is not set
-# CONFIG_SND_INDIGODJ is not set
-# CONFIG_SND_INDIGOIOX is not set
-# CONFIG_SND_INDIGODJX is not set
-# CONFIG_SND_EMU10K1 is not set
-# CONFIG_SND_EMU10K1X is not set
-# CONFIG_SND_ENS1370 is not set
-# CONFIG_SND_ENS1371 is not set
-# CONFIG_SND_ES1938 is not set
-# CONFIG_SND_ES1968 is not set
-# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
-# CONFIG_SND_HDA_RECONFIG is not set
-# CONFIG_SND_HDA_INPUT_BEEP is not set
-CONFIG_SND_HDA_CODEC_REALTEK=y
-CONFIG_SND_HDA_CODEC_ANALOG=y
-CONFIG_SND_HDA_CODEC_SIGMATEL=y
-CONFIG_SND_HDA_CODEC_VIA=y
-CONFIG_SND_HDA_CODEC_ATIHDMI=y
-CONFIG_SND_HDA_CODEC_NVHDMI=y
-CONFIG_SND_HDA_CODEC_INTELHDMI=y
-CONFIG_SND_HDA_ELD=y
-CONFIG_SND_HDA_CODEC_CONEXANT=y
-CONFIG_SND_HDA_CODEC_CMEDIA=y
-CONFIG_SND_HDA_CODEC_SI3054=y
-CONFIG_SND_HDA_GENERIC=y
-# CONFIG_SND_HDA_POWER_SAVE is not set
-# CONFIG_SND_HDSP is not set
-# CONFIG_SND_HDSPM is not set
-# CONFIG_SND_HIFIER is not set
-# CONFIG_SND_ICE1712 is not set
-# CONFIG_SND_ICE1724 is not set
-# CONFIG_SND_INTEL8X0 is not set
-# CONFIG_SND_INTEL8X0M is not set
-# CONFIG_SND_KORG1212 is not set
-# CONFIG_SND_MAESTRO3 is not set
-# CONFIG_SND_MIXART is not set
-# CONFIG_SND_NM256 is not set
-# CONFIG_SND_PCXHR is not set
-# CONFIG_SND_RIPTIDE is not set
-# CONFIG_SND_RME32 is not set
-# CONFIG_SND_RME96 is not set
-# CONFIG_SND_RME9652 is not set
-# CONFIG_SND_SIS7019 is not set
-# CONFIG_SND_SONICVIBES is not set
-# CONFIG_SND_TRIDENT is not set
-# CONFIG_SND_VIA82XX is not set
-# CONFIG_SND_VIA82XX_MODEM is not set
-# CONFIG_SND_VIRTUOSO is not set
-# CONFIG_SND_VX222 is not set
-# CONFIG_SND_YMFPCI is not set
-CONFIG_SND_USB=y
-# CONFIG_SND_USB_AUDIO is not set
-# CONFIG_SND_USB_USX2Y is not set
-# CONFIG_SND_USB_CAIAQ is not set
-# CONFIG_SND_USB_US122L is not set
-CONFIG_SND_PCMCIA=y
-# CONFIG_SND_VXPOCKET is not set
-# CONFIG_SND_PDAUDIOCF is not set
-# CONFIG_SND_SOC is not set
-# CONFIG_SOUND_PRIME is not set
-CONFIG_HID_SUPPORT=y
-CONFIG_HID=y
-CONFIG_HID_DEBUG=y
CONFIG_HIDRAW=y
-
-#
-# USB Input Devices
-#
-CONFIG_USB_HID=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
-
-#
-# Special HID drivers
-#
-CONFIG_HID_A4TECH=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-# CONFIG_DRAGONRISE_FF is not set
-CONFIG_HID_EZKEY=y
-CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
-# CONFIG_LOGIRUMBLEPAD2_FF is not set
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -1874,702 +249,92 @@ CONFIG_HID_PETALYNX=y
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
-# CONFIG_GREENASIA_FF is not set
CONFIG_HID_TOPSEED=y
-CONFIG_THRUSTMASTER_FF=y
-CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_SUPPORT=y
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
-CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-
-#
-# Miscellaneous USB options
-#
CONFIG_USB_DEVICEFS=y
# CONFIG_USB_DEVICE_CLASS is not set
-# CONFIG_USB_DYNAMIC_MINORS is not set
-CONFIG_USB_SUSPEND=y
-# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
-# CONFIG_USB_WUSB is not set
-# CONFIG_USB_WUSB_CBAF is not set
-
-#
-# USB Host Controller Drivers
-#
-# CONFIG_USB_C67X00_HCD is not set
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_OXU210HP_HCD is not set
-# CONFIG_USB_ISP116X_HCD is not set
-# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
-# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
-CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
-# CONFIG_USB_SL811_HCD is not set
-# CONFIG_USB_R8A66597_HCD is not set
-# CONFIG_USB_WHCI_HCD is not set
-# CONFIG_USB_HWA_HCD is not set
-
-#
-# USB Device Class drivers
-#
-# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
-# CONFIG_USB_WDM is not set
-# CONFIG_USB_TMC is not set
-
-#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
-#
-
-#
-# also be needed; see USB_STORAGE Help for more info
-#
CONFIG_USB_STORAGE=y
-# CONFIG_USB_STORAGE_DEBUG is not set
-# CONFIG_USB_STORAGE_DATAFAB is not set
-# CONFIG_USB_STORAGE_FREECOM is not set
-# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_USBAT is not set
-# CONFIG_USB_STORAGE_SDDR09 is not set
-# CONFIG_USB_STORAGE_SDDR55 is not set
-# CONFIG_USB_STORAGE_JUMPSHOT is not set
-# CONFIG_USB_STORAGE_ALAUDA is not set
-# CONFIG_USB_STORAGE_ONETOUCH is not set
-# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
CONFIG_USB_LIBUSUAL=y
-
-#
-# USB Imaging devices
-#
-# CONFIG_USB_MDC800 is not set
-# CONFIG_USB_MICROTEK is not set
-
-#
-# USB port drivers
-#
-# CONFIG_USB_SERIAL is not set
-
-#
-# USB Miscellaneous drivers
-#
-# CONFIG_USB_EMI62 is not set
-# CONFIG_USB_EMI26 is not set
-# CONFIG_USB_ADUTUX is not set
-# CONFIG_USB_SEVSEG is not set
-# CONFIG_USB_RIO500 is not set
-# CONFIG_USB_LEGOTOWER is not set
-# CONFIG_USB_LCD is not set
-# CONFIG_USB_BERRY_CHARGE is not set
-# CONFIG_USB_LED is not set
-# CONFIG_USB_CYPRESS_CY7C63 is not set
-# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_IDMOUSE is not set
-# CONFIG_USB_FTDI_ELAN is not set
-# CONFIG_USB_APPLEDISPLAY is not set
-# CONFIG_USB_SISUSBVGA is not set
-# CONFIG_USB_LD is not set
-# CONFIG_USB_TRANCEVIBRATOR is not set
-# CONFIG_USB_IOWARRIOR is not set
-# CONFIG_USB_TEST is not set
-# CONFIG_USB_ISIGHTFW is not set
-# CONFIG_USB_VST is not set
-# CONFIG_USB_GADGET is not set
-
-#
-# OTG and related infrastructure
-#
-# CONFIG_NOP_USB_XCEIV is not set
-# CONFIG_UWB is not set
-# CONFIG_MMC is not set
-# CONFIG_MEMSTICK is not set
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-
-#
-# LED drivers
-#
-# CONFIG_LEDS_ALIX2 is not set
-# CONFIG_LEDS_PCA9532 is not set
-# CONFIG_LEDS_LP5521 is not set
-# CONFIG_LEDS_CLEVO_MAIL is not set
-# CONFIG_LEDS_PCA955X is not set
-# CONFIG_LEDS_BD2802 is not set
-
-#
-# LED Triggers
-#
-CONFIG_LEDS_TRIGGERS=y
-# CONFIG_LEDS_TRIGGER_TIMER is not set
-# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
-# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
-# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
-
-#
-# iptables trigger is under Netfilter config (LED target)
-#
-# CONFIG_ACCESSIBILITY is not set
-# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
-
-#
-# Reporting subsystems
-#
-# CONFIG_EDAC_DEBUG is not set
-# CONFIG_EDAC_MM_EDAC is not set
-CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
-# CONFIG_RTC_DEBUG is not set
-
-#
-# RTC interfaces
-#
-CONFIG_RTC_INTF_SYSFS=y
-CONFIG_RTC_INTF_PROC=y
-CONFIG_RTC_INTF_DEV=y
-# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
-# CONFIG_RTC_DRV_TEST is not set
-
-#
-# I2C RTC drivers
-#
-# CONFIG_RTC_DRV_DS1307 is not set
-# CONFIG_RTC_DRV_DS1374 is not set
-# CONFIG_RTC_DRV_DS1672 is not set
-# CONFIG_RTC_DRV_MAX6900 is not set
-# CONFIG_RTC_DRV_RS5C372 is not set
-# CONFIG_RTC_DRV_ISL1208 is not set
-# CONFIG_RTC_DRV_X1205 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
-# CONFIG_RTC_DRV_PCF8583 is not set
-# CONFIG_RTC_DRV_M41T80 is not set
-# CONFIG_RTC_DRV_S35390A is not set
-# CONFIG_RTC_DRV_FM3130 is not set
-# CONFIG_RTC_DRV_RX8581 is not set
-
-#
-# SPI RTC drivers
-#
-
-#
-# Platform RTC drivers
-#
-CONFIG_RTC_DRV_CMOS=y
-# CONFIG_RTC_DRV_DS1286 is not set
-# CONFIG_RTC_DRV_DS1511 is not set
-# CONFIG_RTC_DRV_DS1553 is not set
-# CONFIG_RTC_DRV_DS1742 is not set
-# CONFIG_RTC_DRV_STK17TA8 is not set
-# CONFIG_RTC_DRV_M48T86 is not set
-# CONFIG_RTC_DRV_M48T35 is not set
-# CONFIG_RTC_DRV_M48T59 is not set
-# CONFIG_RTC_DRV_BQ4802 is not set
-# CONFIG_RTC_DRV_V3020 is not set
-
-#
-# on-CPU RTC drivers
-#
CONFIG_DMADEVICES=y
-
-#
-# DMA Devices
-#
-# CONFIG_INTEL_IOATDMA is not set
-# CONFIG_AUXDISPLAY is not set
-# CONFIG_UIO is not set
-# CONFIG_STAGING is not set
-CONFIG_X86_PLATFORM_DEVICES=y
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_TC1100_WMI is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_PANASONIC_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
CONFIG_EEEPC_LAPTOP=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
-
-#
-# Firmware Drivers
-#
-# CONFIG_EDD is not set
-CONFIG_FIRMWARE_MEMMAP=y
CONFIG_EFI_VARS=y
-# CONFIG_DELL_RBU is not set
-# CONFIG_DCDBAS is not set
-CONFIG_DMIID=y
-# CONFIG_ISCSI_IBFT_FIND is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4_FS is not set
-CONFIG_JBD=y
-# CONFIG_JBD_DEBUG is not set
-CONFIG_FS_MBCACHE=y
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-CONFIG_FS_POSIX_ACL=y
-CONFIG_FILE_LOCKING=y
-# CONFIG_XFS_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_BTRFS_FS is not set
-CONFIG_DNOTIFY=y
-CONFIG_INOTIFY=y
-CONFIG_INOTIFY_USER=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
-CONFIG_QUOTA_TREE=y
-# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
-CONFIG_QUOTACTL=y
-# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=y
-# CONFIG_FUSE_FS is not set
-CONFIG_GENERIC_ACL=y
-
-#
-# Caches
-#
-# CONFIG_FSCACHE is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-CONFIG_FAT_FS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=437
-CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
-CONFIG_PROC_VMCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_PROC_PAGE_MONITOR=y
-CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
-CONFIG_HUGETLB_PAGE=y
-# CONFIG_CONFIGFS_FS is not set
-CONFIG_MISC_FILESYSTEMS=y
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_ECRYPT_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_SQUASHFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_OMFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
-CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-# CONFIG_NFSD is not set
-CONFIG_LOCKD=y
-CONFIG_LOCKD_V4=y
-CONFIG_NFS_ACL_SUPPORT=y
-CONFIG_NFS_COMMON=y
-CONFIG_SUNRPC=y
-CONFIG_SUNRPC_GSS=y
-CONFIG_RPCSEC_GSS_KRB5=y
-# CONFIG_RPCSEC_GSS_SPKM3 is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
CONFIG_PARTITION_ADVANCED=y
-# CONFIG_ACORN_PARTITION is not set
CONFIG_OSF_PARTITION=y
CONFIG_AMIGA_PARTITION=y
-# CONFIG_ATARI_PARTITION is not set
CONFIG_MAC_PARTITION=y
-CONFIG_MSDOS_PARTITION=y
CONFIG_BSD_DISKLABEL=y
CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
-# CONFIG_LDM_PARTITION is not set
CONFIG_SGI_PARTITION=y
-# CONFIG_ULTRIX_PARTITION is not set
CONFIG_SUN_PARTITION=y
CONFIG_KARMA_PARTITION=y
CONFIG_EFI_PARTITION=y
-# CONFIG_SYSV68_PARTITION is not set
-CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
-# CONFIG_NLS_CODEPAGE_737 is not set
-# CONFIG_NLS_CODEPAGE_775 is not set
-# CONFIG_NLS_CODEPAGE_850 is not set
-# CONFIG_NLS_CODEPAGE_852 is not set
-# CONFIG_NLS_CODEPAGE_855 is not set
-# CONFIG_NLS_CODEPAGE_857 is not set
-# CONFIG_NLS_CODEPAGE_860 is not set
-# CONFIG_NLS_CODEPAGE_861 is not set
-# CONFIG_NLS_CODEPAGE_862 is not set
-# CONFIG_NLS_CODEPAGE_863 is not set
-# CONFIG_NLS_CODEPAGE_864 is not set
-# CONFIG_NLS_CODEPAGE_865 is not set
-# CONFIG_NLS_CODEPAGE_866 is not set
-# CONFIG_NLS_CODEPAGE_869 is not set
-# CONFIG_NLS_CODEPAGE_936 is not set
-# CONFIG_NLS_CODEPAGE_950 is not set
-# CONFIG_NLS_CODEPAGE_932 is not set
-# CONFIG_NLS_CODEPAGE_949 is not set
-# CONFIG_NLS_CODEPAGE_874 is not set
-# CONFIG_NLS_ISO8859_8 is not set
-# CONFIG_NLS_CODEPAGE_1250 is not set
-# CONFIG_NLS_CODEPAGE_1251 is not set
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_NLS_ISO8859_2 is not set
-# CONFIG_NLS_ISO8859_3 is not set
-# CONFIG_NLS_ISO8859_4 is not set
-# CONFIG_NLS_ISO8859_5 is not set
-# CONFIG_NLS_ISO8859_6 is not set
-# CONFIG_NLS_ISO8859_7 is not set
-# CONFIG_NLS_ISO8859_9 is not set
-# CONFIG_NLS_ISO8859_13 is not set
-# CONFIG_NLS_ISO8859_14 is not set
-# CONFIG_NLS_ISO8859_15 is not set
-# CONFIG_NLS_KOI8_R is not set
-# CONFIG_NLS_KOI8_U is not set
CONFIG_NLS_UTF8=y
-# CONFIG_DLM is not set
-
-#
-# Kernel hacking
-#
-CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_PRINTK_TIME=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_ENABLE_MUST_CHECK=y
CONFIG_FRAME_WARN=2048
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
-CONFIG_DEBUG_FS=y
-# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-# CONFIG_DETECT_SOFTLOCKUP is not set
-# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
-# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_RT_MUTEX_TESTER is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-CONFIG_STACKTRACE=y
-# CONFIG_DEBUG_KOBJECT is not set
-# CONFIG_DEBUG_HIGHMEM is not set
-CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_VIRTUAL is not set
-# CONFIG_DEBUG_WRITECOUNT is not set
-CONFIG_DEBUG_MEMORY_INIT=y
-# CONFIG_DEBUG_LIST is not set
-# CONFIG_DEBUG_SG is not set
-# CONFIG_DEBUG_NOTIFIERS is not set
-CONFIG_ARCH_WANT_FRAME_POINTERS=y
-CONFIG_FRAME_POINTER=y
-# CONFIG_BOOT_PRINTK_DELAY is not set
-# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_KPROBES_SANITY_TEST is not set
-# CONFIG_BACKTRACE_SELF_TEST is not set
-# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
-# CONFIG_LKDTM is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
-# CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_USER_STACKTRACE_SUPPORT=y
-CONFIG_NOP_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACER=y
-CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
-CONFIG_HAVE_DYNAMIC_FTRACE=y
-CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
-CONFIG_RING_BUFFER=y
-CONFIG_TRACING=y
-CONFIG_TRACING_SUPPORT=y
-
-#
-# Tracers
-#
-# CONFIG_FUNCTION_TRACER is not set
-# CONFIG_IRQSOFF_TRACER is not set
-# CONFIG_SYSPROF_TRACER is not set
-# CONFIG_SCHED_TRACER is not set
-# CONFIG_CONTEXT_SWITCH_TRACER is not set
-# CONFIG_EVENT_TRACER is not set
-# CONFIG_FTRACE_SYSCALLS is not set
-# CONFIG_BOOT_TRACER is not set
-# CONFIG_TRACE_BRANCH_PROFILING is not set
-# CONFIG_POWER_TRACER is not set
-# CONFIG_STACK_TRACER is not set
-# CONFIG_HW_BRANCH_TRACER is not set
-# CONFIG_KMEMTRACE is not set
-# CONFIG_WORKQUEUE_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
-# CONFIG_FTRACE_STARTUP_TEST is not set
-# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_DEBUG is not set
-# CONFIG_DMA_API_DEBUG is not set
-# CONFIG_SAMPLES is not set
-CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_KGDB is not set
-# CONFIG_STRICT_DEVMEM is not set
-CONFIG_X86_VERBOSE_BOOTUP=y
-CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PER_CPU_MAPS is not set
-# CONFIG_X86_PTDUMP is not set
-CONFIG_DEBUG_RODATA=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_NX_TEST=m
-# CONFIG_4KSTACKS is not set
-CONFIG_DOUBLEFAULT=y
-CONFIG_HAVE_MMIOTRACE_SUPPORT=y
-CONFIG_IO_DELAY_TYPE_0X80=0
-CONFIG_IO_DELAY_TYPE_0XED=1
-CONFIG_IO_DELAY_TYPE_UDELAY=2
-CONFIG_IO_DELAY_TYPE_NONE=3
-CONFIG_IO_DELAY_0X80=y
-# CONFIG_IO_DELAY_0XED is not set
-# CONFIG_IO_DELAY_UDELAY is not set
-# CONFIG_IO_DELAY_NONE is not set
-CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
-# CONFIG_CPA_DEBUG is not set
CONFIG_OPTIMIZE_INLINING=y
-
-#
-# Security options
-#
-CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
-# CONFIG_SECURITYFS is not set
CONFIG_SECURITY_NETWORK=y
-# CONFIG_SECURITY_NETWORK_XFRM is not set
-# CONFIG_SECURITY_PATH is not set
-CONFIG_SECURITY_FILE_CAPABILITIES=y
-# CONFIG_SECURITY_ROOTPLUG is not set
-CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_SECURITY_SELINUX_DEVELOP=y
-CONFIG_SECURITY_SELINUX_AVC_STATS=y
-CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
-# CONFIG_SECURITY_SMACK is not set
-# CONFIG_SECURITY_TOMOYO is not set
-# CONFIG_IMA is not set
-CONFIG_CRYPTO=y
-
-#
-# Crypto core or helper
-#
-# CONFIG_CRYPTO_FIPS is not set
-CONFIG_CRYPTO_ALGAPI=y
-CONFIG_CRYPTO_ALGAPI2=y
-CONFIG_CRYPTO_AEAD=y
-CONFIG_CRYPTO_AEAD2=y
-CONFIG_CRYPTO_BLKCIPHER=y
-CONFIG_CRYPTO_BLKCIPHER2=y
-CONFIG_CRYPTO_HASH=y
-CONFIG_CRYPTO_HASH2=y
-CONFIG_CRYPTO_RNG2=y
-CONFIG_CRYPTO_PCOMP=y
-CONFIG_CRYPTO_MANAGER=y
-CONFIG_CRYPTO_MANAGER2=y
-# CONFIG_CRYPTO_GF128MUL is not set
-# CONFIG_CRYPTO_NULL is not set
-CONFIG_CRYPTO_WORKQUEUE=y
-# CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=y
-# CONFIG_CRYPTO_TEST is not set
-
-#
-# Authenticated Encryption with Associated Data
-#
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_SEQIV is not set
-
-#
-# Block modes
-#
-CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_CTS is not set
-CONFIG_CRYPTO_ECB=y
-# CONFIG_CRYPTO_LRW is not set
-# CONFIG_CRYPTO_PCBC is not set
-# CONFIG_CRYPTO_XTS is not set
-
-#
-# Hash modes
-#
-CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_XCBC is not set
-
-#
-# Digest
-#
-# CONFIG_CRYPTO_CRC32C is not set
-# CONFIG_CRYPTO_CRC32C_INTEL is not set
-# CONFIG_CRYPTO_MD4 is not set
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_MICHAEL_MIC is not set
-# CONFIG_CRYPTO_RMD128 is not set
-# CONFIG_CRYPTO_RMD160 is not set
-# CONFIG_CRYPTO_RMD256 is not set
-# CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=y
-# CONFIG_CRYPTO_SHA256 is not set
-# CONFIG_CRYPTO_SHA512 is not set
-# CONFIG_CRYPTO_TGR192 is not set
-# CONFIG_CRYPTO_WP512 is not set
-
-#
-# Ciphers
-#
-CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_586=y
-# CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=y
-# CONFIG_CRYPTO_BLOWFISH is not set
-# CONFIG_CRYPTO_CAMELLIA is not set
-# CONFIG_CRYPTO_CAST5 is not set
-# CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_FCRYPT is not set
-# CONFIG_CRYPTO_KHAZAD is not set
-# CONFIG_CRYPTO_SALSA20 is not set
-# CONFIG_CRYPTO_SALSA20_586 is not set
-# CONFIG_CRYPTO_SEED is not set
-# CONFIG_CRYPTO_SERPENT is not set
-# CONFIG_CRYPTO_TEA is not set
-# CONFIG_CRYPTO_TWOFISH is not set
-# CONFIG_CRYPTO_TWOFISH_586 is not set
-
-#
-# Compression
-#
-# CONFIG_CRYPTO_DEFLATE is not set
-# CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
-
-#
-# Random Number Generation
-#
# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRYPTO_HW=y
-# CONFIG_CRYPTO_DEV_PADLOCK is not set
-# CONFIG_CRYPTO_DEV_GEODE is not set
-# CONFIG_CRYPTO_DEV_HIFN_795X is not set
-CONFIG_HAVE_KVM=y
-CONFIG_HAVE_KVM_IRQCHIP=y
-CONFIG_VIRTUALIZATION=y
-# CONFIG_KVM is not set
-# CONFIG_LGUEST is not set
-# CONFIG_VIRTIO_PCI is not set
-# CONFIG_VIRTIO_BALLOON is not set
-CONFIG_BINARY_PRINTF=y
-
-#
-# Library routines
-#
-CONFIG_BITREVERSE=y
-CONFIG_GENERIC_FIND_FIRST_BIT=y
-CONFIG_GENERIC_FIND_NEXT_BIT=y
-CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
CONFIG_CRC_T10DIF=y
-# CONFIG_CRC_ITU_T is not set
-CONFIG_CRC32=y
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_AUDIT_GENERIC=y
-CONFIG_ZLIB_INFLATE=y
-CONFIG_DECOMPRESS_GZIP=y
-CONFIG_DECOMPRESS_BZIP2=y
-CONFIG_DECOMPRESS_LZMA=y
-CONFIG_HAS_IOMEM=y
-CONFIG_HAS_IOPORT=y
-CONFIG_HAS_DMA=y
-CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 6c86acd847a4..ee01a9d5d4f0 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,519 +1,89 @@
-#
-# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:22:00 2009
-#
CONFIG_64BIT=y
-# CONFIG_X86_32 is not set
-CONFIG_X86_64=y
-CONFIG_X86=y
-CONFIG_OUTPUT_FORMAT="elf64-x86-64"
-CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
-CONFIG_GENERIC_TIME=y
-CONFIG_GENERIC_CMOS_UPDATE=y
-CONFIG_CLOCKSOURCE_WATCHDOG=y
-CONFIG_GENERIC_CLOCKEVENTS=y
-CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
-CONFIG_LOCKDEP_SUPPORT=y
-CONFIG_STACKTRACE_SUPPORT=y
-CONFIG_HAVE_LATENCYTOP_SUPPORT=y
-CONFIG_FAST_CMPXCHG_LOCAL=y
-CONFIG_MMU=y
-CONFIG_ZONE_DMA=y
-CONFIG_GENERIC_ISA_DMA=y
-CONFIG_GENERIC_IOMAP=y
-CONFIG_GENERIC_BUG=y
-CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
-CONFIG_GENERIC_HWEIGHT=y
-CONFIG_ARCH_MAY_HAVE_PC_FDC=y
-CONFIG_RWSEM_GENERIC_SPINLOCK=y
-# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
-CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
-CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_GENERIC_TIME_VSYSCALL=y
-CONFIG_ARCH_HAS_CPU_RELAX=y
-CONFIG_ARCH_HAS_DEFAULT_IDLE=y
-CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
-CONFIG_HAVE_SETUP_PER_CPU_AREA=y
-CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
-CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
-CONFIG_ARCH_HIBERNATION_POSSIBLE=y
-CONFIG_ARCH_SUSPEND_POSSIBLE=y
-CONFIG_ZONE_DMA32=y
-CONFIG_ARCH_POPULATES_NODE_MAP=y
-CONFIG_AUDIT_ARCH=y
-CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
-CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
-CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_GENERIC_IRQ_PROBE=y
-CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_USE_GENERIC_SMP_HELPERS=y
-CONFIG_X86_64_SMP=y
-CONFIG_X86_HT=y
-CONFIG_X86_TRAMPOLINE=y
-# CONFIG_KTIME_SCALAR is not set
-CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
-
-#
-# General setup
-#
CONFIG_EXPERIMENTAL=y
-CONFIG_LOCK_KERNEL=y
-CONFIG_INIT_ENV_ARG_LIMIT=32
-CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_HAVE_KERNEL_GZIP=y
-CONFIG_HAVE_KERNEL_BZIP2=y
-CONFIG_HAVE_KERNEL_LZMA=y
-CONFIG_KERNEL_GZIP=y
-# CONFIG_KERNEL_BZIP2 is not set
-# CONFIG_KERNEL_LZMA is not set
-CONFIG_SWAP=y
CONFIG_SYSVIPC=y
-CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
-# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_AUDIT_TREE=y
-
-#
-# RCU Subsystem
-#
-# CONFIG_CLASSIC_RCU is not set
-CONFIG_TREE_RCU=y
-# CONFIG_PREEMPT_RCU is not set
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_FANOUT=64
-# CONFIG_RCU_FANOUT_EXACT is not set
-# CONFIG_TREE_RCU_TRACE is not set
-# CONFIG_PREEMPT_RCU_TRACE is not set
-# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=18
-CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-# CONFIG_RT_GROUP_SCHED is not set
-# CONFIG_USER_SCHED is not set
-CONFIG_CGROUP_SCHED=y
CONFIG_CGROUPS=y
-# CONFIG_CGROUP_DEBUG is not set
CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
-# CONFIG_CGROUP_DEVICE is not set
CONFIG_CPUSETS=y
-CONFIG_PROC_PID_CPUSET=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_RESOURCE_COUNTERS=y
-# CONFIG_CGROUP_MEM_RES_CTLR is not set
-# CONFIG_SYSFS_DEPRECATED_V2 is not set
-CONFIG_RELAY=y
-CONFIG_NAMESPACES=y
+CONFIG_CGROUP_SCHED=y
CONFIG_UTS_NS=y
CONFIG_IPC_NS=y
CONFIG_USER_NS=y
CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
-CONFIG_INITRAMFS_SOURCE=""
-CONFIG_RD_GZIP=y
-CONFIG_RD_BZIP2=y
-CONFIG_RD_LZMA=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_SYSCTL=y
-CONFIG_ANON_INODES=y
-# CONFIG_EMBEDDED is not set
-CONFIG_UID16=y
-CONFIG_SYSCTL_SYSCALL=y
-CONFIG_KALLSYMS=y
-CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
-# CONFIG_STRIP_ASM_SYMS is not set
-CONFIG_HOTPLUG=y
-CONFIG_PRINTK=y
-CONFIG_BUG=y
-CONFIG_ELF_CORE=y
-CONFIG_PCSPKR_PLATFORM=y
-CONFIG_BASE_FULL=y
-CONFIG_FUTEX=y
-CONFIG_EPOLL=y
-CONFIG_SIGNALFD=y
-CONFIG_TIMERFD=y
-CONFIG_EVENTFD=y
-CONFIG_SHMEM=y
-CONFIG_AIO=y
-CONFIG_VM_EVENT_COUNTERS=y
-CONFIG_PCI_QUIRKS=y
-CONFIG_SLUB_DEBUG=y
# CONFIG_COMPAT_BRK is not set
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
-# CONFIG_SLOB is not set
CONFIG_PROFILING=y
-CONFIG_TRACEPOINTS=y
-CONFIG_MARKERS=y
-# CONFIG_OPROFILE is not set
-CONFIG_HAVE_OPROFILE=y
CONFIG_KPROBES=y
-CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
-CONFIG_KRETPROBES=y
-CONFIG_HAVE_IOREMAP_PROT=y
-CONFIG_HAVE_KPROBES=y
-CONFIG_HAVE_KRETPROBES=y
-CONFIG_HAVE_ARCH_TRACEHOOK=y
-CONFIG_HAVE_DMA_API_DEBUG=y
-# CONFIG_SLOW_WORK is not set
-# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
-CONFIG_SLABINFO=y
-CONFIG_RT_MUTEXES=y
-CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
-# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_MODVERSIONS is not set
-# CONFIG_MODULE_SRCVERSION_ALL is not set
-CONFIG_STOP_MACHINE=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV_BSG=y
-# CONFIG_BLK_DEV_INTEGRITY is not set
-CONFIG_BLOCK_COMPAT=y
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
-# CONFIG_DEFAULT_AS is not set
-# CONFIG_DEFAULT_DEADLINE is not set
-CONFIG_DEFAULT_CFQ=y
-# CONFIG_DEFAULT_NOOP is not set
-CONFIG_DEFAULT_IOSCHED="cfq"
-CONFIG_FREEZER=y
-
-#
-# Processor type and features
-#
-CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
-CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-CONFIG_X86_MPPARSE=y
-CONFIG_X86_EXTENDED_PLATFORM=y
-# CONFIG_X86_VSMP is not set
-# CONFIG_X86_UV is not set
-CONFIG_SCHED_OMIT_FRAME_POINTER=y
-# CONFIG_PARAVIRT_GUEST is not set
-# CONFIG_MEMTEST is not set
-# CONFIG_M386 is not set
-# CONFIG_M486 is not set
-# CONFIG_M586 is not set
-# CONFIG_M586TSC is not set
-# CONFIG_M586MMX is not set
-# CONFIG_M686 is not set
-# CONFIG_MPENTIUMII is not set
-# CONFIG_MPENTIUMIII is not set
-# CONFIG_MPENTIUMM is not set
-# CONFIG_MPENTIUM4 is not set
-# CONFIG_MK6 is not set
-# CONFIG_MK7 is not set
-# CONFIG_MK8 is not set
-# CONFIG_MCRUSOE is not set
-# CONFIG_MEFFICEON is not set
-# CONFIG_MWINCHIPC6 is not set
-# CONFIG_MWINCHIP3D is not set
-# CONFIG_MGEODEGX1 is not set
-# CONFIG_MGEODE_LX is not set
-# CONFIG_MCYRIXIII is not set
-# CONFIG_MVIAC3_2 is not set
-# CONFIG_MVIAC7 is not set
-# CONFIG_MPSC is not set
-# CONFIG_MCORE2 is not set
-CONFIG_GENERIC_CPU=y
-CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=64
-CONFIG_X86_INTERNODE_CACHE_BYTES=64
-CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=6
-CONFIG_X86_WP_WORKS_OK=y
-CONFIG_X86_TSC=y
-CONFIG_X86_CMPXCHG64=y
-CONFIG_X86_CMOV=y
-CONFIG_X86_MINIMUM_CPU_FAMILY=64
-CONFIG_X86_DEBUGCTLMSR=y
-CONFIG_CPU_SUP_INTEL=y
-CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR=y
-CONFIG_X86_DS=y
-CONFIG_X86_PTRACE_BTS=y
-CONFIG_HPET_TIMER=y
-CONFIG_HPET_EMULATE_RTC=y
-CONFIG_DMI=y
-CONFIG_GART_IOMMU=y
CONFIG_CALGARY_IOMMU=y
-CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
CONFIG_AMD_IOMMU=y
CONFIG_AMD_IOMMU_STATS=y
-CONFIG_SWIOTLB=y
-CONFIG_IOMMU_HELPER=y
-CONFIG_IOMMU_API=y
-# CONFIG_MAXSMP is not set
CONFIG_NR_CPUS=64
CONFIG_SCHED_SMT=y
-CONFIG_SCHED_MC=y
-# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
-# CONFIG_PREEMPT is not set
-CONFIG_X86_LOCAL_APIC=y
-CONFIG_X86_IO_APIC=y
CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCE=y
-CONFIG_X86_MCE_INTEL=y
-CONFIG_X86_MCE_AMD=y
-CONFIG_X86_MCE_THRESHOLD=y
-# CONFIG_I8K is not set
CONFIG_MICROCODE=y
-CONFIG_MICROCODE_INTEL=y
CONFIG_MICROCODE_AMD=y
-CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
-# CONFIG_X86_CPU_DEBUG is not set
-CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
-CONFIG_DIRECT_GBPAGES=y
CONFIG_NUMA=y
-CONFIG_K8_NUMA=y
-CONFIG_X86_64_ACPI_NUMA=y
-CONFIG_NODES_SPAN_OTHER_NODES=y
-# CONFIG_NUMA_EMU is not set
-CONFIG_NODES_SHIFT=6
-CONFIG_ARCH_SPARSEMEM_DEFAULT=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_ARCH_SELECT_MEMORY_MODEL=y
-CONFIG_SELECT_MEMORY_MODEL=y
-# CONFIG_FLATMEM_MANUAL is not set
-# CONFIG_DISCONTIGMEM_MANUAL is not set
-CONFIG_SPARSEMEM_MANUAL=y
-CONFIG_SPARSEMEM=y
-CONFIG_NEED_MULTIPLE_NODES=y
-CONFIG_HAVE_MEMORY_PRESENT=y
-CONFIG_SPARSEMEM_EXTREME=y
-CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
-CONFIG_SPARSEMEM_VMEMMAP=y
-
-#
-# Memory hotplug is currently incompatible with Software Suspend
-#
-CONFIG_PAGEFLAGS_EXTENDED=y
-CONFIG_SPLIT_PTLOCK_CPUS=4
-CONFIG_MIGRATION=y
-CONFIG_PHYS_ADDR_T_64BIT=y
-CONFIG_ZONE_DMA_FLAG=1
-CONFIG_BOUNCE=y
-CONFIG_VIRT_TO_BUS=y
-CONFIG_UNEVICTABLE_LRU=y
-CONFIG_HAVE_MLOCK=y
-CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
-CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
-CONFIG_X86_RESERVE_LOW_64K=y
-CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
-CONFIG_X86_PAT=y
CONFIG_EFI=y
-CONFIG_SECCOMP=y
-# CONFIG_CC_STACKPROTECTOR is not set
-# CONFIG_HZ_100 is not set
-# CONFIG_HZ_250 is not set
-# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
-CONFIG_HZ=1000
-CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
-# CONFIG_KEXEC_JUMP is not set
-CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
-CONFIG_PHYSICAL_ALIGN=0x1000000
-CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
-# CONFIG_CMDLINE_BOOL is not set
-CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
-CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
-
-#
-# Power management and ACPI options
-#
-CONFIG_ARCH_HIBERNATION_HEADER=y
CONFIG_PM=y
CONFIG_PM_DEBUG=y
-# CONFIG_PM_VERBOSE is not set
-CONFIG_CAN_PM_TRACE=y
-CONFIG_PM_TRACE=y
CONFIG_PM_TRACE_RTC=y
-CONFIG_PM_SLEEP_SMP=y
-CONFIG_PM_SLEEP=y
-CONFIG_SUSPEND=y
-# CONFIG_PM_TEST_SUSPEND is not set
-CONFIG_SUSPEND_FREEZER=y
CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION=""
-CONFIG_ACPI=y
-CONFIG_ACPI_SLEEP=y
CONFIG_ACPI_PROCFS=y
-CONFIG_ACPI_PROCFS_POWER=y
-CONFIG_ACPI_SYSFS_POWER=y
-CONFIG_ACPI_PROC_EVENT=y
-CONFIG_ACPI_AC=y
-CONFIG_ACPI_BATTERY=y
-CONFIG_ACPI_BUTTON=y
-CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=y
-CONFIG_ACPI_HOTPLUG_CPU=y
-CONFIG_ACPI_THERMAL=y
-CONFIG_ACPI_NUMA=y
-# CONFIG_ACPI_CUSTOM_DSDT is not set
-CONFIG_ACPI_BLACKLIST_YEAR=0
-# CONFIG_ACPI_DEBUG is not set
-# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_X86_PM_TIMER=y
-CONFIG_ACPI_CONTAINER=y
-# CONFIG_ACPI_SBS is not set
-
-#
-# CPU Frequency scaling
-#
CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_TABLE=y
CONFIG_CPU_FREQ_DEBUG=y
# CONFIG_CPU_FREQ_STAT is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
-# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
-# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
-
-#
-# CPUFreq processor drivers
-#
CONFIG_X86_ACPI_CPUFREQ=y
-# CONFIG_X86_POWERNOW_K8 is not set
-# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
-# CONFIG_X86_P4_CLOCKMOD is not set
-
-#
-# shared options
-#
-# CONFIG_X86_SPEEDSTEP_LIB is not set
-CONFIG_CPU_IDLE=y
-CONFIG_CPU_IDLE_GOV_LADDER=y
-CONFIG_CPU_IDLE_GOV_MENU=y
-
-#
-# Memory power savings
-#
-# CONFIG_I7300_IDLE is not set
-
-#
-# Bus options (PCI etc.)
-#
-CONFIG_PCI=y
-CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
-CONFIG_PCI_DOMAINS=y
CONFIG_DMAR=y
# CONFIG_DMAR_DEFAULT_ON is not set
-CONFIG_DMAR_GFX_WA=y
-CONFIG_DMAR_FLOPPY_WA=y
-# CONFIG_INTR_REMAP is not set
CONFIG_PCIEPORTBUS=y
-# CONFIG_HOTPLUG_PCI_PCIE is not set
-CONFIG_PCIEAER=y
-# CONFIG_PCIEASPM is not set
-CONFIG_ARCH_SUPPORTS_MSI=y
-CONFIG_PCI_MSI=y
-# CONFIG_PCI_LEGACY is not set
-# CONFIG_PCI_DEBUG is not set
-# CONFIG_PCI_STUB is not set
-CONFIG_HT_IRQ=y
-# CONFIG_PCI_IOV is not set
-CONFIG_ISA_DMA_API=y
-CONFIG_K8_NB=y
CONFIG_PCCARD=y
-# CONFIG_PCMCIA_DEBUG is not set
-CONFIG_PCMCIA=y
-CONFIG_PCMCIA_LOAD_CIS=y
-CONFIG_PCMCIA_IOCTL=y
-CONFIG_CARDBUS=y
-
-#
-# PC-card bridges
-#
CONFIG_YENTA=y
-CONFIG_YENTA_O2=y
-CONFIG_YENTA_RICOH=y
-CONFIG_YENTA_TI=y
-CONFIG_YENTA_ENE_TUNE=y
-CONFIG_YENTA_TOSHIBA=y
-# CONFIG_PD6729 is not set
-# CONFIG_I82092 is not set
-CONFIG_PCCARD_NONSTATIC=y
CONFIG_HOTPLUG_PCI=y
-# CONFIG_HOTPLUG_PCI_FAKE is not set
-# CONFIG_HOTPLUG_PCI_ACPI is not set
-# CONFIG_HOTPLUG_PCI_CPCI is not set
-# CONFIG_HOTPLUG_PCI_SHPC is not set
-
-#
-# Executable file formats / Emulations
-#
-CONFIG_BINFMT_ELF=y
-CONFIG_COMPAT_BINFMT_ELF=y
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
-# CONFIG_HAVE_AOUT is not set
CONFIG_BINFMT_MISC=y
CONFIG_IA32_EMULATION=y
-# CONFIG_IA32_AOUT is not set
-CONFIG_COMPAT=y
-CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
-CONFIG_SYSVIPC_COMPAT=y
CONFIG_NET=y
-
-#
-# Networking options
-#
CONFIG_PACKET=y
-CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
-CONFIG_XFRM=y
CONFIG_XFRM_USER=y
-# CONFIG_XFRM_SUB_POLICY is not set
-# CONFIG_XFRM_MIGRATE is not set
-# CONFIG_XFRM_STATISTICS is not set
-# CONFIG_NET_KEY is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_ASK_IP_FIB_HASH=y
-# CONFIG_IP_FIB_TRIE is not set
-CONFIG_IP_FIB_HASH=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
@@ -521,118 +91,46 @@ CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y
CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
-# CONFIG_NET_IPIP is not set
-# CONFIG_NET_IPGRE is not set
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
-# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
-# CONFIG_INET_AH is not set
-# CONFIG_INET_ESP is not set
-# CONFIG_INET_IPCOMP is not set
-# CONFIG_INET_XFRM_TUNNEL is not set
-CONFIG_INET_TUNNEL=y
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
-CONFIG_INET_LRO=y
# CONFIG_INET_DIAG is not set
CONFIG_TCP_CONG_ADVANCED=y
# CONFIG_TCP_CONG_BIC is not set
-CONFIG_TCP_CONG_CUBIC=y
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
-# CONFIG_TCP_CONG_HSTCP is not set
-# CONFIG_TCP_CONG_HYBLA is not set
-# CONFIG_TCP_CONG_VEGAS is not set
-# CONFIG_TCP_CONG_SCALABLE is not set
-# CONFIG_TCP_CONG_LP is not set
-# CONFIG_TCP_CONG_VENO is not set
-# CONFIG_TCP_CONG_YEAH is not set
-# CONFIG_TCP_CONG_ILLINOIS is not set
-# CONFIG_DEFAULT_BIC is not set
-CONFIG_DEFAULT_CUBIC=y
-# CONFIG_DEFAULT_HTCP is not set
-# CONFIG_DEFAULT_VEGAS is not set
-# CONFIG_DEFAULT_WESTWOOD is not set
-# CONFIG_DEFAULT_RENO is not set
-CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_TCP_MD5SIG=y
CONFIG_IPV6=y
-# CONFIG_IPV6_PRIVACY is not set
-# CONFIG_IPV6_ROUTER_PREF is not set
-# CONFIG_IPV6_OPTIMISTIC_DAD is not set
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
-# CONFIG_INET6_IPCOMP is not set
-# CONFIG_IPV6_MIP6 is not set
-# CONFIG_INET6_XFRM_TUNNEL is not set
-# CONFIG_INET6_TUNNEL is not set
-CONFIG_INET6_XFRM_MODE_TRANSPORT=y
-CONFIG_INET6_XFRM_MODE_TUNNEL=y
-CONFIG_INET6_XFRM_MODE_BEET=y
-# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
-CONFIG_IPV6_SIT=y
-CONFIG_IPV6_NDISC_NODETYPE=y
-# CONFIG_IPV6_TUNNEL is not set
-# CONFIG_IPV6_MULTIPLE_TABLES is not set
-# CONFIG_IPV6_MROUTE is not set
CONFIG_NETLABEL=y
-CONFIG_NETWORK_SECMARK=y
CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_DEBUG is not set
# CONFIG_NETFILTER_ADVANCED is not set
-
-#
-# Core Netfilter Configuration
-#
-CONFIG_NETFILTER_NETLINK=y
-CONFIG_NETFILTER_NETLINK_LOG=y
CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_SECMARK=y
CONFIG_NF_CONNTRACK_FTP=y
CONFIG_NF_CONNTRACK_IRC=y
CONFIG_NF_CONNTRACK_SIP=y
CONFIG_NF_CT_NETLINK=y
-CONFIG_NETFILTER_XTABLES=y
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
-# CONFIG_IP_VS is not set
-
-#
-# IP: Netfilter Configuration
-#
-CONFIG_NF_DEFRAG_IPV4=y
CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_PROC_COMPAT=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_FILTER=y
CONFIG_IP_NF_TARGET_REJECT=y
CONFIG_IP_NF_TARGET_LOG=y
CONFIG_IP_NF_TARGET_ULOG=y
CONFIG_NF_NAT=y
-CONFIG_NF_NAT_NEEDED=y
CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_NF_NAT_FTP=y
-CONFIG_NF_NAT_IRC=y
-# CONFIG_NF_NAT_TFTP is not set
-# CONFIG_NF_NAT_AMANDA is not set
-# CONFIG_NF_NAT_PPTP is not set
-# CONFIG_NF_NAT_H323 is not set
-CONFIG_NF_NAT_SIP=y
CONFIG_IP_NF_MANGLE=y
-
-#
-# IPv6: Netfilter Configuration
-#
CONFIG_NF_CONNTRACK_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_IPV6HEADER=y
@@ -640,1208 +138,111 @@ CONFIG_IP6_NF_TARGET_LOG=y
CONFIG_IP6_NF_FILTER=y
CONFIG_IP6_NF_TARGET_REJECT=y
CONFIG_IP6_NF_MANGLE=y
-# CONFIG_IP_DCCP is not set
-# CONFIG_IP_SCTP is not set
-# CONFIG_TIPC is not set
-# CONFIG_ATM is not set
-# CONFIG_BRIDGE is not set
-# CONFIG_NET_DSA is not set
-# CONFIG_VLAN_8021Q is not set
-# CONFIG_DECNET is not set
-CONFIG_LLC=y
-# CONFIG_LLC2 is not set
-# CONFIG_IPX is not set
-# CONFIG_ATALK is not set
-# CONFIG_X25 is not set
-# CONFIG_LAPB is not set
-# CONFIG_ECONET is not set
-# CONFIG_WAN_ROUTER is not set
-# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
-
-#
-# Queueing/Scheduling
-#
-# CONFIG_NET_SCH_CBQ is not set
-# CONFIG_NET_SCH_HTB is not set
-# CONFIG_NET_SCH_HFSC is not set
-# CONFIG_NET_SCH_PRIO is not set
-# CONFIG_NET_SCH_MULTIQ is not set
-# CONFIG_NET_SCH_RED is not set
-# CONFIG_NET_SCH_SFQ is not set
-# CONFIG_NET_SCH_TEQL is not set
-# CONFIG_NET_SCH_TBF is not set
-# CONFIG_NET_SCH_GRED is not set
-# CONFIG_NET_SCH_DSMARK is not set
-# CONFIG_NET_SCH_NETEM is not set
-# CONFIG_NET_SCH_DRR is not set
-# CONFIG_NET_SCH_INGRESS is not set
-
-#
-# Classification
-#
-CONFIG_NET_CLS=y
-# CONFIG_NET_CLS_BASIC is not set
-# CONFIG_NET_CLS_TCINDEX is not set
-# CONFIG_NET_CLS_ROUTE4 is not set
-# CONFIG_NET_CLS_FW is not set
-# CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
-# CONFIG_NET_CLS_FLOW is not set
-# CONFIG_NET_CLS_CGROUP is not set
CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_STACK=32
-# CONFIG_NET_EMATCH_CMP is not set
-# CONFIG_NET_EMATCH_NBYTE is not set
-# CONFIG_NET_EMATCH_U32 is not set
-# CONFIG_NET_EMATCH_META is not set
-# CONFIG_NET_EMATCH_TEXT is not set
CONFIG_NET_CLS_ACT=y
-# CONFIG_NET_ACT_POLICE is not set
-# CONFIG_NET_ACT_GACT is not set
-# CONFIG_NET_ACT_MIRRED is not set
-# CONFIG_NET_ACT_IPT is not set
-# CONFIG_NET_ACT_NAT is not set
-# CONFIG_NET_ACT_PEDIT is not set
-# CONFIG_NET_ACT_SIMP is not set
-# CONFIG_NET_ACT_SKBEDIT is not set
-CONFIG_NET_SCH_FIFO=y
-# CONFIG_DCB is not set
-
-#
-# Network testing
-#
-# CONFIG_NET_PKTGEN is not set
-# CONFIG_NET_TCPPROBE is not set
-# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
-
-#
-# Packet Radio protocols
-#
-# CONFIG_AX25 is not set
-# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
-# CONFIG_BT is not set
-# CONFIG_AF_RXRPC is not set
-CONFIG_FIB_RULES=y
-CONFIG_WIRELESS=y
CONFIG_CFG80211=y
-# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_WIRELESS_OLD_REGULATORY=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_WIRELESS_EXT_SYSFS=y
-# CONFIG_LIB80211 is not set
CONFIG_MAC80211=y
-
-#
-# Rate control algorithm selection
-#
-CONFIG_MAC80211_RC_MINSTREL=y
-# CONFIG_MAC80211_RC_DEFAULT_PID is not set
-CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
-CONFIG_MAC80211_RC_DEFAULT="minstrel"
-# CONFIG_MAC80211_MESH is not set
CONFIG_MAC80211_LEDS=y
-# CONFIG_MAC80211_DEBUGFS is not set
-# CONFIG_MAC80211_DEBUG_MENU is not set
-# CONFIG_WIMAX is not set
CONFIG_RFKILL=y
-# CONFIG_RFKILL_INPUT is not set
-CONFIG_RFKILL_LEDS=y
-# CONFIG_NET_9P is not set
-
-#
-# Device Drivers
-#
-
-#
-# Generic Driver Options
-#
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_STANDALONE=y
-CONFIG_PREVENT_FIRMWARE_BUILD=y
-CONFIG_FW_LOADER=y
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE=""
-# CONFIG_DEBUG_DRIVER is not set
CONFIG_DEBUG_DEVRES=y
-# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
-CONFIG_PROC_EVENTS=y
-# CONFIG_MTD is not set
-# CONFIG_PARPORT is not set
-CONFIG_PNP=y
-CONFIG_PNP_DEBUG_MESSAGES=y
-
-#
-# Protocols
-#
-CONFIG_PNPACPI=y
-CONFIG_BLK_DEV=y
-# CONFIG_BLK_DEV_FD is not set
-# CONFIG_BLK_CPQ_DA is not set
-# CONFIG_BLK_CPQ_CISS_DA is not set
-# CONFIG_BLK_DEV_DAC960 is not set
-# CONFIG_BLK_DEV_UMEM is not set
-# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=y
-# CONFIG_BLK_DEV_CRYPTOLOOP is not set
-# CONFIG_BLK_DEV_NBD is not set
-# CONFIG_BLK_DEV_SX8 is not set
-# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
-# CONFIG_BLK_DEV_XIP is not set
-# CONFIG_CDROM_PKTCDVD is not set
-# CONFIG_ATA_OVER_ETH is not set
-# CONFIG_BLK_DEV_HD is not set
-CONFIG_MISC_DEVICES=y
-# CONFIG_IBM_ASM is not set
-# CONFIG_PHANTOM is not set
-# CONFIG_SGI_IOC4 is not set
-# CONFIG_TIFM_CORE is not set
-# CONFIG_ICS932S401 is not set
-# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_HP_ILO is not set
-# CONFIG_ISL29003 is not set
-# CONFIG_C2PORT is not set
-
-#
-# EEPROM support
-#
-# CONFIG_EEPROM_AT24 is not set
-# CONFIG_EEPROM_LEGACY is not set
-# CONFIG_EEPROM_93CX6 is not set
-CONFIG_HAVE_IDE=y
-# CONFIG_IDE is not set
-
-#
-# SCSI device support
-#
-# CONFIG_RAID_ATTRS is not set
-CONFIG_SCSI=y
-CONFIG_SCSI_DMA=y
-# CONFIG_SCSI_TGT is not set
-# CONFIG_SCSI_NETLINK is not set
-CONFIG_SCSI_PROC_FS=y
-
-#
-# SCSI support type (disk, tape, CD-ROM)
-#
CONFIG_BLK_DEV_SD=y
-# CONFIG_CHR_DEV_ST is not set
-# CONFIG_CHR_DEV_OSST is not set
CONFIG_BLK_DEV_SR=y
CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=y
-# CONFIG_CHR_DEV_SCH is not set
-
-#
-# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
-#
-# CONFIG_SCSI_MULTI_LUN is not set
CONFIG_SCSI_CONSTANTS=y
-# CONFIG_SCSI_LOGGING is not set
-# CONFIG_SCSI_SCAN_ASYNC is not set
-CONFIG_SCSI_WAIT_SCAN=m
-
-#
-# SCSI Transports
-#
CONFIG_SCSI_SPI_ATTRS=y
-# CONFIG_SCSI_FC_ATTRS is not set
-# CONFIG_SCSI_ISCSI_ATTRS is not set
-# CONFIG_SCSI_SAS_ATTRS is not set
-# CONFIG_SCSI_SAS_LIBSAS is not set
-# CONFIG_SCSI_SRP_ATTRS is not set
# CONFIG_SCSI_LOWLEVEL is not set
-# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
-# CONFIG_SCSI_DH is not set
-# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
-# CONFIG_ATA_NONSTANDARD is not set
-CONFIG_ATA_ACPI=y
-CONFIG_SATA_PMP=y
CONFIG_SATA_AHCI=y
-# CONFIG_SATA_SIL24 is not set
-CONFIG_ATA_SFF=y
-# CONFIG_SATA_SVW is not set
CONFIG_ATA_PIIX=y
-# CONFIG_SATA_MV is not set
-# CONFIG_SATA_NV is not set
-# CONFIG_PDC_ADMA is not set
-# CONFIG_SATA_QSTOR is not set
-# CONFIG_SATA_PROMISE is not set
-# CONFIG_SATA_SX4 is not set
-# CONFIG_SATA_SIL is not set
-# CONFIG_SATA_SIS is not set
-# CONFIG_SATA_ULI is not set
-# CONFIG_SATA_VIA is not set
-# CONFIG_SATA_VITESSE is not set
-# CONFIG_SATA_INIC162X is not set
-# CONFIG_PATA_ACPI is not set
-# CONFIG_PATA_ALI is not set
CONFIG_PATA_AMD=y
-# CONFIG_PATA_ARTOP is not set
-# CONFIG_PATA_ATIIXP is not set
-# CONFIG_PATA_CMD640_PCI is not set
-# CONFIG_PATA_CMD64X is not set
-# CONFIG_PATA_CS5520 is not set
-# CONFIG_PATA_CS5530 is not set
-# CONFIG_PATA_CYPRESS is not set
-# CONFIG_PATA_EFAR is not set
-# CONFIG_ATA_GENERIC is not set
-# CONFIG_PATA_HPT366 is not set
-# CONFIG_PATA_HPT37X is not set
-# CONFIG_PATA_HPT3X2N is not set
-# CONFIG_PATA_HPT3X3 is not set
-# CONFIG_PATA_IT821X is not set
-# CONFIG_PATA_IT8213 is not set
-# CONFIG_PATA_JMICRON is not set
-# CONFIG_PATA_TRIFLEX is not set
-# CONFIG_PATA_MARVELL is not set
-# CONFIG_PATA_MPIIX is not set
CONFIG_PATA_OLDPIIX=y
-# CONFIG_PATA_NETCELL is not set
-# CONFIG_PATA_NINJA32 is not set
-# CONFIG_PATA_NS87410 is not set
-# CONFIG_PATA_NS87415 is not set
-# CONFIG_PATA_OPTI is not set
-# CONFIG_PATA_OPTIDMA is not set
-# CONFIG_PATA_PCMCIA is not set
-# CONFIG_PATA_PDC_OLD is not set
-# CONFIG_PATA_RADISYS is not set
-# CONFIG_PATA_RZ1000 is not set
-# CONFIG_PATA_SC1200 is not set
-# CONFIG_PATA_SERVERWORKS is not set
-# CONFIG_PATA_PDC2027X is not set
-# CONFIG_PATA_SIL680 is not set
-# CONFIG_PATA_SIS is not set
-# CONFIG_PATA_VIA is not set
-# CONFIG_PATA_WINBOND is not set
CONFIG_PATA_SCH=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
-CONFIG_MD_AUTODETECT=y
-# CONFIG_MD_LINEAR is not set
-# CONFIG_MD_RAID0 is not set
-# CONFIG_MD_RAID1 is not set
-# CONFIG_MD_RAID10 is not set
-# CONFIG_MD_RAID456 is not set
-# CONFIG_MD_MULTIPATH is not set
-# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=y
-# CONFIG_DM_DEBUG is not set
-# CONFIG_DM_CRYPT is not set
-# CONFIG_DM_SNAPSHOT is not set
CONFIG_DM_MIRROR=y
CONFIG_DM_ZERO=y
-# CONFIG_DM_MULTIPATH is not set
-# CONFIG_DM_DELAY is not set
-# CONFIG_DM_UEVENT is not set
-# CONFIG_FUSION is not set
-
-#
-# IEEE 1394 (FireWire) support
-#
-
-#
-# Enable only one of the two stacks, unless you know what you are doing
-#
-# CONFIG_FIREWIRE is not set
-# CONFIG_IEEE1394 is not set
-# CONFIG_I2O is not set
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
-CONFIG_COMPAT_NET_DEV_OPS=y
-# CONFIG_IFB is not set
-# CONFIG_DUMMY is not set
-# CONFIG_BONDING is not set
-# CONFIG_MACVLAN is not set
-# CONFIG_EQUALIZER is not set
-# CONFIG_TUN is not set
-# CONFIG_VETH is not set
-# CONFIG_NET_SB1000 is not set
-# CONFIG_ARCNET is not set
-CONFIG_PHYLIB=y
-
-#
-# MII PHY device drivers
-#
-# CONFIG_MARVELL_PHY is not set
-# CONFIG_DAVICOM_PHY is not set
-# CONFIG_QSEMI_PHY is not set
-# CONFIG_LXT_PHY is not set
-# CONFIG_CICADA_PHY is not set
-# CONFIG_VITESSE_PHY is not set
-# CONFIG_SMSC_PHY is not set
-# CONFIG_BROADCOM_PHY is not set
-# CONFIG_ICPLUS_PHY is not set
-# CONFIG_REALTEK_PHY is not set
-# CONFIG_NATIONAL_PHY is not set
-# CONFIG_STE10XP is not set
-# CONFIG_LSI_ET1011C_PHY is not set
-# CONFIG_FIXED_PHY is not set
-# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
-CONFIG_MII=y
-# CONFIG_HAPPYMEAL is not set
-# CONFIG_SUNGEM is not set
-# CONFIG_CASSINI is not set
CONFIG_NET_VENDOR_3COM=y
-# CONFIG_VORTEX is not set
-# CONFIG_TYPHOON is not set
-# CONFIG_ETHOC is not set
-# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
-# CONFIG_DE2104X is not set
-# CONFIG_TULIP is not set
-# CONFIG_DE4X5 is not set
-# CONFIG_WINBOND_840 is not set
-# CONFIG_DM9102 is not set
-# CONFIG_ULI526X is not set
-# CONFIG_PCMCIA_XIRCOM is not set
-# CONFIG_HP100 is not set
-# CONFIG_IBM_NEW_EMAC_ZMII is not set
-# CONFIG_IBM_NEW_EMAC_RGMII is not set
-# CONFIG_IBM_NEW_EMAC_TAH is not set
-# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
-# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
-# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
-# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
-# CONFIG_PCNET32 is not set
-# CONFIG_AMD8111_ETH is not set
-# CONFIG_ADAPTEC_STARFIRE is not set
-# CONFIG_B44 is not set
CONFIG_FORCEDETH=y
-# CONFIG_FORCEDETH_NAPI is not set
CONFIG_E100=y
-# CONFIG_FEALNX is not set
-# CONFIG_NATSEMI is not set
-# CONFIG_NE2K_PCI is not set
-# CONFIG_8139CP is not set
CONFIG_8139TOO=y
-CONFIG_8139TOO_PIO=y
-# CONFIG_8139TOO_TUNE_TWISTER is not set
-# CONFIG_8139TOO_8129 is not set
-# CONFIG_8139_OLD_RX_RESET is not set
-# CONFIG_R6040 is not set
-# CONFIG_SIS900 is not set
-# CONFIG_EPIC100 is not set
-# CONFIG_SMSC9420 is not set
-# CONFIG_SUNDANCE is not set
-# CONFIG_TLAN is not set
-# CONFIG_VIA_RHINE is not set
-# CONFIG_SC92031 is not set
-# CONFIG_ATL2 is not set
-CONFIG_NETDEV_1000=y
-# CONFIG_ACENIC is not set
-# CONFIG_DL2K is not set
CONFIG_E1000=y
-# CONFIG_E1000E is not set
-# CONFIG_IP1000 is not set
-# CONFIG_IGB is not set
-# CONFIG_IGBVF is not set
-# CONFIG_NS83820 is not set
-# CONFIG_HAMACHI is not set
-# CONFIG_YELLOWFIN is not set
-# CONFIG_R8169 is not set
-# CONFIG_SIS190 is not set
-# CONFIG_SKGE is not set
CONFIG_SKY2=y
-# CONFIG_SKY2_DEBUG is not set
-# CONFIG_VIA_VELOCITY is not set
CONFIG_TIGON3=y
-# CONFIG_BNX2 is not set
-# CONFIG_QLA3XXX is not set
-# CONFIG_ATL1 is not set
-# CONFIG_ATL1E is not set
-# CONFIG_ATL1C is not set
-# CONFIG_JME is not set
-CONFIG_NETDEV_10000=y
-# CONFIG_CHELSIO_T1 is not set
-CONFIG_CHELSIO_T3_DEPENDS=y
-# CONFIG_CHELSIO_T3 is not set
-# CONFIG_ENIC is not set
-# CONFIG_IXGBE is not set
-# CONFIG_IXGB is not set
-# CONFIG_S2IO is not set
-# CONFIG_VXGE is not set
-# CONFIG_MYRI10GE is not set
-# CONFIG_NETXEN_NIC is not set
-# CONFIG_NIU is not set
-# CONFIG_MLX4_EN is not set
-# CONFIG_MLX4_CORE is not set
-# CONFIG_TEHUTI is not set
-# CONFIG_BNX2X is not set
-# CONFIG_QLGE is not set
-# CONFIG_SFC is not set
-# CONFIG_BE2NET is not set
CONFIG_TR=y
-# CONFIG_IBMOL is not set
-# CONFIG_3C359 is not set
-# CONFIG_TMS380TR is not set
-
-#
-# Wireless LAN
-#
-# CONFIG_WLAN_PRE80211 is not set
-CONFIG_WLAN_80211=y
-# CONFIG_PCMCIA_RAYCS is not set
-# CONFIG_LIBERTAS is not set
-# CONFIG_LIBERTAS_THINFIRM is not set
-# CONFIG_AIRO is not set
-# CONFIG_ATMEL is not set
-# CONFIG_AT76C50X_USB is not set
-# CONFIG_AIRO_CS is not set
-# CONFIG_PCMCIA_WL3501 is not set
-# CONFIG_PRISM54 is not set
-# CONFIG_USB_ZD1201 is not set
-# CONFIG_USB_NET_RNDIS_WLAN is not set
-# CONFIG_RTL8180 is not set
-# CONFIG_RTL8187 is not set
-# CONFIG_ADM8211 is not set
-# CONFIG_MAC80211_HWSIM is not set
-# CONFIG_MWL8K is not set
-# CONFIG_P54_COMMON is not set
-CONFIG_ATH5K=y
-# CONFIG_ATH5K_DEBUG is not set
-# CONFIG_ATH9K is not set
-# CONFIG_AR9170_USB is not set
-# CONFIG_IPW2100 is not set
-# CONFIG_IPW2200 is not set
-# CONFIG_IWLWIFI is not set
-# CONFIG_HOSTAP is not set
-# CONFIG_B43 is not set
-# CONFIG_B43LEGACY is not set
-# CONFIG_ZD1211RW is not set
-# CONFIG_RT2X00 is not set
-# CONFIG_HERMES is not set
-
-#
-# Enable WiMAX (Networking options) to see the WiMAX drivers
-#
-
-#
-# USB Network Adapters
-#
-# CONFIG_USB_CATC is not set
-# CONFIG_USB_KAWETH is not set
-# CONFIG_USB_PEGASUS is not set
-# CONFIG_USB_RTL8150 is not set
-# CONFIG_USB_USBNET is not set
-# CONFIG_USB_HSO is not set
CONFIG_NET_PCMCIA=y
-# CONFIG_PCMCIA_3C589 is not set
-# CONFIG_PCMCIA_3C574 is not set
-# CONFIG_PCMCIA_FMVJ18X is not set
-# CONFIG_PCMCIA_PCNET is not set
-# CONFIG_PCMCIA_NMCLAN is not set
-# CONFIG_PCMCIA_SMC91C92 is not set
-# CONFIG_PCMCIA_XIRC2PS is not set
-# CONFIG_PCMCIA_AXNET is not set
-# CONFIG_PCMCIA_IBMTR is not set
-# CONFIG_WAN is not set
CONFIG_FDDI=y
-# CONFIG_DEFXX is not set
-# CONFIG_SKFP is not set
-# CONFIG_HIPPI is not set
-# CONFIG_PPP is not set
-# CONFIG_SLIP is not set
-# CONFIG_NET_FC is not set
CONFIG_NETCONSOLE=y
-# CONFIG_NETCONSOLE_DYNAMIC is not set
-CONFIG_NETPOLL=y
-# CONFIG_NETPOLL_TRAP is not set
-CONFIG_NET_POLL_CONTROLLER=y
-# CONFIG_ISDN is not set
-# CONFIG_PHONE is not set
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-CONFIG_INPUT_FF_MEMLESS=y
CONFIG_INPUT_POLLDEV=y
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-# CONFIG_INPUT_JOYDEV is not set
CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
-CONFIG_INPUT_KEYBOARD=y
-CONFIG_KEYBOARD_ATKBD=y
-# CONFIG_KEYBOARD_SUNKBD is not set
-# CONFIG_KEYBOARD_LKKBD is not set
-# CONFIG_KEYBOARD_XTKBD is not set
-# CONFIG_KEYBOARD_NEWTON is not set
-# CONFIG_KEYBOARD_STOWAWAY is not set
-CONFIG_INPUT_MOUSE=y
-CONFIG_MOUSE_PS2=y
-CONFIG_MOUSE_PS2_ALPS=y
-CONFIG_MOUSE_PS2_LOGIPS2PP=y
-CONFIG_MOUSE_PS2_SYNAPTICS=y
-CONFIG_MOUSE_PS2_LIFEBOOK=y
-CONFIG_MOUSE_PS2_TRACKPOINT=y
-# CONFIG_MOUSE_PS2_ELANTECH is not set
-# CONFIG_MOUSE_PS2_TOUCHKIT is not set
-# CONFIG_MOUSE_SERIAL is not set
-# CONFIG_MOUSE_APPLETOUCH is not set
-# CONFIG_MOUSE_BCM5974 is not set
-# CONFIG_MOUSE_VSXXXAA is not set
CONFIG_INPUT_JOYSTICK=y
-# CONFIG_JOYSTICK_ANALOG is not set
-# CONFIG_JOYSTICK_A3D is not set
-# CONFIG_JOYSTICK_ADI is not set
-# CONFIG_JOYSTICK_COBRA is not set
-# CONFIG_JOYSTICK_GF2K is not set
-# CONFIG_JOYSTICK_GRIP is not set
-# CONFIG_JOYSTICK_GRIP_MP is not set
-# CONFIG_JOYSTICK_GUILLEMOT is not set
-# CONFIG_JOYSTICK_INTERACT is not set
-# CONFIG_JOYSTICK_SIDEWINDER is not set
-# CONFIG_JOYSTICK_TMDC is not set
-# CONFIG_JOYSTICK_IFORCE is not set
-# CONFIG_JOYSTICK_WARRIOR is not set
-# CONFIG_JOYSTICK_MAGELLAN is not set
-# CONFIG_JOYSTICK_SPACEORB is not set
-# CONFIG_JOYSTICK_SPACEBALL is not set
-# CONFIG_JOYSTICK_STINGER is not set
-# CONFIG_JOYSTICK_TWIDJOY is not set
-# CONFIG_JOYSTICK_ZHENHUA is not set
-# CONFIG_JOYSTICK_JOYDUMP is not set
-# CONFIG_JOYSTICK_XPAD is not set
CONFIG_INPUT_TABLET=y
-# CONFIG_TABLET_USB_ACECAD is not set
-# CONFIG_TABLET_USB_AIPTEK is not set
-# CONFIG_TABLET_USB_GTCO is not set
-# CONFIG_TABLET_USB_KBTAB is not set
-# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
-# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
-# CONFIG_TOUCHSCREEN_AD7879 is not set
-# CONFIG_TOUCHSCREEN_FUJITSU is not set
-# CONFIG_TOUCHSCREEN_GUNZE is not set
-# CONFIG_TOUCHSCREEN_ELO is not set
-# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
-# CONFIG_TOUCHSCREEN_MTOUCH is not set
-# CONFIG_TOUCHSCREEN_INEXIO is not set
-# CONFIG_TOUCHSCREEN_MK712 is not set
-# CONFIG_TOUCHSCREEN_PENMOUNT is not set
-# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
-# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
-# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
-# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
-# CONFIG_TOUCHSCREEN_TSC2007 is not set
CONFIG_INPUT_MISC=y
-# CONFIG_INPUT_PCSPKR is not set
-# CONFIG_INPUT_APANEL is not set
-# CONFIG_INPUT_ATLAS_BTNS is not set
-# CONFIG_INPUT_ATI_REMOTE is not set
-# CONFIG_INPUT_ATI_REMOTE2 is not set
-# CONFIG_INPUT_KEYSPAN_REMOTE is not set
-# CONFIG_INPUT_POWERMATE is not set
-# CONFIG_INPUT_YEALINK is not set
-# CONFIG_INPUT_CM109 is not set
-# CONFIG_INPUT_UINPUT is not set
-
-#
-# Hardware I/O ports
-#
-CONFIG_SERIO=y
-CONFIG_SERIO_I8042=y
-CONFIG_SERIO_SERPORT=y
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-CONFIG_SERIO_LIBPS2=y
-# CONFIG_SERIO_RAW is not set
-# CONFIG_GAMEPORT is not set
-
-#
-# Character devices
-#
-CONFIG_VT=y
-CONFIG_CONSOLE_TRANSLATIONS=y
-CONFIG_VT_CONSOLE=y
-CONFIG_HW_CONSOLE=y
CONFIG_VT_HW_CONSOLE_BINDING=y
-CONFIG_DEVKMEM=y
CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_COMPUTONE is not set
-# CONFIG_ROCKETPORT is not set
-# CONFIG_CYCLADES is not set
-# CONFIG_DIGIEPCA is not set
-# CONFIG_MOXA_INTELLIO is not set
-# CONFIG_MOXA_SMARTIO is not set
-# CONFIG_ISI is not set
-# CONFIG_SYNCLINK is not set
-# CONFIG_SYNCLINKMP is not set
-# CONFIG_SYNCLINK_GT is not set
-# CONFIG_N_HDLC is not set
-# CONFIG_RISCOM8 is not set
-# CONFIG_SPECIALIX is not set
-# CONFIG_SX is not set
-# CONFIG_RIO is not set
-# CONFIG_STALDRV is not set
-# CONFIG_NOZOMI is not set
-
-#
-# Serial drivers
-#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_FIX_EARLYCON_MEM=y
-CONFIG_SERIAL_8250_PCI=y
-CONFIG_SERIAL_8250_PNP=y
-# CONFIG_SERIAL_8250_CS is not set
CONFIG_SERIAL_8250_NR_UARTS=32
-CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y
-
-#
-# Non-8250 serial port support
-#
-CONFIG_SERIAL_CORE=y
-CONFIG_SERIAL_CORE_CONSOLE=y
-# CONFIG_SERIAL_JSM is not set
-CONFIG_UNIX98_PTYS=y
-# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
-# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
-# CONFIG_HW_RANDOM_TIMERIOMEM is not set
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
CONFIG_NVRAM=y
-# CONFIG_R3964 is not set
-# CONFIG_APPLICOM is not set
-
-#
-# PCMCIA character devices
-#
-# CONFIG_SYNCLINK_CS is not set
-# CONFIG_CARDMAN_4000 is not set
-# CONFIG_CARDMAN_4040 is not set
-# CONFIG_IPWIRELESS is not set
-# CONFIG_MWAVE is not set
-# CONFIG_PC8736x_GPIO is not set
-# CONFIG_RAW_DRIVER is not set
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
-# CONFIG_HANGCHECK_TIMER is not set
-# CONFIG_TCG_TPM is not set
-# CONFIG_TELCLOCK is not set
-CONFIG_DEVPORT=y
-CONFIG_I2C=y
-CONFIG_I2C_BOARDINFO=y
-# CONFIG_I2C_CHARDEV is not set
-CONFIG_I2C_HELPER_AUTO=y
-CONFIG_I2C_ALGOBIT=y
-
-#
-# I2C Hardware Bus support
-#
-
-#
-# PC SMBus host controller drivers
-#
-# CONFIG_I2C_ALI1535 is not set
-# CONFIG_I2C_ALI1563 is not set
-# CONFIG_I2C_ALI15X3 is not set
-# CONFIG_I2C_AMD756 is not set
-# CONFIG_I2C_AMD8111 is not set
CONFIG_I2C_I801=y
-# CONFIG_I2C_ISCH is not set
-# CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_SIS5595 is not set
-# CONFIG_I2C_SIS630 is not set
-# CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_VIA is not set
-# CONFIG_I2C_VIAPRO is not set
-
-#
-# I2C system bus drivers (mostly embedded / system-on-chip)
-#
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_SIMTEC is not set
-
-#
-# External I2C/SMBus adapter drivers
-#
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_TAOS_EVM is not set
-# CONFIG_I2C_TINY_USB is not set
-
-#
-# Graphics adapter I2C/DDC channel drivers
-#
-# CONFIG_I2C_VOODOO3 is not set
-
-#
-# Other I2C/SMBus bus drivers
-#
-# CONFIG_I2C_PCA_PLATFORM is not set
-# CONFIG_I2C_STUB is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_DS1682 is not set
-# CONFIG_SENSORS_PCF8574 is not set
-# CONFIG_PCF8575 is not set
-# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_MAX6875 is not set
-# CONFIG_SENSORS_TSL2550 is not set
-# CONFIG_I2C_DEBUG_CORE is not set
-# CONFIG_I2C_DEBUG_ALGO is not set
-# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
-# CONFIG_SPI is not set
-CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
-# CONFIG_GPIOLIB is not set
-# CONFIG_W1 is not set
-CONFIG_POWER_SUPPLY=y
-# CONFIG_POWER_SUPPLY_DEBUG is not set
-# CONFIG_PDA_POWER is not set
-# CONFIG_BATTERY_DS2760 is not set
-# CONFIG_BATTERY_BQ27x00 is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_ABITUGURU3 is not set
-# CONFIG_SENSORS_AD7414 is not set
-# CONFIG_SENSORS_AD7418 is not set
-# CONFIG_SENSORS_ADM1021 is not set
-# CONFIG_SENSORS_ADM1025 is not set
-# CONFIG_SENSORS_ADM1026 is not set
-# CONFIG_SENSORS_ADM1029 is not set
-# CONFIG_SENSORS_ADM1031 is not set
-# CONFIG_SENSORS_ADM9240 is not set
-# CONFIG_SENSORS_ADT7462 is not set
-# CONFIG_SENSORS_ADT7470 is not set
-# CONFIG_SENSORS_ADT7473 is not set
-# CONFIG_SENSORS_ADT7475 is not set
-# CONFIG_SENSORS_K8TEMP is not set
-# CONFIG_SENSORS_ASB100 is not set
-# CONFIG_SENSORS_ATK0110 is not set
-# CONFIG_SENSORS_ATXP1 is not set
-# CONFIG_SENSORS_DS1621 is not set
-# CONFIG_SENSORS_I5K_AMB is not set
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_F71882FG is not set
-# CONFIG_SENSORS_F75375S is not set
-# CONFIG_SENSORS_FSCHER is not set
-# CONFIG_SENSORS_FSCPOS is not set
-# CONFIG_SENSORS_FSCHMD is not set
-# CONFIG_SENSORS_G760A is not set
-# CONFIG_SENSORS_GL518SM is not set
-# CONFIG_SENSORS_GL520SM is not set
-# CONFIG_SENSORS_CORETEMP is not set
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_LM63 is not set
-# CONFIG_SENSORS_LM75 is not set
-# CONFIG_SENSORS_LM77 is not set
-# CONFIG_SENSORS_LM78 is not set
-# CONFIG_SENSORS_LM80 is not set
-# CONFIG_SENSORS_LM83 is not set
-# CONFIG_SENSORS_LM85 is not set
-# CONFIG_SENSORS_LM87 is not set
-# CONFIG_SENSORS_LM90 is not set
-# CONFIG_SENSORS_LM92 is not set
-# CONFIG_SENSORS_LM93 is not set
-# CONFIG_SENSORS_LTC4215 is not set
-# CONFIG_SENSORS_LTC4245 is not set
-# CONFIG_SENSORS_LM95241 is not set
-# CONFIG_SENSORS_MAX1619 is not set
-# CONFIG_SENSORS_MAX6650 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_SENSORS_SIS5595 is not set
-# CONFIG_SENSORS_DME1737 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47M192 is not set
-# CONFIG_SENSORS_SMSC47B397 is not set
-# CONFIG_SENSORS_ADS7828 is not set
-# CONFIG_SENSORS_THMC50 is not set
-# CONFIG_SENSORS_VIA686A is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_VT8231 is not set
-# CONFIG_SENSORS_W83781D is not set
-# CONFIG_SENSORS_W83791D is not set
-# CONFIG_SENSORS_W83792D is not set
-# CONFIG_SENSORS_W83793 is not set
-# CONFIG_SENSORS_W83L785TS is not set
-# CONFIG_SENSORS_W83L786NG is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_SENSORS_HDAPS is not set
-# CONFIG_SENSORS_LIS3LV02D is not set
-# CONFIG_SENSORS_APPLESMC is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
-CONFIG_THERMAL=y
-# CONFIG_THERMAL_HWMON is not set
CONFIG_WATCHDOG=y
-# CONFIG_WATCHDOG_NOWAYOUT is not set
-
-#
-# Watchdog Device Drivers
-#
-# CONFIG_SOFT_WATCHDOG is not set
-# CONFIG_ACQUIRE_WDT is not set
-# CONFIG_ADVANTECH_WDT is not set
-# CONFIG_ALIM1535_WDT is not set
-# CONFIG_ALIM7101_WDT is not set
-# CONFIG_SC520_WDT is not set
-# CONFIG_EUROTECH_WDT is not set
-# CONFIG_IB700_WDT is not set
-# CONFIG_IBMASR is not set
-# CONFIG_WAFER_WDT is not set
-# CONFIG_I6300ESB_WDT is not set
-# CONFIG_ITCO_WDT is not set
-# CONFIG_IT8712F_WDT is not set
-# CONFIG_IT87_WDT is not set
-# CONFIG_HP_WATCHDOG is not set
-# CONFIG_SC1200_WDT is not set
-# CONFIG_PC87413_WDT is not set
-# CONFIG_60XX_WDT is not set
-# CONFIG_SBC8360_WDT is not set
-# CONFIG_CPU5_WDT is not set
-# CONFIG_SMSC_SCH311X_WDT is not set
-# CONFIG_SMSC37B787_WDT is not set
-# CONFIG_W83627HF_WDT is not set
-# CONFIG_W83697HF_WDT is not set
-# CONFIG_W83697UG_WDT is not set
-# CONFIG_W83877F_WDT is not set
-# CONFIG_W83977F_WDT is not set
-# CONFIG_MACHZ_WDT is not set
-# CONFIG_SBC_EPX_C3_WATCHDOG is not set
-
-#
-# PCI-based Watchdog Cards
-#
-# CONFIG_PCIPCWATCHDOG is not set
-# CONFIG_WDTPCI is not set
-
-#
-# USB-based Watchdog Cards
-#
-# CONFIG_USBPCWATCHDOG is not set
-CONFIG_SSB_POSSIBLE=y
-
-#
-# Sonics Silicon Backplane
-#
-# CONFIG_SSB is not set
-
-#
-# Multifunction device drivers
-#
-# CONFIG_MFD_CORE is not set
-# CONFIG_MFD_SM501 is not set
-# CONFIG_HTC_PASIC3 is not set
-# CONFIG_TWL4030_CORE is not set
-# CONFIG_MFD_TMIO is not set
-# CONFIG_PMIC_DA903X is not set
-# CONFIG_MFD_WM8400 is not set
-# CONFIG_MFD_WM8350_I2C is not set
-# CONFIG_MFD_PCF50633 is not set
-# CONFIG_REGULATOR is not set
-
-#
-# Multimedia devices
-#
-
-#
-# Multimedia core support
-#
-# CONFIG_VIDEO_DEV is not set
-# CONFIG_DVB_CORE is not set
-# CONFIG_VIDEO_MEDIA is not set
-
-#
-# Multimedia drivers
-#
-CONFIG_DAB=y
-# CONFIG_USB_DABUSB is not set
-
-#
-# Graphics support
-#
CONFIG_AGP=y
CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
-# CONFIG_AGP_SIS is not set
-# CONFIG_AGP_VIA is not set
CONFIG_DRM=y
-# CONFIG_DRM_TDFX is not set
-# CONFIG_DRM_R128 is not set
-# CONFIG_DRM_RADEON is not set
-# CONFIG_DRM_I810 is not set
-# CONFIG_DRM_I830 is not set
CONFIG_DRM_I915=y
CONFIG_DRM_I915_KMS=y
-# CONFIG_DRM_MGA is not set
-# CONFIG_DRM_SIS is not set
-# CONFIG_DRM_VIA is not set
-# CONFIG_DRM_SAVAGE is not set
-# CONFIG_VGASTATE is not set
-# CONFIG_VIDEO_OUTPUT_CONTROL is not set
-CONFIG_FB=y
-# CONFIG_FIRMWARE_EDID is not set
-# CONFIG_FB_DDC is not set
-# CONFIG_FB_BOOT_VESA_SUPPORT is not set
-CONFIG_FB_CFB_FILLRECT=y
-CONFIG_FB_CFB_COPYAREA=y
-CONFIG_FB_CFB_IMAGEBLIT=y
-# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
-# CONFIG_FB_SYS_FILLRECT is not set
-# CONFIG_FB_SYS_COPYAREA is not set
-# CONFIG_FB_SYS_IMAGEBLIT is not set
-# CONFIG_FB_FOREIGN_ENDIAN is not set
-# CONFIG_FB_SYS_FOPS is not set
-# CONFIG_FB_SVGALIB is not set
-# CONFIG_FB_MACMODES is not set
-# CONFIG_FB_BACKLIGHT is not set
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
-
-#
-# Frame buffer hardware drivers
-#
-# CONFIG_FB_CIRRUS is not set
-# CONFIG_FB_PM2 is not set
-# CONFIG_FB_CYBER2000 is not set
-# CONFIG_FB_ARC is not set
-# CONFIG_FB_ASILIANT is not set
-# CONFIG_FB_IMSTT is not set
-# CONFIG_FB_VGA16 is not set
-# CONFIG_FB_UVESA is not set
-# CONFIG_FB_VESA is not set
CONFIG_FB_EFI=y
-# CONFIG_FB_N411 is not set
-# CONFIG_FB_HGA is not set
-# CONFIG_FB_S1D13XXX is not set
-# CONFIG_FB_NVIDIA is not set
-# CONFIG_FB_RIVA is not set
-# CONFIG_FB_LE80578 is not set
-# CONFIG_FB_INTEL is not set
-# CONFIG_FB_MATROX is not set
-# CONFIG_FB_RADEON is not set
-# CONFIG_FB_ATY128 is not set
-# CONFIG_FB_ATY is not set
-# CONFIG_FB_S3 is not set
-# CONFIG_FB_SAVAGE is not set
-# CONFIG_FB_SIS is not set
-# CONFIG_FB_VIA is not set
-# CONFIG_FB_NEOMAGIC is not set
-# CONFIG_FB_KYRO is not set
-# CONFIG_FB_3DFX is not set
-# CONFIG_FB_VOODOO1 is not set
-# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_TRIDENT is not set
-# CONFIG_FB_ARK is not set
-# CONFIG_FB_PM3 is not set
-# CONFIG_FB_CARMINE is not set
-# CONFIG_FB_GEODE is not set
-# CONFIG_FB_VIRTUAL is not set
-# CONFIG_FB_METRONOME is not set
-# CONFIG_FB_MB862XX is not set
-# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_BACKLIGHT_GENERIC=y
-# CONFIG_BACKLIGHT_PROGEAR is not set
-# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
-# CONFIG_BACKLIGHT_SAHARA is not set
-
-#
-# Display device support
-#
-# CONFIG_DISPLAY_SUPPORT is not set
-
-#
-# Console display driver support
-#
-CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
-CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_DUMMY_CONSOLE=y
-# CONFIG_FRAMEBUFFER_CONSOLE is not set
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_LOGO_LINUX_CLUT224=y
CONFIG_SOUND=y
-CONFIG_SOUND_OSS_CORE=y
CONFIG_SND=y
-CONFIG_SND_TIMER=y
-CONFIG_SND_PCM=y
-CONFIG_SND_HWDEP=y
-CONFIG_SND_JACK=y
CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQ_DUMMY=y
-CONFIG_SND_OSSEMUL=y
CONFIG_SND_MIXER_OSS=y
CONFIG_SND_PCM_OSS=y
-CONFIG_SND_PCM_OSS_PLUGINS=y
CONFIG_SND_SEQUENCER_OSS=y
CONFIG_SND_HRTIMER=y
-CONFIG_SND_SEQ_HRTIMER_DEFAULT=y
-CONFIG_SND_DYNAMIC_MINORS=y
-CONFIG_SND_SUPPORT_OLD_API=y
-CONFIG_SND_VERBOSE_PROCFS=y
-# CONFIG_SND_VERBOSE_PRINTK is not set
-# CONFIG_SND_DEBUG is not set
-CONFIG_SND_VMASTER=y
-CONFIG_SND_DRIVERS=y
-# CONFIG_SND_PCSP is not set
-# CONFIG_SND_DUMMY is not set
-# CONFIG_SND_VIRMIDI is not set
-# CONFIG_SND_MTPAV is not set
-# CONFIG_SND_SERIAL_U16550 is not set
-# CONFIG_SND_MPU401 is not set
-CONFIG_SND_PCI=y
-# CONFIG_SND_AD1889 is not set
-# CONFIG_SND_ALS300 is not set
-# CONFIG_SND_ALS4000 is not set
-# CONFIG_SND_ALI5451 is not set
-# CONFIG_SND_ATIIXP is not set
-# CONFIG_SND_ATIIXP_MODEM is not set
-# CONFIG_SND_AU8810 is not set
-# CONFIG_SND_AU8820 is not set
-# CONFIG_SND_AU8830 is not set
-# CONFIG_SND_AW2 is not set
-# CONFIG_SND_AZT3328 is not set
-# CONFIG_SND_BT87X is not set
-# CONFIG_SND_CA0106 is not set
-# CONFIG_SND_CMIPCI is not set
-# CONFIG_SND_OXYGEN is not set
-# CONFIG_SND_CS4281 is not set
-# CONFIG_SND_CS46XX is not set
-# CONFIG_SND_CS5530 is not set
-# CONFIG_SND_DARLA20 is not set
-# CONFIG_SND_GINA20 is not set
-# CONFIG_SND_LAYLA20 is not set
-# CONFIG_SND_DARLA24 is not set
-# CONFIG_SND_GINA24 is not set
-# CONFIG_SND_LAYLA24 is not set
-# CONFIG_SND_MONA is not set
-# CONFIG_SND_MIA is not set
-# CONFIG_SND_ECHO3G is not set
-# CONFIG_SND_INDIGO is not set
-# CONFIG_SND_INDIGOIO is not set
-# CONFIG_SND_INDIGODJ is not set
-# CONFIG_SND_INDIGOIOX is not set
-# CONFIG_SND_INDIGODJX is not set
-# CONFIG_SND_EMU10K1 is not set
-# CONFIG_SND_EMU10K1X is not set
-# CONFIG_SND_ENS1370 is not set
-# CONFIG_SND_ENS1371 is not set
-# CONFIG_SND_ES1938 is not set
-# CONFIG_SND_ES1968 is not set
-# CONFIG_SND_FM801 is not set
CONFIG_SND_HDA_INTEL=y
CONFIG_SND_HDA_HWDEP=y
-# CONFIG_SND_HDA_RECONFIG is not set
-# CONFIG_SND_HDA_INPUT_BEEP is not set
-CONFIG_SND_HDA_CODEC_REALTEK=y
-CONFIG_SND_HDA_CODEC_ANALOG=y
-CONFIG_SND_HDA_CODEC_SIGMATEL=y
-CONFIG_SND_HDA_CODEC_VIA=y
-CONFIG_SND_HDA_CODEC_ATIHDMI=y
-CONFIG_SND_HDA_CODEC_NVHDMI=y
-CONFIG_SND_HDA_CODEC_INTELHDMI=y
-CONFIG_SND_HDA_ELD=y
-CONFIG_SND_HDA_CODEC_CONEXANT=y
-CONFIG_SND_HDA_CODEC_CMEDIA=y
-CONFIG_SND_HDA_CODEC_SI3054=y
-CONFIG_SND_HDA_GENERIC=y
-# CONFIG_SND_HDA_POWER_SAVE is not set
-# CONFIG_SND_HDSP is not set
-# CONFIG_SND_HDSPM is not set
-# CONFIG_SND_HIFIER is not set
-# CONFIG_SND_ICE1712 is not set
-# CONFIG_SND_ICE1724 is not set
-# CONFIG_SND_INTEL8X0 is not set
-# CONFIG_SND_INTEL8X0M is not set
-# CONFIG_SND_KORG1212 is not set
-# CONFIG_SND_MAESTRO3 is not set
-# CONFIG_SND_MIXART is not set
-# CONFIG_SND_NM256 is not set
-# CONFIG_SND_PCXHR is not set
-# CONFIG_SND_RIPTIDE is not set
-# CONFIG_SND_RME32 is not set
-# CONFIG_SND_RME96 is not set
-# CONFIG_SND_RME9652 is not set
-# CONFIG_SND_SONICVIBES is not set
-# CONFIG_SND_TRIDENT is not set
-# CONFIG_SND_VIA82XX is not set
-# CONFIG_SND_VIA82XX_MODEM is not set
-# CONFIG_SND_VIRTUOSO is not set
-# CONFIG_SND_VX222 is not set
-# CONFIG_SND_YMFPCI is not set
-CONFIG_SND_USB=y
-# CONFIG_SND_USB_AUDIO is not set
-# CONFIG_SND_USB_USX2Y is not set
-# CONFIG_SND_USB_CAIAQ is not set
-# CONFIG_SND_USB_US122L is not set
-CONFIG_SND_PCMCIA=y
-# CONFIG_SND_VXPOCKET is not set
-# CONFIG_SND_PDAUDIOCF is not set
-# CONFIG_SND_SOC is not set
-# CONFIG_SOUND_PRIME is not set
-CONFIG_HID_SUPPORT=y
-CONFIG_HID=y
-CONFIG_HID_DEBUG=y
CONFIG_HIDRAW=y
-
-#
-# USB Input Devices
-#
-CONFIG_USB_HID=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y
-
-#
-# Special HID drivers
-#
-CONFIG_HID_A4TECH=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-# CONFIG_DRAGONRISE_FF is not set
-CONFIG_HID_EZKEY=y
-CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
-# CONFIG_LOGIRUMBLEPAD2_FF is not set
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
CONFIG_HID_NTRIG=y
CONFIG_HID_PANTHERLORD=y
CONFIG_PANTHERLORD_FF=y
@@ -1849,697 +250,90 @@ CONFIG_HID_PETALYNX=y
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
-# CONFIG_GREENASIA_FF is not set
CONFIG_HID_TOPSEED=y
-CONFIG_THRUSTMASTER_FF=y
-CONFIG_ZEROPLUS_FF=y
-CONFIG_USB_SUPPORT=y
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y
-CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
CONFIG_USB_DEBUG=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-
-#
-# Miscellaneous USB options
-#
CONFIG_USB_DEVICEFS=y
# CONFIG_USB_DEVICE_CLASS is not set
-# CONFIG_USB_DYNAMIC_MINORS is not set
-CONFIG_USB_SUSPEND=y
-# CONFIG_USB_OTG is not set
CONFIG_USB_MON=y
-# CONFIG_USB_WUSB is not set
-# CONFIG_USB_WUSB_CBAF is not set
-
-#
-# USB Host Controller Drivers
-#
-# CONFIG_USB_C67X00_HCD is not set
CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_OXU210HP_HCD is not set
-# CONFIG_USB_ISP116X_HCD is not set
-# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=y
-# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
-# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
-CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
-# CONFIG_USB_SL811_HCD is not set
-# CONFIG_USB_R8A66597_HCD is not set
-# CONFIG_USB_WHCI_HCD is not set
-# CONFIG_USB_HWA_HCD is not set
-
-#
-# USB Device Class drivers
-#
-# CONFIG_USB_ACM is not set
CONFIG_USB_PRINTER=y
-# CONFIG_USB_WDM is not set
-# CONFIG_USB_TMC is not set
-
-#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
-#
-
-#
-# also be needed; see USB_STORAGE Help for more info
-#
CONFIG_USB_STORAGE=y
-# CONFIG_USB_STORAGE_DEBUG is not set
-# CONFIG_USB_STORAGE_DATAFAB is not set
-# CONFIG_USB_STORAGE_FREECOM is not set
-# CONFIG_USB_STORAGE_ISD200 is not set
-# CONFIG_USB_STORAGE_USBAT is not set
-# CONFIG_USB_STORAGE_SDDR09 is not set
-# CONFIG_USB_STORAGE_SDDR55 is not set
-# CONFIG_USB_STORAGE_JUMPSHOT is not set
-# CONFIG_USB_STORAGE_ALAUDA is not set
-# CONFIG_USB_STORAGE_ONETOUCH is not set
-# CONFIG_USB_STORAGE_KARMA is not set
-# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
CONFIG_USB_LIBUSUAL=y
-
-#
-# USB Imaging devices
-#
-# CONFIG_USB_MDC800 is not set
-# CONFIG_USB_MICROTEK is not set
-
-#
-# USB port drivers
-#
-# CONFIG_USB_SERIAL is not set
-
-#
-# USB Miscellaneous drivers
-#
-# CONFIG_USB_EMI62 is not set
-# CONFIG_USB_EMI26 is not set
-# CONFIG_USB_ADUTUX is not set
-# CONFIG_USB_SEVSEG is not set
-# CONFIG_USB_RIO500 is not set
-# CONFIG_USB_LEGOTOWER is not set
-# CONFIG_USB_LCD is not set
-# CONFIG_USB_BERRY_CHARGE is not set
-# CONFIG_USB_LED is not set
-# CONFIG_USB_CYPRESS_CY7C63 is not set
-# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_IDMOUSE is not set
-# CONFIG_USB_FTDI_ELAN is not set
-# CONFIG_USB_APPLEDISPLAY is not set
-# CONFIG_USB_SISUSBVGA is not set
-# CONFIG_USB_LD is not set
-# CONFIG_USB_TRANCEVIBRATOR is not set
-# CONFIG_USB_IOWARRIOR is not set
-# CONFIG_USB_TEST is not set
-# CONFIG_USB_ISIGHTFW is not set
-# CONFIG_USB_VST is not set
-# CONFIG_USB_GADGET is not set
-
-#
-# OTG and related infrastructure
-#
-# CONFIG_NOP_USB_XCEIV is not set
-# CONFIG_UWB is not set
-# CONFIG_MMC is not set
-# CONFIG_MEMSTICK is not set
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-
-#
-# LED drivers
-#
-# CONFIG_LEDS_ALIX2 is not set
-# CONFIG_LEDS_PCA9532 is not set
-# CONFIG_LEDS_LP5521 is not set
-# CONFIG_LEDS_CLEVO_MAIL is not set
-# CONFIG_LEDS_PCA955X is not set
-# CONFIG_LEDS_BD2802 is not set
-
-#
-# LED Triggers
-#
-CONFIG_LEDS_TRIGGERS=y
-# CONFIG_LEDS_TRIGGER_TIMER is not set
-# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
-# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
-# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
-
-#
-# iptables trigger is under Netfilter config (LED target)
-#
-# CONFIG_ACCESSIBILITY is not set
-# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
-
-#
-# Reporting subsystems
-#
-# CONFIG_EDAC_DEBUG is not set
-# CONFIG_EDAC_MM_EDAC is not set
-CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
# CONFIG_RTC_HCTOSYS is not set
-# CONFIG_RTC_DEBUG is not set
-
-#
-# RTC interfaces
-#
-CONFIG_RTC_INTF_SYSFS=y
-CONFIG_RTC_INTF_PROC=y
-CONFIG_RTC_INTF_DEV=y
-# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
-# CONFIG_RTC_DRV_TEST is not set
-
-#
-# I2C RTC drivers
-#
-# CONFIG_RTC_DRV_DS1307 is not set
-# CONFIG_RTC_DRV_DS1374 is not set
-# CONFIG_RTC_DRV_DS1672 is not set
-# CONFIG_RTC_DRV_MAX6900 is not set
-# CONFIG_RTC_DRV_RS5C372 is not set
-# CONFIG_RTC_DRV_ISL1208 is not set
-# CONFIG_RTC_DRV_X1205 is not set
-# CONFIG_RTC_DRV_PCF8563 is not set
-# CONFIG_RTC_DRV_PCF8583 is not set
-# CONFIG_RTC_DRV_M41T80 is not set
-# CONFIG_RTC_DRV_S35390A is not set
-# CONFIG_RTC_DRV_FM3130 is not set
-# CONFIG_RTC_DRV_RX8581 is not set
-
-#
-# SPI RTC drivers
-#
-
-#
-# Platform RTC drivers
-#
-CONFIG_RTC_DRV_CMOS=y
-# CONFIG_RTC_DRV_DS1286 is not set
-# CONFIG_RTC_DRV_DS1511 is not set
-# CONFIG_RTC_DRV_DS1553 is not set
-# CONFIG_RTC_DRV_DS1742 is not set
-# CONFIG_RTC_DRV_STK17TA8 is not set
-# CONFIG_RTC_DRV_M48T86 is not set
-# CONFIG_RTC_DRV_M48T35 is not set
-# CONFIG_RTC_DRV_M48T59 is not set
-# CONFIG_RTC_DRV_BQ4802 is not set
-# CONFIG_RTC_DRV_V3020 is not set
-
-#
-# on-CPU RTC drivers
-#
CONFIG_DMADEVICES=y
-
-#
-# DMA Devices
-#
-# CONFIG_INTEL_IOATDMA is not set
-# CONFIG_AUXDISPLAY is not set
-# CONFIG_UIO is not set
-# CONFIG_STAGING is not set
-CONFIG_X86_PLATFORM_DEVICES=y
-# CONFIG_ACER_WMI is not set
-# CONFIG_ASUS_LAPTOP is not set
-# CONFIG_FUJITSU_LAPTOP is not set
-# CONFIG_MSI_LAPTOP is not set
-# CONFIG_PANASONIC_LAPTOP is not set
-# CONFIG_COMPAL_LAPTOP is not set
-# CONFIG_SONY_LAPTOP is not set
-# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_INTEL_MENLOW is not set
CONFIG_EEEPC_LAPTOP=y
-# CONFIG_ACPI_WMI is not set
-# CONFIG_ACPI_ASUS is not set
-# CONFIG_ACPI_TOSHIBA is not set
-
-#
-# Firmware Drivers
-#
-# CONFIG_EDD is not set
-CONFIG_FIRMWARE_MEMMAP=y
CONFIG_EFI_VARS=y
-# CONFIG_DELL_RBU is not set
-# CONFIG_DCDBAS is not set
-CONFIG_DMIID=y
-# CONFIG_ISCSI_IBFT_FIND is not set
-
-#
-# File systems
-#
-# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
-# CONFIG_EXT4_FS is not set
-CONFIG_JBD=y
-# CONFIG_JBD_DEBUG is not set
-CONFIG_FS_MBCACHE=y
-# CONFIG_REISERFS_FS is not set
-# CONFIG_JFS_FS is not set
-CONFIG_FS_POSIX_ACL=y
-CONFIG_FILE_LOCKING=y
-# CONFIG_XFS_FS is not set
-# CONFIG_GFS2_FS is not set
-# CONFIG_OCFS2_FS is not set
-# CONFIG_BTRFS_FS is not set
-CONFIG_DNOTIFY=y
-CONFIG_INOTIFY=y
-CONFIG_INOTIFY_USER=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
# CONFIG_PRINT_QUOTA_WARNING is not set
-CONFIG_QUOTA_TREE=y
-# CONFIG_QFMT_V1 is not set
CONFIG_QFMT_V2=y
-CONFIG_QUOTACTL=y
-# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=y
-# CONFIG_FUSE_FS is not set
-CONFIG_GENERIC_ACL=y
-
-#
-# Caches
-#
-# CONFIG_FSCACHE is not set
-
-#
-# CD-ROM/DVD Filesystems
-#
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
-# CONFIG_UDF_FS is not set
-
-#
-# DOS/FAT/NT Filesystems
-#
-CONFIG_FAT_FS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=437
-CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
-# CONFIG_NTFS_FS is not set
-
-#
-# Pseudo filesystems
-#
-CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
-CONFIG_PROC_VMCORE=y
-CONFIG_PROC_SYSCTL=y
-CONFIG_PROC_PAGE_MONITOR=y
-CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
-CONFIG_HUGETLB_PAGE=y
-# CONFIG_CONFIGFS_FS is not set
-CONFIG_MISC_FILESYSTEMS=y
-# CONFIG_ADFS_FS is not set
-# CONFIG_AFFS_FS is not set
-# CONFIG_ECRYPT_FS is not set
-# CONFIG_HFS_FS is not set
-# CONFIG_HFSPLUS_FS is not set
-# CONFIG_BEFS_FS is not set
-# CONFIG_BFS_FS is not set
-# CONFIG_EFS_FS is not set
-# CONFIG_CRAMFS is not set
-# CONFIG_SQUASHFS is not set
-# CONFIG_VXFS_FS is not set
-# CONFIG_MINIX_FS is not set
-# CONFIG_OMFS_FS is not set
-# CONFIG_HPFS_FS is not set
-# CONFIG_QNX4FS_FS is not set
-# CONFIG_ROMFS_FS is not set
-# CONFIG_SYSV_FS is not set
-# CONFIG_UFS_FS is not set
-# CONFIG_NILFS2_FS is not set
-CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
CONFIG_ROOT_NFS=y
-# CONFIG_NFSD is not set
-CONFIG_LOCKD=y
-CONFIG_LOCKD_V4=y
-CONFIG_NFS_ACL_SUPPORT=y
-CONFIG_NFS_COMMON=y
-CONFIG_SUNRPC=y
-CONFIG_SUNRPC_GSS=y
-CONFIG_RPCSEC_GSS_KRB5=y
-# CONFIG_RPCSEC_GSS_SPKM3 is not set
-# CONFIG_SMB_FS is not set
-# CONFIG_CIFS is not set
-# CONFIG_NCP_FS is not set
-# CONFIG_CODA_FS is not set
-# CONFIG_AFS_FS is not set
-
-#
-# Partition Types
-#
CONFIG_PARTITION_ADVANCED=y
-# CONFIG_ACORN_PARTITION is not set
CONFIG_OSF_PARTITION=y
CONFIG_AMIGA_PARTITION=y
-# CONFIG_ATARI_PARTITION is not set
CONFIG_MAC_PARTITION=y
-CONFIG_MSDOS_PARTITION=y
CONFIG_BSD_DISKLABEL=y
CONFIG_MINIX_SUBPARTITION=y
CONFIG_SOLARIS_X86_PARTITION=y
CONFIG_UNIXWARE_DISKLABEL=y
-# CONFIG_LDM_PARTITION is not set
CONFIG_SGI_PARTITION=y
-# CONFIG_ULTRIX_PARTITION is not set
CONFIG_SUN_PARTITION=y
CONFIG_KARMA_PARTITION=y
CONFIG_EFI_PARTITION=y
-# CONFIG_SYSV68_PARTITION is not set
-CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
-# CONFIG_NLS_CODEPAGE_737 is not set
-# CONFIG_NLS_CODEPAGE_775 is not set
-# CONFIG_NLS_CODEPAGE_850 is not set
-# CONFIG_NLS_CODEPAGE_852 is not set
-# CONFIG_NLS_CODEPAGE_855 is not set
-# CONFIG_NLS_CODEPAGE_857 is not set
-# CONFIG_NLS_CODEPAGE_860 is not set
-# CONFIG_NLS_CODEPAGE_861 is not set
-# CONFIG_NLS_CODEPAGE_862 is not set
-# CONFIG_NLS_CODEPAGE_863 is not set
-# CONFIG_NLS_CODEPAGE_864 is not set
-# CONFIG_NLS_CODEPAGE_865 is not set
-# CONFIG_NLS_CODEPAGE_866 is not set
-# CONFIG_NLS_CODEPAGE_869 is not set
-# CONFIG_NLS_CODEPAGE_936 is not set
-# CONFIG_NLS_CODEPAGE_950 is not set
-# CONFIG_NLS_CODEPAGE_932 is not set
-# CONFIG_NLS_CODEPAGE_949 is not set
-# CONFIG_NLS_CODEPAGE_874 is not set
-# CONFIG_NLS_ISO8859_8 is not set
-# CONFIG_NLS_CODEPAGE_1250 is not set
-# CONFIG_NLS_CODEPAGE_1251 is not set
CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
-# CONFIG_NLS_ISO8859_2 is not set
-# CONFIG_NLS_ISO8859_3 is not set
-# CONFIG_NLS_ISO8859_4 is not set
-# CONFIG_NLS_ISO8859_5 is not set
-# CONFIG_NLS_ISO8859_6 is not set
-# CONFIG_NLS_ISO8859_7 is not set
-# CONFIG_NLS_ISO8859_9 is not set
-# CONFIG_NLS_ISO8859_13 is not set
-# CONFIG_NLS_ISO8859_14 is not set
-# CONFIG_NLS_ISO8859_15 is not set
-# CONFIG_NLS_KOI8_R is not set
-# CONFIG_NLS_KOI8_U is not set
CONFIG_NLS_UTF8=y
-# CONFIG_DLM is not set
-
-#
-# Kernel hacking
-#
-CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_PRINTK_TIME=y
# CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_ENABLE_MUST_CHECK=y
-CONFIG_FRAME_WARN=2048
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
-CONFIG_DEBUG_FS=y
-# CONFIG_HEADERS_CHECK is not set
CONFIG_DEBUG_KERNEL=y
-# CONFIG_DEBUG_SHIRQ is not set
-# CONFIG_DETECT_SOFTLOCKUP is not set
-# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
-# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_RT_MUTEX_TESTER is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
-# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-CONFIG_STACKTRACE=y
-# CONFIG_DEBUG_KOBJECT is not set
-CONFIG_DEBUG_BUGVERBOSE=y
-# CONFIG_DEBUG_INFO is not set
-# CONFIG_DEBUG_VM is not set
-# CONFIG_DEBUG_VIRTUAL is not set
-# CONFIG_DEBUG_WRITECOUNT is not set
-CONFIG_DEBUG_MEMORY_INIT=y
-# CONFIG_DEBUG_LIST is not set
-# CONFIG_DEBUG_SG is not set
-# CONFIG_DEBUG_NOTIFIERS is not set
-CONFIG_ARCH_WANT_FRAME_POINTERS=y
-CONFIG_FRAME_POINTER=y
-# CONFIG_BOOT_PRINTK_DELAY is not set
-# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
-# CONFIG_KPROBES_SANITY_TEST is not set
-# CONFIG_BACKTRACE_SELF_TEST is not set
-# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
-# CONFIG_LKDTM is not set
-# CONFIG_FAULT_INJECTION is not set
-# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
-# CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_USER_STACKTRACE_SUPPORT=y
-CONFIG_NOP_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACER=y
-CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
-CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
-CONFIG_HAVE_DYNAMIC_FTRACE=y
-CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
-CONFIG_RING_BUFFER=y
-CONFIG_TRACING=y
-CONFIG_TRACING_SUPPORT=y
-
-#
-# Tracers
-#
-# CONFIG_FUNCTION_TRACER is not set
-# CONFIG_IRQSOFF_TRACER is not set
-# CONFIG_SYSPROF_TRACER is not set
-# CONFIG_SCHED_TRACER is not set
-# CONFIG_CONTEXT_SWITCH_TRACER is not set
-# CONFIG_EVENT_TRACER is not set
-# CONFIG_FTRACE_SYSCALLS is not set
-# CONFIG_BOOT_TRACER is not set
-# CONFIG_TRACE_BRANCH_PROFILING is not set
-# CONFIG_POWER_TRACER is not set
-# CONFIG_STACK_TRACER is not set
-# CONFIG_HW_BRANCH_TRACER is not set
-# CONFIG_KMEMTRACE is not set
-# CONFIG_WORKQUEUE_TRACER is not set
CONFIG_BLK_DEV_IO_TRACE=y
-# CONFIG_FTRACE_STARTUP_TEST is not set
-# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_DEBUG is not set
-# CONFIG_DMA_API_DEBUG is not set
-# CONFIG_SAMPLES is not set
-CONFIG_HAVE_ARCH_KGDB=y
-# CONFIG_KGDB is not set
-# CONFIG_STRICT_DEVMEM is not set
-CONFIG_X86_VERBOSE_BOOTUP=y
-CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PER_CPU_MAPS is not set
-# CONFIG_X86_PTDUMP is not set
-CONFIG_DEBUG_RODATA=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_NX_TEST=m
-# CONFIG_IOMMU_DEBUG is not set
-CONFIG_HAVE_MMIOTRACE_SUPPORT=y
-CONFIG_IO_DELAY_TYPE_0X80=0
-CONFIG_IO_DELAY_TYPE_0XED=1
-CONFIG_IO_DELAY_TYPE_UDELAY=2
-CONFIG_IO_DELAY_TYPE_NONE=3
-CONFIG_IO_DELAY_0X80=y
-# CONFIG_IO_DELAY_0XED is not set
-# CONFIG_IO_DELAY_UDELAY is not set
-# CONFIG_IO_DELAY_NONE is not set
-CONFIG_DEFAULT_IO_DELAY_TYPE=0
CONFIG_DEBUG_BOOT_PARAMS=y
-# CONFIG_CPA_DEBUG is not set
CONFIG_OPTIMIZE_INLINING=y
-
-#
-# Security options
-#
-CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
-# CONFIG_SECURITYFS is not set
CONFIG_SECURITY_NETWORK=y
-# CONFIG_SECURITY_NETWORK_XFRM is not set
-# CONFIG_SECURITY_PATH is not set
-CONFIG_SECURITY_FILE_CAPABILITIES=y
-# CONFIG_SECURITY_ROOTPLUG is not set
-CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1
CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_SECURITY_SELINUX_DEVELOP=y
-CONFIG_SECURITY_SELINUX_AVC_STATS=y
-CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
-# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
-# CONFIG_SECURITY_SMACK is not set
-# CONFIG_SECURITY_TOMOYO is not set
-# CONFIG_IMA is not set
-CONFIG_CRYPTO=y
-
-#
-# Crypto core or helper
-#
-# CONFIG_CRYPTO_FIPS is not set
-CONFIG_CRYPTO_ALGAPI=y
-CONFIG_CRYPTO_ALGAPI2=y
-CONFIG_CRYPTO_AEAD=y
-CONFIG_CRYPTO_AEAD2=y
-CONFIG_CRYPTO_BLKCIPHER=y
-CONFIG_CRYPTO_BLKCIPHER2=y
-CONFIG_CRYPTO_HASH=y
-CONFIG_CRYPTO_HASH2=y
-CONFIG_CRYPTO_RNG2=y
-CONFIG_CRYPTO_PCOMP=y
-CONFIG_CRYPTO_MANAGER=y
-CONFIG_CRYPTO_MANAGER2=y
-# CONFIG_CRYPTO_GF128MUL is not set
-# CONFIG_CRYPTO_NULL is not set
-CONFIG_CRYPTO_WORKQUEUE=y
-# CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=y
-# CONFIG_CRYPTO_TEST is not set
-
-#
-# Authenticated Encryption with Associated Data
-#
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_SEQIV is not set
-
-#
-# Block modes
-#
-CONFIG_CRYPTO_CBC=y
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_CTS is not set
-CONFIG_CRYPTO_ECB=y
-# CONFIG_CRYPTO_LRW is not set
-# CONFIG_CRYPTO_PCBC is not set
-# CONFIG_CRYPTO_XTS is not set
-
-#
-# Hash modes
-#
-CONFIG_CRYPTO_HMAC=y
-# CONFIG_CRYPTO_XCBC is not set
-
-#
-# Digest
-#
-# CONFIG_CRYPTO_CRC32C is not set
-# CONFIG_CRYPTO_CRC32C_INTEL is not set
-# CONFIG_CRYPTO_MD4 is not set
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_MICHAEL_MIC is not set
-# CONFIG_CRYPTO_RMD128 is not set
-# CONFIG_CRYPTO_RMD160 is not set
-# CONFIG_CRYPTO_RMD256 is not set
-# CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=y
-# CONFIG_CRYPTO_SHA256 is not set
-# CONFIG_CRYPTO_SHA512 is not set
-# CONFIG_CRYPTO_TGR192 is not set
-# CONFIG_CRYPTO_WP512 is not set
-
-#
-# Ciphers
-#
-CONFIG_CRYPTO_AES=y
-# CONFIG_CRYPTO_AES_X86_64 is not set
-# CONFIG_CRYPTO_AES_NI_INTEL is not set
-# CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=y
-# CONFIG_CRYPTO_BLOWFISH is not set
-# CONFIG_CRYPTO_CAMELLIA is not set
-# CONFIG_CRYPTO_CAST5 is not set
-# CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=y
-# CONFIG_CRYPTO_FCRYPT is not set
-# CONFIG_CRYPTO_KHAZAD is not set
-# CONFIG_CRYPTO_SALSA20 is not set
-# CONFIG_CRYPTO_SALSA20_X86_64 is not set
-# CONFIG_CRYPTO_SEED is not set
-# CONFIG_CRYPTO_SERPENT is not set
-# CONFIG_CRYPTO_TEA is not set
-# CONFIG_CRYPTO_TWOFISH is not set
-# CONFIG_CRYPTO_TWOFISH_X86_64 is not set
-
-#
-# Compression
-#
-# CONFIG_CRYPTO_DEFLATE is not set
-# CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
-
-#
-# Random Number Generation
-#
# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRYPTO_HW=y
-# CONFIG_CRYPTO_DEV_HIFN_795X is not set
-CONFIG_HAVE_KVM=y
-CONFIG_HAVE_KVM_IRQCHIP=y
-CONFIG_VIRTUALIZATION=y
-# CONFIG_KVM is not set
-# CONFIG_VIRTIO_PCI is not set
-# CONFIG_VIRTIO_BALLOON is not set
-CONFIG_BINARY_PRINTF=y
-
-#
-# Library routines
-#
-CONFIG_BITREVERSE=y
-CONFIG_GENERIC_FIND_FIRST_BIT=y
-CONFIG_GENERIC_FIND_NEXT_BIT=y
-CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
-# CONFIG_CRC16 is not set
CONFIG_CRC_T10DIF=y
-# CONFIG_CRC_ITU_T is not set
-CONFIG_CRC32=y
-# CONFIG_CRC7 is not set
-# CONFIG_LIBCRC32C is not set
-CONFIG_ZLIB_INFLATE=y
-CONFIG_DECOMPRESS_GZIP=y
-CONFIG_DECOMPRESS_BZIP2=y
-CONFIG_DECOMPRESS_LZMA=y
-CONFIG_HAS_IOMEM=y
-CONFIG_HAS_IOPORT=y
-CONFIG_HAS_DMA=y
-CONFIG_NLATTR=y
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 0350311906ae..2d93bdbc9ac0 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -34,7 +34,7 @@
#include <asm/ia32.h>
#undef WARN_OLD
-#undef CORE_DUMP /* probably broken */
+#undef CORE_DUMP /* definitely broken */
static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
static int load_aout_library(struct file *);
@@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end)
* macros to write out all the necessary info.
*/
-static int dump_write(struct file *file, const void *addr, int nr)
-{
- return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
+#include <linux/coredump.h>
#define DUMP_WRITE(addr, nr) \
if (!dump_write(file, (void *)(addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(offset) \
- if (file->f_op->llseek) { \
- if (file->f_op->llseek(file, (offset), 0) != (offset)) \
- goto end_coredump; \
- } else \
- file->f_pos = (offset)
+#define DUMP_SEEK(offset) \
+ if (!dump_seek(file, offset)) \
+ goto end_coredump;
#define START_DATA() (u.u_tsize << PAGE_SHIFT)
#define START_STACK(u) (u.start_stack)
@@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
dump_size = dump.u_ssize << PAGE_SHIFT;
DUMP_WRITE(dump_start, dump_size);
}
- /*
- * Finally dump the task struct. Not be used by gdb, but
- * could be useful
- */
- set_fs(KERNEL_DS);
- DUMP_WRITE(current, sizeof(*current));
end_coredump:
set_fs(fs);
return has_dumped;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e790bc1fbfa3..518bb99c3394 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -50,7 +50,12 @@
/*
* Reload arg registers from stack in case ptrace changed them.
* We don't reload %eax because syscall_trace_enter() returned
- * the value it wants us to use in the table lookup.
+ * the %rax value we should see. Instead, we just truncate that
+ * value to 32 bits again as we did on entry from user mode.
+ * If it's a new value set by user_regset during entry tracing,
+ * this matches the normal truncation of the user-mode value.
+ * If it's -1 to make us punt the syscall, then (u32)-1 is still
+ * an appropriately invalid value.
*/
.macro LOAD_ARGS32 offset, _r9=0
.if \_r9
@@ -60,6 +65,7 @@
movl \offset+48(%rsp),%edx
movl \offset+56(%rsp),%esi
movl \offset+64(%rsp),%edi
+ movl %eax,%eax /* zero extension */
.endm
.macro CFI_STARTPROC32 simple
@@ -153,7 +159,7 @@ ENTRY(ia32_sysenter_target)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
sysenter_do_call:
IA32_ARG_FIXUP
@@ -195,7 +201,7 @@ sysexit_from_sys_call:
movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
call audit_syscall_entry
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
movl %ebx,%edi /* reload 1st syscall arg */
movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
@@ -248,7 +254,7 @@ sysenter_tracesys:
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
CFI_ENDPROC
@@ -314,7 +320,7 @@ ENTRY(ia32_cstar_target)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
CFI_REMEMBER_STATE
jnz cstar_tracesys
- cmpl $IA32_NR_syscalls-1,%eax
+ cmpq $IA32_NR_syscalls-1,%rax
ja ia32_badsys
cstar_do_call:
IA32_ARG_FIXUP 1
@@ -367,7 +373,7 @@ cstar_tracesys:
LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
jmp cstar_do_call
END(ia32_cstar_target)
@@ -425,7 +431,7 @@ ENTRY(ia32_syscall)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
jnz ia32_tracesys
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
ia32_do_call:
IA32_ARG_FIXUP
@@ -444,7 +450,7 @@ ia32_tracesys:
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
- cmpl $(IA32_NR_syscalls-1),%eax
+ cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
jmp ia32_do_call
END(ia32_syscall)
@@ -842,4 +848,7 @@ ia32_sys_call_table:
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
.quad compat_sys_recvmmsg
+ .quad sys_fanotify_init
+ .quad sys32_fanotify_mark
+ .quad sys_prlimit64 /* 340 */
ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 626be156d88d..849813f398e7 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -51,7 +51,7 @@
#define AA(__x) ((unsigned long)(__x))
-asmlinkage long sys32_truncate64(char __user *filename,
+asmlinkage long sys32_truncate64(const char __user *filename,
unsigned long offset_low,
unsigned long offset_high)
{
@@ -96,7 +96,7 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
return 0;
}
-asmlinkage long sys32_stat64(char __user *filename,
+asmlinkage long sys32_stat64(const char __user *filename,
struct stat64 __user *statbuf)
{
struct kstat stat;
@@ -107,7 +107,7 @@ asmlinkage long sys32_stat64(char __user *filename,
return ret;
}
-asmlinkage long sys32_lstat64(char __user *filename,
+asmlinkage long sys32_lstat64(const char __user *filename,
struct stat64 __user *statbuf)
{
struct kstat stat;
@@ -126,7 +126,7 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
return ret;
}
-asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
+asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
struct stat64 __user *statbuf, int flag)
{
struct kstat stat;
@@ -408,8 +408,8 @@ asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
((loff_t)AA(poshi) << 32) | AA(poslo));
}
-asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
- u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
+ u32 count, u32 poslo, u32 poshi)
{
return sys_pwrite64(fd, ubuf, count,
((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -449,7 +449,7 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
return ret;
}
-asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv,
compat_uptr_t __user *envp, struct pt_regs *regs)
{
long error;
@@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
((u64)len_hi << 32) | len_lo);
}
+
+asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags,
+ u32 mask_lo, u32 mask_hi,
+ int fd, const char __user *pathname)
+{
+ return sys_fanotify_mark(fanotify_fd, flags,
+ ((u64)mask_hi << 32) | mask_lo,
+ fd, pathname);
+}
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 493092efaa3b..6fa90a845e4c 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -3,24 +3,23 @@ include include/asm-generic/Kbuild.asm
header-y += boot.h
header-y += bootparam.h
header-y += debugreg.h
+header-y += e820.h
+header-y += hw_breakpoint.h
+header-y += hyperv.h
+header-y += ist.h
header-y += ldt.h
+header-y += mce.h
header-y += msr-index.h
+header-y += msr.h
+header-y += mtrr.h
+header-y += posix_types_32.h
+header-y += posix_types_64.h
header-y += prctl.h
+header-y += processor-flags.h
header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
-header-y += processor-flags.h
-header-y += hw_breakpoint.h
-header-y += hyperv.h
-
-unifdef-y += e820.h
-unifdef-y += ist.h
-unifdef-y += mce.h
-unifdef-y += msr.h
-unifdef-y += mtrr.h
-unifdef-y += posix_types_32.h
-unifdef-y += posix_types_64.h
-unifdef-y += unistd_32.h
-unifdef-y += unistd_64.h
-unifdef-y += vm86.h
-unifdef-y += vsyscall.h
+header-y += unistd_32.h
+header-y += unistd_64.h
+header-y += vm86.h
+header-y += vsyscall.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aa2c39d968fc..55d106b5e31b 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -93,6 +93,9 @@ extern u8 acpi_sci_flags;
extern int acpi_sci_override_gsi;
void acpi_pic_sci_set_trigger(unsigned int, u16);
+extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity);
+
static inline void disable_acpi(void)
{
acpi_disabled = 1;
@@ -134,7 +137,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
boot_cpu_data.x86_model <= 0x05 &&
boot_cpu_data.x86_mask < 0x0A)
return 1;
- else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+ else if (c1e_detected)
return 1;
else
return max_cstate;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 03b6bb5394a0..76561d20ea2f 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/stringify.h>
+#include <linux/jump_label.h>
#include <asm/asm.h>
/*
@@ -45,10 +46,9 @@
struct alt_instr {
u8 *instr; /* original instruction */
u8 *replacement;
- u8 cpuid; /* cpuid bit set for replacement */
+ u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
- u8 pad1;
#ifdef CONFIG_X86_64
u32 pad2;
#endif
@@ -86,9 +86,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
_ASM_ALIGN "\n" \
_ASM_PTR "661b\n" /* label */ \
_ASM_PTR "663f\n" /* new instruction */ \
- " .byte " __stringify(feature) "\n" /* feature bit */ \
+ " .word " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .discard,\"aw\",@progbits\n" \
" .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
".previous\n" \
".section .altinstr_replacement, \"ax\"\n" \
@@ -159,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
/*
* Clear and restore the kernel write-protection flag on the local CPU.
* Allows the kernel to edit read-only pages.
@@ -179,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+#define IDEAL_NOP_SIZE_5 5
+extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+extern void arch_init_ideal_nop5(void);
+#else
+static inline void arch_init_ideal_nop5(void) {}
+#endif
+
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..a6863a2dec1f 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -24,11 +24,11 @@
#ifdef CONFIG_AMD_IOMMU
-extern void amd_iommu_detect(void);
+extern int amd_iommu_detect(void);
#else
-static inline void amd_iommu_detect(void) { }
+static inline int amd_iommu_detect(void) { return -ENODEV; }
#endif
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index d2544f1d705d..916bc8111a01 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
*
* This program is free software; you can redistribute it and/or modify it
@@ -38,4 +38,10 @@ static inline void amd_iommu_stats_init(void) { }
#endif /* !CONFIG_AMD_IOMMU_STATS */
+static inline bool is_rd890_iommu(struct pci_dev *pdev)
+{
+ return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
+ (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
+}
+
#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 7014e88bc779..e3509fc303bf 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -368,6 +368,9 @@ struct amd_iommu {
/* capabilities of that IOMMU read from ACPI */
u32 cap;
+ /* flags read from acpi table */
+ u8 acpi_flags;
+
/*
* Capability pointer. There could be more than one IOMMU per PCI
* device function if there are more than one AMD IOMMU capability
@@ -411,6 +414,24 @@ struct amd_iommu {
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
+
+ /*
+ * We can't rely on the BIOS to restore all values on reinit, so we
+ * need to stash them
+ */
+
+ /* The iommu BAR */
+ u32 stored_addr_lo;
+ u32 stored_addr_hi;
+
+ /*
+ * Each iommu has 6 l1s, each of which is documented as having 0x12
+ * registers
+ */
+ u32 stored_l1[6][0x12];
+
+ /* The l2 indirect registers */
+ u32 stored_l2[0x83];
};
/*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
index af00bd1d2089..c8517f81b21e 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
#include <linux/pci.h>
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
struct bootnode;
extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_get_nodes(struct bootnode *nodes);
extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int k8_scan_nodes(void);
-#ifdef CONFIG_K8_NB
-extern int num_k8_northbridges;
+struct k8_northbridge_info {
+ u16 num;
+ u8 gart_supported;
+ struct pci_dev **nb_misc;
+};
+extern struct k8_northbridge_info k8_northbridges;
+
+#ifdef CONFIG_AMD_NB
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
- return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+ return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
}
#else
-#define num_k8_northbridges 0
static inline struct pci_dev *node_to_k8_nb_misc(int node)
{
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
#endif
-#endif /* _ASM_X86_K8_H */
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index c74a2eebe570..2fefa501d3ba 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,8 +54,6 @@ extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
-extern unsigned int boot_cpu_id;
-extern int disable_apbt_percpu;
extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae44..286de34b0ed6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
}
#endif
-extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
-extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
-
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
#else /* !CONFIG_X86_LOCAL_APIC */
static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f08..a859ca461fb0 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
#define APIC_EILVT_MSG_FIX 0x0
#define APIC_EILVT_MSG_SMI 0x2
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 545776efeb16..903683b07e42 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -309,7 +309,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr)
{
return ((1UL << (nr % BITS_PER_LONG)) &
- (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+ (addr[nr / BITS_PER_LONG])) != 0;
}
static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
@@ -440,6 +440,8 @@ static inline int fls(int x)
#ifdef __KERNEL__
+#include <asm-generic/bitops/find.h>
+
#include <asm-generic/bitops/sched.h>
#define ARCH_HAS_FAST_MULTIPLIER 1
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d83c716..8e6218550e77 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -70,6 +70,14 @@ struct sys_desc_table {
__u8 table[14];
};
+/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
+struct olpc_ofw_header {
+ __u32 ofw_magic; /* OFW signature */
+ __u32 ofw_version;
+ __u32 cif_handler; /* callback into OFW */
+ __u32 irq_desc_table;
+} __attribute__((packed));
+
struct efi_info {
__u32 efi_loader_signature;
__u32 efi_systab;
@@ -92,7 +100,8 @@ struct boot_params {
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
struct sys_desc_table sys_desc_table; /* 0x0a0 */
- __u8 _pad4[144]; /* 0x0b0 */
+ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
+ __u8 _pad4[128]; /* 0x0c0 */
struct edid_info edid_info; /* 0x140 */
struct efi_info efi_info; /* 0x1c0 */
__u32 alt_mem_k; /* 0x1e0 */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0918654305af..0d467b338835 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,9 +62,9 @@ struct cal_chipset_ops {
extern int use_calgary;
#ifdef CONFIG_CALGARY_IOMMU
-extern void detect_calgary(void);
+extern int detect_calgary(void);
#else
-static inline void detect_calgary(void) { return; }
+static inline int detect_calgary(void) { return -ENODEV; }
#endif
#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0e63c9a2a8d0..30af5a832163 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with
/*
- * 64-bit system call stack frame layout defines and helpers,
- * for assembly code:
+ * 64-bit system call stack frame layout defines and helpers, for
+ * assembly code (note that the seemingly unnecessary parentheses
+ * are to prevent cpp from inserting spaces in expressions that get
+ * passed to macros):
*/
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
+#define R15 (0)
+#define R14 (8)
+#define R13 (16)
+#define R12 (24)
+#define RBP (32)
+#define RBX (40)
/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
+#define R11 (48)
+#define R10 (56)
+#define R9 (64)
+#define R8 (72)
+#define RAX (80)
+#define RCX (88)
+#define RDX (96)
+#define RSI (104)
+#define RDI (112)
+#define ORIG_RAX (120) /* + error_code */
/* end of arguments */
/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
+#define RIP (128)
+#define CS (136)
+#define EFLAGS (144)
+#define RSP (152)
+#define SS (160)
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
@@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#define ARG_SKIP 9*8
+#define ARG_SKIP (9*8)
.macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
skipr8910=0, skiprdx=0
@@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#define REST_SKIP 6*8
+#define REST_SKIP (6*8)
.macro SAVE_REST
subq $REST_SKIP, %rsp
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 8859e12dd3cf..284a6e8f7ce1 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -11,38 +11,42 @@
extern void __xchg_wrong_size(void);
/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
*/
-
-struct __xchg_dummy {
- unsigned long a[100];
-};
-#define __xg(x) ((struct __xchg_dummy *)(x))
-
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
switch (size) { \
case 1: \
- asm volatile("xchgb %b0,%1" \
- : "=q" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile("xchgb %0,%1" \
+ : "=q" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile("xchgw %w0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile("xchgw %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 4: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
asm volatile("xchgl %0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
default: \
__xchg_wrong_size(); \
} \
@@ -53,60 +57,33 @@ struct __xchg_dummy {
__xchg((v), (ptr), sizeof(*ptr))
/*
- * The semantics of XCHGCMP8B are a bit strange, this is why
- * there is a loop and the loading of %%eax and %%edx has to
- * be inside. This inlines well in most cases, the cached
- * cost is around ~38 cycles. (in the future we might want
- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
- * might have an implicit FPU-save as a cost, so it's not
- * clear which path to go.)
+ * CMPXCHG8B only writes to the target if we had the previous
+ * value in registers, otherwise it acts as a read and gives us the
+ * "new previous" value. That is why there is a loop. Preloading
+ * EDX:EAX is a performance optimization: in the common case it means
+ * we need only one locked operation.
*
- * cmpxchg8b must be used with the lock prefix here to allow
- * the instruction to be executed atomically, see page 3-102
- * of the instruction set reference 24319102.pdf. We need
- * the reader side to see the coherent 64bit value.
+ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
+ * least an FPU save and/or %cr0.ts manipulation.
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow the
+ * instruction to be executed atomically. We need to have the reader
+ * side to see the coherent 64bit value.
*/
-static inline void __set_64bit(unsigned long long *ptr,
- unsigned int low, unsigned int high)
+static inline void set_64bit(volatile u64 *ptr, u64 value)
{
+ u32 low = value;
+ u32 high = value >> 32;
+ u64 prev = *ptr;
+
asm volatile("\n1:\t"
- "movl (%0), %%eax\n\t"
- "movl 4(%0), %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b (%0)\n\t"
+ LOCK_PREFIX "cmpxchg8b %0\n\t"
"jnz 1b"
- : /* no outputs */
- : "D"(ptr),
- "b"(low),
- "c"(high)
- : "ax", "dx", "memory");
-}
-
-static inline void __set_64bit_constant(unsigned long long *ptr,
- unsigned long long value)
-{
- __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
-}
-
-#define ll_low(x) *(((unsigned int *)&(x)) + 0)
-#define ll_high(x) *(((unsigned int *)&(x)) + 1)
-
-static inline void __set_64bit_var(unsigned long long *ptr,
- unsigned long long value)
-{
- __set_64bit(ptr, ll_low(value), ll_high(value));
+ : "=m" (*ptr), "+A" (prev)
+ : "b" (low), "c" (high)
+ : "memory");
}
-#define set_64bit(ptr, value) \
- (__builtin_constant_p((value)) \
- ? __set_64bit_constant((ptr), (value)) \
- : __set_64bit_var((ptr), (value)))
-
-#define _set_64bit(ptr, value) \
- (__builtin_constant_p(value) \
- ? __set_64bit(ptr, (unsigned int)(value), \
- (unsigned int)((value) >> 32)) \
- : __set_64bit(ptr, ll_low((value)), ll_high((value))))
-
extern void __cmpxchg_wrong_size(void);
/*
@@ -121,23 +98,32 @@ extern void __cmpxchg_wrong_size(void);
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case 1: \
- asm volatile(lock "cmpxchgb %b1,%2" \
- : "=a"(__ret) \
- : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile(lock "cmpxchgw %w1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile(lock "cmpxchgl %1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
default: \
__cmpxchg_wrong_size(); \
} \
@@ -175,32 +161,28 @@ extern void __cmpxchg_wrong_size(void);
(unsigned long long)(n)))
#endif
-static inline unsigned long long __cmpxchg64(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
+ u64 prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
: "memory");
return prev;
}
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
- unsigned long long old,
- unsigned long long new)
+static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- unsigned long long prev;
- asm volatile("cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
+ u64 prev;
+ asm volatile("cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
: "memory");
return prev;
}
@@ -264,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
* to simulate the cmpxchg8b on the 80386 and 80486 CPU.
*/
-extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
-
#define cmpxchg64(ptr, o, n) \
({ \
__typeof__(*(ptr)) __ret; \
@@ -283,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
__ret; })
-
-#define cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- if (likely(boot_cpu_data.x86 > 4)) \
- __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- else \
- __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
- (unsigned long long)(o), \
- (unsigned long long)(n)); \
- __ret; \
-})
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io("call cmpxchg8b_emu", \
+ "cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
#endif
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 485ae415faec..423ae58aa020 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,51 +3,60 @@
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define __xg(x) ((volatile long *)(x))
-
-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
+static inline void set_64bit(volatile u64 *ptr, u64 val)
{
*ptr = val;
}
-#define _set_64bit set_64bit
-
extern void __xchg_wrong_size(void);
extern void __cmpxchg_wrong_size(void);
/*
- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
- * Note 2: xchg has side effect, so that attribute volatile is necessary,
- * but generally the primitive is invalid, *ptr is output argument. --ANK
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
*/
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
switch (size) { \
case 1: \
- asm volatile("xchgb %b0,%1" \
- : "=q" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile("xchgb %0,%1" \
+ : "=q" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile("xchgw %w0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile("xchgw %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile("xchgl %k0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile("xchgl %0,%1" \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
case 8: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
asm volatile("xchgq %0,%1" \
- : "=r" (__x) \
- : "m" (*__xg(ptr)), "0" (__x) \
+ : "=r" (__x), "+m" (*__ptr) \
+ : "0" (__x) \
: "memory"); \
break; \
+ } \
default: \
__xchg_wrong_size(); \
} \
@@ -71,29 +80,41 @@ extern void __cmpxchg_wrong_size(void);
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case 1: \
- asm volatile(lock "cmpxchgb %b1,%2" \
- : "=a"(__ret) \
- : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 2: \
- asm volatile(lock "cmpxchgw %w1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 4: \
- asm volatile(lock "cmpxchgl %k1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
case 8: \
- asm volatile(lock "cmpxchgq %1,%2" \
- : "=a"(__ret) \
- : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm volatile(lock "cmpxchgq %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
: "memory"); \
break; \
+ } \
default: \
__cmpxchg_wrong_size(); \
} \
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 306160e58b48..1d9cd27c2920 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -205,7 +205,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
return (u32)(unsigned long)uptr;
}
-static inline void __user *compat_alloc_user_space(long len)
+static inline void __user *arch_compat_alloc_user_space(long len)
{
struct pt_regs *regs = task_pt_regs(current);
return (void __user *)regs->sp - len;
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19c..4fab24de26b1 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
DECLARE_PER_CPU(int, cpu_state);
-extern unsigned int boot_cpu_id;
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 468145914389..220e2ea08e80 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,7 +6,7 @@
#include <asm/required-features.h>
-#define NCAPINTS 9 /* N 32-bit words worth of info */
+#define NCAPINTS 10 /* N 32-bit words worth of info */
/*
* Note: If the comment begins with a quoted string, that string is used
@@ -89,7 +89,7 @@
#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
+ /* 21 available, was AMD_C1E */
#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -124,6 +124,8 @@
#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -150,29 +152,48 @@
#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
+#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
/*
* Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc
+ * CPUID levels like 0x6, 0xA etc, word 7
*/
#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
+#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */
-/* Virtualization flags: Linux defined */
+/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
+#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -194,7 +215,9 @@ extern const char * const x86_power_flags[32];
(((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
(((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
? 1 : \
test_cpu_cap(c, bit))
@@ -285,26 +308,27 @@ extern const char * const x86_power_flags[32];
#endif /* CONFIG_X86_64 */
+#if __GNUC__ >= 4
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
* These are only valid after alternatives have run, but will statically
* patch the target code for additional performance.
*
*/
-static __always_inline __pure bool __static_cpu_has(u8 bit)
+static __always_inline __pure bool __static_cpu_has(u16 bit)
{
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
asm goto("1: jmp %l[t_no]\n"
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
_ASM_PTR "1b\n"
_ASM_PTR "0\n" /* no replacement */
- " .byte %P0\n" /* feature bit */
+ " .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
- " .byte 0xff + 0 - (2b-1b)\n" /* padding */
".previous\n"
+ /* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no);
return true;
t_no:
@@ -318,10 +342,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
_ASM_ALIGN "\n"
_ASM_PTR "1b\n"
_ASM_PTR "3f\n"
- " .byte %P1\n" /* feature bit */
+ " .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
- " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
+ ".previous\n"
+ ".section .discard,\"aw\",@progbits\n"
+ " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
"3: movb $1,%0\n"
@@ -332,12 +358,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
#endif
}
-#if __GNUC__ >= 4
#define static_cpu_has(bit) \
( \
__builtin_constant_p(boot_cpu_has(bit)) ? \
boot_cpu_has(bit) : \
- (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \
+ __builtin_constant_p(bit) ? \
__static_cpu_has(bit) : \
boot_cpu_has(bit) \
)
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index ac91eed21061..d4c419f883a0 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-#define dma_is_consistent(d, h) (1)
extern int dma_supported(struct device *hwdev, u64 mask);
extern int dma_set_mask(struct device *dev, u64 mask);
@@ -87,13 +86,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size,
flush_write_buffers();
}
-static inline int dma_get_cache_alignment(void)
-{
- /* no easy way to get cache size on all x86, so return the
- * maximum possible, to be safe */
- return boot_cpu_data.x86_clflush_size;
-}
-
static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
gfp_t gfp)
{
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a9..326099199318 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
CFI_ADJUST_CFA_OFFSET -8
.endm
+ .macro pushfq_cfi
+ pushfq
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popfq_cfi
+ popfq
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
.macro movq_cfi reg offset=0
movq %\reg, \offset(%rsp)
CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
CFI_ADJUST_CFA_OFFSET -4
.endm
+ .macro pushfl_cfi
+ pushfl
+ CFI_ADJUST_CFA_OFFSET 4
+ .endm
+
+ .macro popfl_cfi
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
+ .endm
+
.macro movl_cfi reg offset=0
movl %\reg, \offset(%esp)
CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index ec8a52d14ab1..5be1542fbfaf 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -112,23 +112,13 @@ static inline void early_memtest(unsigned long start, unsigned long end)
}
#endif
-extern unsigned long end_user_pfn;
-
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-#include <linux/early_res.h>
-
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
-extern int e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn);
-extern void e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
-extern u64 e820_hole_size(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+void memblock_x86_fill(void);
+void memblock_find_dma_reserve(void);
+
extern void finish_e820_parsing(void);
extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8406ed7f9926..8e4a16508d4e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
-extern void efi_reserve_early(void);
+extern void efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98f..57650ab4a5f5 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
-BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
- smp_invalidate_interrupt)
-BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
+.irpc idx, "01234567"
+BUILD_INTERRUPT3(invalidate_interrupt\idx,
+ (INVALIDATE_TLB_VECTOR_START)+\idx,
smp_invalidate_interrupt)
+.endr
#endif
BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
@@ -49,8 +38,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1dc..4d293dced62f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
return __virt_to_fix(vaddr);
}
+
+/* Return an pointer with offset calculated */
+static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ __set_fixmap(idx, phys, flags);
+ return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys) \
+ __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..43085bfc99c3 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
#define GARTEN (1<<0)
#define DISGARTCPU (1<<4)
#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
/* GART cache control register bits. */
#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
#define AMD64_GARTAPERTUREBASE 0x94
#define AMD64_GARTTABLEBASE 0x98
#define AMD64_GARTCACHECTL 0x9c
-#define AMD64_GARTEN (1<<0)
#ifdef CONFIG_GART_IOMMU
extern int gart_iommu_aperture;
@@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled;
extern void early_gart_iommu_check(void);
extern int gart_iommu_init(void);
extern void __init gart_parse_options(char *);
-extern void gart_iommu_hole_init(void);
+extern int gart_iommu_hole_init(void);
#else
#define gart_iommu_aperture 0
@@ -50,13 +50,27 @@ static inline void early_gart_iommu_check(void)
static inline void gart_parse_options(char *options)
{
}
-static inline void gart_iommu_hole_init(void)
+static inline int gart_iommu_hole_init(void)
{
+ return -ENODEV;
}
#endif
extern int agp_amd64_init(void);
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+ * Also, set DISTLBWALKPRB since GART tables memory is UC.
+ */
+ ctl = DISTLBWALKPRB | order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee617..55e4de613f0e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
- unsigned int apic_pending_irqs;
+ unsigned int apic_irq_work_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index a726650fc80f..3bd04022fd0c 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page);
void *kmap(struct page *page);
void kunmap(struct page *page);
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
-void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *__kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 004e6e25e913..2c392d663dce 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,17 +68,18 @@ extern unsigned long force_hpet_address;
extern u8 hpet_blockid;
extern int hpet_force_user;
extern u8 hpet_msi_disable;
-extern u8 hpet_readback_cmp;
extern int is_hpet_enabled(void);
extern int hpet_enable(void);
extern void hpet_disable(void);
extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
-extern void hpet_msi_unmask(unsigned int irq);
-extern void hpet_msi_mask(unsigned int irq);
-extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
-extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+struct irq_data;
+extern void hpet_msi_unmask(struct irq_data *data);
+extern void hpet_msi_mask(struct irq_data *data);
+struct hpet_dev;
+extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
#ifdef CONFIG_PCI_MSI
extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 942255310e6a..824ca07860d0 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -20,10 +20,10 @@ struct arch_hw_breakpoint {
#include <linux/list.h>
/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_X 0x40
#define X86_BREAKPOINT_LEN_1 0x40
#define X86_BREAKPOINT_LEN_2 0x44
#define X86_BREAKPOINT_LEN_4 0x4c
-#define X86_BREAKPOINT_LEN_EXECUTE 0x40
#ifdef CONFIG_X86_64
#define X86_BREAKPOINT_LEN_8 0x48
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 46c0fe05f230..0274ec5a7e62 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
extern void apic_timer_interrupt(void);
extern void x86_platform_ipi(void);
extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
@@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
irq_attr->polarity = polarity;
}
+struct irq_2_iommu {
+ struct intel_iommu *iommu;
+ u16 irte_index;
+ u16 sub_handle;
+ u8 irte_mask;
+};
+
/*
* This is performance-critical, we want to do it O(1)
*
@@ -89,15 +96,17 @@ struct irq_cfg {
cpumask_var_t old_domain;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_INTR_REMAP
+ struct irq_2_iommu irq_2_iommu;
+#endif
};
-extern struct irq_cfg *irq_cfg(unsigned int);
extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
extern void send_cleanup_vector(struct irq_cfg *);
-struct irq_desc;
-extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
- unsigned int *dest_id);
+struct irq_data;
+int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
+ unsigned int *dest_id);
extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 70abda7058c8..ff2546ce7178 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper;
/* Recognized hypervisors */
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..4aa2bb3b242a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void);
extern int init_fpu(struct task_struct *child);
extern asmlinkage void math_state_restore(void);
extern void __math_state_restore(void);
-extern void init_thread_xstate(void);
extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
extern user_regset_active_fn fpregs_active, xfpregs_active;
@@ -56,27 +55,44 @@ extern int save_i387_xstate_ia32(void __user *buf);
extern int restore_i387_xstate_ia32(void __user *buf);
#endif
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
#define X87_FSW_ES (1 << 7) /* Exception Summary */
+static __always_inline __pure bool use_xsaveopt(void)
+{
+ return static_cpu_has(X86_FEATURE_XSAVEOPT);
+}
+
static __always_inline __pure bool use_xsave(void)
{
return static_cpu_has(X86_FEATURE_XSAVE);
}
-#ifdef CONFIG_X86_64
+static __always_inline __pure bool use_fxsr(void)
+{
+ return static_cpu_has(X86_FEATURE_FXSR);
+}
+
+extern void __sanitize_i387_state(struct task_struct *);
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
+static inline void sanitize_i387_state(struct task_struct *tsk)
{
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
+ if (!use_xsaveopt())
+ return;
+ __sanitize_i387_state(tsk);
}
+#ifdef CONFIG_X86_64
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
int err;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -85,48 +101,24 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "m" (*fx), "0" (0));
return err;
}
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. The kernel data segment can be sometimes 0 and sometimes
- new user value. Both should be ok.
- Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
{
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
+ int err;
/*
- * xsave header may indicate the init state of the FP.
+ * Clear the bytes not touched by the fxsave and reserved
+ * for the SW usage.
*/
- if (use_xsave() &&
- !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- return;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
- alternative_input(ASM_NOP8 ASM_NOP2,
- " emms\n" /* clear stack tags */
- " fildl %%gs:0", /* load to clear state */
- X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
- fpu_clear(&tsk->thread.fpu);
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
- int err;
+ err = __clear_user(&fx->sw_reserved,
+ sizeof(struct _fpx_sw_bytes));
+ if (unlikely(err))
+ return -EFAULT;
+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -135,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "0" (0));
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
@@ -153,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
uses any extended registers for addressing, a second REX prefix
will be generated (to the assembler, rex64 followed by semicolon
is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
+
+#ifdef CONFIG_AS_FXSAVEQ
/* Using "fxsaveq %0" would be the ideal choice, but is only supported
starting with gas 2.16. */
__asm__ __volatile__("fxsaveq %0"
: "=m" (fpu->state->fxsave));
-#elif 0
+#else
/* Using, as a workaround, the properly prefixed form below isn't
accepted by any binutils version so far released, complaining that
the same type of prefix is used twice if an extended register is
- needed for addressing (fix submitted to mainline 2005-11-21). */
- __asm__ __volatile__("rex64/fxsave %0"
- : "=m" (fpu->state->fxsave));
-#else
- /* This, however, we can work around by forcing the compiler to select
+ needed for addressing (fix submitted to mainline 2005-11-21).
+ asm volatile("rex64/fxsave %0"
+ : "=m" (fpu->state->fxsave));
+ This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (fpu->state->fxsave)
- : "cdaSDb" (&fpu->state->fxsave));
+ asm volatile("rex64/fxsave (%[fx])"
+ : "=m" (fpu->state->fxsave)
+ : [fx] "R" (&fpu->state->fxsave));
#endif
}
-static inline void fpu_save_init(struct fpu *fpu)
-{
- if (use_xsave())
- fpu_xsave(fpu);
- else
- fpu_fxsave(fpu);
-
- fpu_clear(fpu);
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- fpu_save_init(&tsk->thread.fpu);
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
#else /* CONFIG_X86_32 */
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-static inline void tolerant_fwait(void)
-{
- asm volatile("fnclex ; fwait");
-}
-
/* perform fxrstor iff the processor has extended states, otherwise frstor */
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
@@ -219,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return 0;
}
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+ asm volatile("fxsave %[fx]"
+ : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif /* CONFIG_X86_64 */
+
/* We need a safe address that is cheap to find and that is already
in L1 during context switch. The best choices are unfortunately
different for UP and SMP */
@@ -234,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
static inline void fpu_save_init(struct fpu *fpu)
{
if (use_xsave()) {
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
fpu_xsave(fpu);
/*
* xsave header may indicate the init state of the FP.
*/
- if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- goto end;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
-
- /*
- * we can do a simple return here or be paranoid :)
- */
- goto clear_state;
+ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+ return;
+ } else if (use_fxsr()) {
+ fpu_fxsave(fpu);
+ } else {
+ asm volatile("fsave %[fx]; fwait"
+ : [fx] "=m" (fpu->state->fsave));
+ return;
}
- /* Use more nops than strictly needed in case the compiler
- varies code */
- alternative_input(
- "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
- "fxsave %[fx]\n"
- "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
- X86_FEATURE_FXSR,
- [fx] "m" (fpu->state->fxsave),
- [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
+ if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+ asm volatile("fnclex");
+
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. safe_address is a random variable that should be in L1 */
alternative_input(
- GENERIC_NOP8 GENERIC_NOP2,
+ ASM_NOP8 ASM_NOP2,
"emms\n\t" /* clear stack tags */
- "fildl %[addr]", /* set F?P to defined value */
+ "fildl %P[addr]", /* set F?P to defined value */
X86_FEATURE_FXSAVE_LEAK,
[addr] "m" (safe_address));
-end:
- ;
}
static inline void __save_init_fpu(struct task_struct *tsk)
@@ -283,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
}
-
-#endif /* CONFIG_X86_64 */
-
static inline int fpu_fxrstor_checking(struct fpu *fpu)
{
return fxrstor_checking(&fpu->state->fxsave);
@@ -322,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
static inline void __clear_fpu(struct task_struct *tsk)
{
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- tolerant_fwait();
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
}
@@ -383,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
stts();
}
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- __save_init_fpu(tsk);
- stts();
-}
-
-#define unlazy_fpu __unlazy_fpu
-#define clear_fpu __clear_fpu
-
-#else /* CONFIG_X86_32 */
-
/*
* These disable preemption on their own and are safe
*/
@@ -421,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
preempt_enable();
}
-#endif /* CONFIG_X86_64 */
-
/*
* i387 state interaction
*/
@@ -482,9 +422,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
memcpy(dst->state, src->state, xstate_size);
}
-#endif /* __ASSEMBLY__ */
+extern void fpu_finit(struct fpu *fpu);
-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1655147646aa..a20365953bf8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
struct legacy_pic {
int nr_legacy_irqs;
struct irq_chip *chip;
+ void (*mask)(unsigned int irq);
+ void (*unmask)(unsigned int irq);
void (*mask_all)(void);
void (*restore_mask)(void);
void (*init)(int auto_eoi);
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4470c9ad4a3e..29f66793cc55 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -1,6 +1,12 @@
#ifndef _ASM_X86_INTEL_SCU_IPC_H_
#define _ASM_X86_INTEL_SCU_IPC_H_
+#define IPCMSG_VRTC 0xFA /* Set vRTC device */
+
+/* Command id associated with message IPCMSG_VRTC */
+#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
+#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
+
/* Read single register */
int intel_scu_ipc_ioread8(u16 addr, u8 *data);
@@ -28,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
/* Update single register based on the mask */
int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
-/*
- * Indirect register read
- * Can be used when SCCB(System Controller Configuration Block) register
- * HRIM(Honor Restricted IPC Messages) is set (bit 23)
- */
-int intel_scu_ipc_register_read(u32 addr, u32 *data);
-
-/*
- * Indirect register write
- * Can be used when SCCB(System Controller Configuration Block) register
- * HRIM(Honor Restricted IPC Messages) is set (bit 23)
- */
-int intel_scu_ipc_register_write(u32 addr, u32 data);
-
/* Issue commands to the SCU with or without data */
int intel_scu_ipc_simple_command(int cmd, int sub);
int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e9776123..072273082528 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -41,6 +41,8 @@
#include <asm-generic/int-ll64.h>
#include <asm/page.h>
+#include <xen/xen.h>
+
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -206,6 +208,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
extern void iounmap(volatile void __iomem *addr);
+extern void set_iounmap_nonlazy(void);
#ifdef __KERNEL__
@@ -348,6 +351,18 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size);
extern void fixup_early_ioremap(void);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#ifdef CONFIG_XEN
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
+ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif /* CONFIG_XEN */
#define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9cb2edb87c2f..a6b28d017c2f 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -169,13 +169,8 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void probe_nr_irqs_gsi(void);
+extern int get_nr_irqs_gsi(void);
-extern int setup_ioapic_entry(int apic, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin);
-extern void ioapic_write_entry(int apic, int pin,
- struct IO_APIC_route_entry e);
extern void setup_ioapic_ids_from_mpc(void);
struct mp_ioapic_gsi{
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index f35eb45d6576..363e33eb6ec1 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -26,11 +26,11 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
void
-iounmap_atomic(void *kvaddr, enum km_type type);
+iounmap_atomic(void __iomem *kvaddr);
int
iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
new file mode 100644
index 000000000000..f229b13a5f30
--- /dev/null
+++ b/arch/x86/include/asm/iommu_table.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_IOMMU_TABLE_H
+#define _ASM_X86_IOMMU_TABLE_H
+
+#include <asm/swiotlb.h>
+
+/*
+ * History lesson:
+ * The execution chain of IOMMUs in 2.6.36 looks as so:
+ *
+ * [xen-swiotlb]
+ * |
+ * +----[swiotlb *]--+
+ * / | \
+ * / | \
+ * [GART] [Calgary] [Intel VT-d]
+ * /
+ * /
+ * [AMD-Vi]
+ *
+ * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
+ * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
+ * Also it would surreptitiously initialize set the swiotlb=1 if there were
+ * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
+ * flag would be turned off by all IOMMUs except the Calgary one.
+ *
+ * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
+ * to be built by defining who we depend on.
+ *
+ * And all that needs to be done is to use one of the macros in the IOMMU
+ * and the pci-dma.c will take care of the rest.
+ */
+
+struct iommu_table_entry {
+ initcall_t detect;
+ initcall_t depend;
+ void (*early_init)(void); /* No memory allocate available. */
+ void (*late_init)(void); /* Yes, can allocate memory. */
+#define IOMMU_FINISH_IF_DETECTED (1<<0)
+#define IOMMU_DETECTED (1<<1)
+ int flags;
+};
+/*
+ * Macro fills out an entry in the .iommu_table that is equivalent
+ * to the fields that 'struct iommu_table_entry' has. The entries
+ * that are put in the .iommu_table section are not put in any order
+ * hence during boot-time we will have to resort them based on
+ * dependency. */
+
+
+#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
+ static const struct iommu_table_entry const \
+ __iommu_entry_##_detect __used \
+ __attribute__ ((unused, __section__(".iommu_table"), \
+ aligned((sizeof(void *))))) \
+ = {_detect, _depend, _early_init, _late_init, \
+ _finish ? IOMMU_FINISH_IF_DETECTED : 0}
+/*
+ * The simplest IOMMU definition. Provide the detection routine
+ * and it will be run after the SWIOTLB and the other IOMMUs
+ * that utilize this macro. If the IOMMU is detected (ie, the
+ * detect routine returns a positive value), the other IOMMUs
+ * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
+ * to stop detecting the other IOMMUs after yours has been detected.
+ */
+#define IOMMU_INIT_POST(_detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0)
+
+#define IOMMU_INIT_POST_FINISH(detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1)
+
+/*
+ * A more sophisticated version of IOMMU_INIT. This variant requires:
+ * a). A detection routine function.
+ * b). The name of the detection routine we depend on to get called
+ * before us.
+ * c). The init routine which gets called if the detection routine
+ * returns a positive value from the pci_iommu_alloc. This means
+ * no presence of a memory allocator.
+ * d). Similar to the 'init', except that this gets called from pci_iommu_init
+ * where we do have a memory allocator.
+ *
+ * The standard vs the _FINISH differs in that the _FINISH variant will
+ * continue detecting other IOMMUs in the call list after the
+ * the detection routine returns a positive number. The _FINISH will
+ * stop the execution chain. Both will still call the 'init' and
+ * 'late_init' functions if they are set.
+ */
+#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
+
+#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
+
+void sort_iommu_table(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+void check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+#endif /* _ASM_X86_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 5458380b6ef8..13b0ebaa512f 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -19,18 +19,14 @@ static inline int irq_canonicalize(int irq)
# define ARCH_HAS_NMI_WATCHDOG
#endif
-#ifdef CONFIG_4KSTACKS
- extern void irq_ctx_init(int cpu);
- extern void irq_ctx_exit(int cpu);
-# define __ARCH_HAS_DO_SOFTIRQ
+#ifdef CONFIG_X86_32
+extern void irq_ctx_init(int cpu);
#else
# define irq_ctx_init(cpu) do { } while (0)
-# define irq_ctx_exit(cpu) do { } while (0)
-# ifdef CONFIG_X86_64
-# define __ARCH_HAS_DO_SOFTIRQ
-# endif
#endif
+#define __ARCH_HAS_DO_SOFTIRQ
+
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
extern void fixup_irqs(void);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e2244505..1c23360fb2d8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
+#ifdef CONFIG_INTR_REMAP
+static inline void prepare_irte(struct irte *irte, int vector,
+ unsigned int dest)
+{
+ memset(irte, 0, sizeof(*irte));
+
+ irte->present = 1;
+ irte->dst_mode = apic->irq_dest_mode;
+ /*
+ * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+ * actual level or edge trigger will be setup in the IO-APIC
+ * RTE. This will help simplify level triggered irq migration.
+ * For more details, see the comments (in io_apic.c) explainig IO-APIC
+ * irq migration in the presence of interrupt-remapping.
+ */
+ irte->trigger_mode = 0;
+ irte->dlvry_mode = apic->irq_delivery_mode;
+ irte->vector = vector;
+ irte->dest_id = IRTE_DEST(dest);
+ irte->redir_hint = 1;
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+{
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+#endif
+
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8767d99c4f64..6af0894dafb4 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
#define X86_PLATFORM_IPI_VECTOR 0xed
/*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
*/
-#define LOCAL_PENDING_VECTOR 0xec
+#define IRQ_WORK_VECTOR 0xec
#define UV_BAU_MESSAGE 0xea
@@ -125,6 +125,9 @@
*/
#define MCE_SELF_VECTOR 0xeb
+/* Xen vector callback to receive events in a HVM domain */
+#define XEN_HVM_EVTCHN_CALLBACK 0xe9
+
#define NR_VECTORS 256
#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9e2b952f810a..5745ce8bf108 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -61,22 +61,22 @@ static inline void native_halt(void)
#else
#ifndef __ASSEMBLY__
-static inline unsigned long __raw_local_save_flags(void)
+static inline unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline void raw_local_irq_restore(unsigned long flags)
+static inline void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline void raw_local_irq_disable(void)
+static inline void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline void raw_local_irq_enable(void)
+static inline void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void raw_safe_halt(void)
+static inline void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -102,12 +102,10 @@ static inline void halt(void)
/*
* For spinlocks, etc:
*/
-static inline unsigned long __raw_local_irq_save(void)
+static inline unsigned long arch_local_irq_save(void)
{
- unsigned long flags = __raw_local_save_flags();
-
- raw_local_irq_disable();
-
+ unsigned long flags = arch_local_save_flags();
+ arch_local_irq_disable();
return flags;
}
#else
@@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void)
#endif /* CONFIG_PARAVIRT */
#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_save(flags) \
- do { (flags) = __raw_local_irq_save(); } while (0)
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
+static inline int arch_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
}
-static inline int raw_irqs_disabled(void)
+static inline int arch_irqs_disabled(void)
{
- unsigned long flags = __raw_local_save_flags();
+ unsigned long flags = arch_local_save_flags();
- return raw_irqs_disabled_flags(flags);
+ return arch_irqs_disabled_flags(flags);
}
#else
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000000..f52d42e80585
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/nops.h>
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+# define JUMP_LABEL(key, label) \
+ do { \
+ asm goto("1:" \
+ JUMP_LABEL_INITIAL_NOP \
+ ".pushsection __jump_table, \"a\" \n\t"\
+ _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
+ ".popsection \n\t" \
+ : : "i" (key) : : label); \
+ } while (0)
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+ jump_label_t code;
+ jump_label_t target;
+ jump_label_t key;
+};
+
+#endif
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fa7c0b974761..5bdfca86581b 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,5 +33,11 @@ extern void __show_regs(struct pt_regs *regs, int all);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
+#ifdef CONFIG_KEXEC
+extern int in_crash_kexec;
+#else
+/* no crash dump is ever in progress if no crash kernel can be kexec'd */
+#define in_crash_kexec 0
+#endif
#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 006da3687cdc..396f5b5fc4d7 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -39,9 +39,11 @@ enum regnames {
GDB_FS, /* 14 */
GDB_GS, /* 15 */
};
+#define GDB_ORIG_AX 41
+#define DBG_MAX_REG_NUM 16
#define NUMREGBYTES ((GDB_GS+1)*4)
#else /* ! CONFIG_X86_32 */
-enum regnames64 {
+enum regnames {
GDB_AX, /* 0 */
GDB_BX, /* 1 */
GDB_CX, /* 2 */
@@ -59,15 +61,15 @@ enum regnames64 {
GDB_R14, /* 14 */
GDB_R15, /* 15 */
GDB_PC, /* 16 */
+ GDB_PS, /* 17 */
+ GDB_CS, /* 18 */
+ GDB_SS, /* 19 */
};
-
-enum regnames32 {
- GDB_PS = 34,
- GDB_CS,
- GDB_SS,
-};
-#define NUMREGBYTES ((GDB_SS+1)*4)
-#endif /* CONFIG_X86_32 */
+#define GDB_ORIG_AX 57
+#define DBG_MAX_REG_NUM 20
+/* 17 64 bit regs and 3 32 bit regs */
+#define NUMREGBYTES ((17 * 8) + (3 * 4))
+#endif /* ! CONFIG_X86_32 */
static inline void arch_kgdb_breakpoint(void)
{
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
__u64 reserved[9];
};
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+ __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..b36c6b3fe144 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
#define X86EMUL_UNHANDLEABLE 1
/* Terminate emulation but return success to the caller. */
#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+
struct x86_emulate_ops {
/*
* read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
/*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
/*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
+ unsigned int *error,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,35 @@ struct x86_emulate_ops {
int seg, struct kvm_vcpu *vcpu);
u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+ unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+ void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
- void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+ int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
int (*cpl)(struct kvm_vcpu *vcpu);
- void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+ int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
};
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
unsigned int bytes;
- unsigned long val, orig_val, *ptr;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ unsigned long mem;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(unsigned long) + 2];
+ };
};
struct fetch_cache {
@@ -172,6 +194,7 @@ struct decode_cache {
bool has_seg_override;
u8 seg_override;
unsigned int d;
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
unsigned long regs[NR_VCPU_REGS];
unsigned long eip;
/* modrm */
@@ -179,16 +202,16 @@ struct decode_cache {
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
- u8 use_modrm_ea;
+ u8 modrm_seg;
bool rip_relative;
- unsigned long modrm_ea;
- void *modrm_ptr;
- unsigned long modrm_val;
struct fetch_cache fetch;
struct read_cache io_read;
+ struct read_cache mem_read;
};
struct x86_emulate_ctxt {
+ struct x86_emulate_ops *ops;
+
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
@@ -201,7 +224,12 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
- bool restart; /* restart string instruction after writeback */
+ bool perm_ok; /* do not check permissions if true */
+
+ int exception; /* exception that happens during emulation or -1 */
+ u32 error_code; /* error code for exception */
+ bool error_code_valid;
+
/* decode cache */
struct decode_cache decode;
};
@@ -224,13 +252,14 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
-int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..9e6fe391094e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
0xFFFFFF0000000000ULL)
#define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
#define UNMAPPED_GVA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */
#define KVM_NR_PAGE_SIZES 3
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
#define IOPL_SHIFT 12
-#define KVM_ALIAS_SLOTS 4
-
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_MMU_HASH_SHIFT 10
@@ -234,26 +236,35 @@ struct kvm_pio_request {
*/
struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+ unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
u32 *error);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
+ struct kvm_mmu_page *sp, bool clear_unsync);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
union kvm_mmu_page_role base_role;
+ bool direct_map;
u64 *pae_root;
+ u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+
+ bool nx;
+
+ u64 pdptrs[4]; /* pae */
};
struct kvm_vcpu_arch {
- u64 host_tsc;
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
@@ -270,7 +281,6 @@ struct kvm_vcpu_arch {
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 hflags;
- u64 pdptrs[4]; /* pae */
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
@@ -280,7 +290,41 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+ /*
+ * Paging state of the vcpu
+ *
+ * If the vcpu runs in guest mode with two level paging this still saves
+ * the paging mode of the l1 guest. This context is always used to
+ * handle faults.
+ */
struct kvm_mmu mmu;
+
+ /*
+ * Paging state of an L2 guest (used for nested npt)
+ *
+ * This context will save all necessary information to walk page tables
+ * of the an L2 guest. This context is only initialized for page table
+ * walking and not for faulting since we never handle l2 page faults on
+ * the host.
+ */
+ struct kvm_mmu nested_mmu;
+
+ /*
+ * Pointer to the mmu context currently used for
+ * gva_to_gpa translations.
+ */
+ struct kvm_mmu *walk_mmu;
+
+ /*
+ * This struct is filled with the necessary information to propagate a
+ * page fault into the guest
+ */
+ struct {
+ u64 address;
+ unsigned error_code;
+ bool nested;
+ } fault;
+
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -301,8 +345,8 @@ struct kvm_vcpu_arch {
unsigned long mmu_seq;
} update_pte;
- struct i387_fxsave_struct host_fx_image;
- struct i387_fxsave_struct guest_fx_image;
+ struct fpu guest_fpu;
+ u64 xcr0;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
@@ -334,9 +378,15 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
- unsigned int hv_clock_tsc_khz;
+ unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+ u64 last_host_tsc;
+ u64 last_guest_tsc;
+ u64 last_kernel_ns;
+ u64 last_tsc_nsec;
+ u64 last_tsc_write;
+ bool tsc_catchup;
bool nmi_pending;
bool nmi_injected;
@@ -360,29 +410,14 @@ struct kvm_vcpu_arch {
/* fields used by HYPER-V emulation */
u64 hv_vapic;
-};
-
-struct kvm_mem_alias {
- gfn_t base_gfn;
- unsigned long npages;
- gfn_t target_gfn;
-#define KVM_ALIAS_INVALID 1UL
- unsigned long flags;
-};
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
- struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
- int naliases;
+ cpumask_var_t wbinvd_dirty_mask;
};
struct kvm_arch {
- struct kvm_mem_aliases *aliases;
-
- unsigned int n_free_mmu_pages;
+ unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
- unsigned int n_alloc_mmu_pages;
+ unsigned int n_max_mmu_pages;
atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -407,8 +442,14 @@ struct kvm_arch {
gpa_t ept_identity_map_addr;
unsigned long irq_sources_bitmap;
- u64 vm_init_tsc;
s64 kvmclock_offset;
+ spinlock_t tsc_write_lock;
+ u64 last_tsc_nsec;
+ u64 last_tsc_offset;
+ u64 last_tsc_write;
+ u32 virtual_tsc_khz;
+ u32 virtual_tsc_mult;
+ s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -518,6 +559,7 @@ struct kvm_x86_ops {
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject);
+ void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -530,9 +572,16 @@ struct kvm_x86_ops {
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
+ void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
+
+ void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+ bool (*has_wbinvd_exit)(void);
+
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -555,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -576,7 +625,6 @@ enum emulation_result {
#define EMULTYPE_SKIP (1 << 2)
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -591,10 +639,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long value);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +647,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -622,20 +668,18 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
- u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t gfn, void *data, int offset, int len,
+ u32 access);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +708,6 @@ void kvm_disable_tdp(void);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -673,20 +715,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
return (struct kvm_mmu_page *)page_private(page);
}
-static inline u16 kvm_read_fs(void)
-{
- u16 seg;
- asm("mov %%fs, %0" : "=g"(seg));
- return seg;
-}
-
-static inline u16 kvm_read_gs(void)
-{
- u16 seg;
- asm("mov %%gs, %0" : "=g"(seg));
- return seg;
-}
-
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -694,16 +722,6 @@ static inline u16 kvm_read_ldt(void)
return ldt;
}
-static inline void kvm_load_fs(u16 sel)
-{
- asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
- asm("mov %0, %%gs" : : "rm"(sel));
-}
-
static inline void kvm_load_ldt(u16 sel)
{
asm("lldt %0" : : "rm"(sel));
@@ -719,21 +737,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
- asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
- asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
- asm("finit");
-}
-
static inline u32 get_rdx_init_val(void)
{
return 0x600; /* P6 family */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e8..7b562b6184bc 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
return cpuid_eax(KVM_CPUID_FEATURES);
}
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
#endif
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
new file mode 100644
index 000000000000..36c93b5cc239
--- /dev/null
+++ b/arch/x86/include/asm/local64.h
@@ -0,0 +1 @@
+#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f32a4301c4d4..c62c13cb9788 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -38,6 +38,10 @@
#define MCM_ADDR_MEM 3 /* memory address */
#define MCM_ADDR_GENERIC 7 /* generic */
+/* CTL2 register defines */
+#define MCI_CTL2_CMCI_EN (1ULL << 30)
+#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
+
#define MCJ_CTX_MASK 3
#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM 0 /* inject context: random */
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
new file mode 100644
index 000000000000..19ae14ba6978
--- /dev/null
+++ b/arch/x86/include/asm/memblock.h
@@ -0,0 +1,23 @@
+#ifndef _X86_MEMBLOCK_H
+#define _X86_MEMBLOCK_H
+
+#define ARCH_DISCARD_MEMBLOCK
+
+u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
+void memblock_x86_to_bootmem(u64 start, u64 end);
+
+void memblock_x86_reserve_range(u64 start, u64 end, char *name);
+void memblock_x86_free_range(u64 start, u64 end);
+struct range;
+int __get_free_all_memory_range(struct range **range, int nodeid,
+ unsigned long start_pfn, unsigned long end_pfn);
+int get_free_all_memory_range(struct range **rangep, int nodeid);
+
+void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn);
+u64 memblock_x86_hole_size(u64 start, u64 end);
+u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
+u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
+u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
+
+#endif
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 3e2ce58a31a3..67763c5d8b4e 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -60,12 +60,7 @@
#endif
#ifdef CONFIG_X86_32
-# ifdef CONFIG_4KSTACKS
-# define MODULE_STACKSIZE "4KSTACKS "
-# else
-# define MODULE_STACKSIZE ""
-# endif
-# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
#endif
#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 451d30e7f62d..4a711a684b17 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,10 +10,44 @@
*/
#ifndef _ASM_X86_MRST_H
#define _ASM_X86_MRST_H
+
+#include <linux/sfi.h>
+
extern int pci_mrst_init(void);
int __init sfi_parse_mrtc(struct sfi_table_header *table);
+/*
+ * Medfield is the follow-up of Moorestown, it combines two chip solution into
+ * one. Other than that it also added always-on and constant tsc and lapic
+ * timers. Medfield is the platform name, and the chip name is called Penwell
+ * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
+ * identified via MSRs.
+ */
+enum mrst_cpu_type {
+ MRST_CPU_CHIP_LINCROFT = 1,
+ MRST_CPU_CHIP_PENWELL,
+};
+
+extern enum mrst_cpu_type __mrst_cpu_chip;
+static inline enum mrst_cpu_type mrst_identify_cpu(void)
+{
+ return __mrst_cpu_chip;
+}
+
+enum mrst_timer_options {
+ MRST_TIMER_DEFAULT,
+ MRST_TIMER_APBT_ONLY,
+ MRST_TIMER_LAPIC_APBT,
+};
+
+extern enum mrst_timer_options mrst_timer_options;
+
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(void);
#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..3ea3dc487047 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
#define _EFER_LMA 10 /* Long mode active (read-only) */
#define _EFER_NX 11 /* No execute enable */
#define _EFER_SVME 12 /* Enable virtualization */
+#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
#define EFER_SCE (1<<_EFER_SCE)
@@ -27,6 +28,7 @@
#define EFER_LMA (1<<_EFER_LMA)
#define EFER_NX (1<<_EFER_NX)
#define EFER_SVME (1<<_EFER_SVME)
+#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
@@ -94,9 +96,6 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
-#define CMCI_EN (1ULL << 30)
-#define CMCI_THRESHOLD_MASK 0xffffULL
-
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
@@ -122,6 +121,7 @@
#define MSR_AMD64_IBSDCLINAD 0xc0011038
#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBSBRTARGET 0xc001103b
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
@@ -159,8 +159,6 @@
#define MSR_K7_FID_VID_STATUS 0xc0010042
/* K6 MSRs */
-#define MSR_K6_EFER 0xc0000080
-#define MSR_K6_STAR 0xc0000081
#define MSR_K6_WHCR 0xc0000082
#define MSR_K6_UWCCR 0xc0000085
#define MSR_K6_EPMR 0xc0000086
@@ -201,6 +199,7 @@
#define MSR_IA32_TSC 0x00000010
#define MSR_IA32_PLATFORM_ID 0x00000017
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_EBC_FREQUENCY_ID 0x0000002c
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
#define FEATURE_CONTROL_LOCKED (1<<0)
@@ -224,12 +223,14 @@
#define MSR_IA32_THERM_CONTROL 0x0000019a
#define MSR_IA32_THERM_INTERRUPT 0x0000019b
-#define THERM_INT_LOW_ENABLE (1 << 0)
-#define THERM_INT_HIGH_ENABLE (1 << 1)
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)
#define MSR_IA32_THERM_STATUS 0x0000019c
#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)
#define MSR_THERM2_CTL 0x0000019d
@@ -239,6 +240,19 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+
+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+
/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c5bc4c2d33f5..084ef95274cd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter)
#define rdmsr(msr, val1, val2) \
do { \
u64 __val = native_read_msr((msr)); \
- (val1) = (u32)__val; \
- (val2) = (u32)(__val >> 32); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
} while (0)
static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..bcdff997668c
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_MAX_NUM_CSTATES 8
+
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 93da9c3f3341..932f0f86b4b7 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
extern int check_nmi_watchdog(void);
+#if !defined(CONFIG_LOCKUP_DETECTOR)
extern int nmi_watchdog_enabled;
+#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 101229b0d8ed..42a978c0c1b3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits);
/* EC commands */
#define EC_FIRMWARE_REV 0x08
+#define EC_WLAN_ENTER_RESET 0x35
+#define EC_WLAN_LEAVE_RESET 0x25
/* SCI source values */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 000000000000..2a8478140bb3
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_OLPC_OFW_H
+#define _ASM_X86_OLPC_OFW_H
+
+/* index into the page table containing the entry OFW occupies */
+#define OLPC_OFW_PDE_NR 1022
+
+#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
+
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+
+/* run an OFW command by calling into the firmware */
+#define olpc_ofw(name, args, res) \
+ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
+
+extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res);
+
+/* determine whether OFW is available and lives in the proper memory */
+extern void olpc_ofw_detect(void);
+
+/* install OFW's pde permanently into the kernel's pgtable */
+extern void setup_olpc_ofw_pgd(void);
+
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
+#else /* !CONFIG_OLPC_OPENFIRMWARE */
+
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+static inline bool olpc_ofw_present(void) { return false; }
+
+#endif /* !CONFIG_OLPC_OPENFIRMWARE */
+
+#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 625c3f0e741a..8ca82839288a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -37,6 +37,13 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
This seems to be the official gcc blessed way to do such arithmetic. */
+/*
+ * We need __phys_reloc_hide() here because gcc may assume that there is no
+ * overflow during __pa() calculation and can optimize it unexpectedly.
+ * Newer versions of gcc provide -fno-strict-overflow switch to handle this
+ * case properly. Once all supported versions of gcc understand it, we can
+ * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
+ */
#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 6f1b7331313f..ade619ff9e2a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,11 +15,7 @@
*/
#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
-#ifdef CONFIG_4KSTACKS
-#define THREAD_ORDER 0
-#else
#define THREAD_ORDER 1
-#endif
#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
#define STACKFAULT_STACK 0
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..1df66211fd1b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e5..18e3b8a8709f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -105,7 +105,7 @@ static inline void write_cr8(unsigned long x)
}
#endif
-static inline void raw_safe_halt(void)
+static inline void arch_safe_halt(void)
{
PVOP_VCALL0(pv_irq_ops.safe_halt);
}
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
}
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
- unsigned long start, unsigned long count)
-{
- PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
static inline void paravirt_release_pmd(unsigned long pfn)
{
PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
@@ -829,32 +824,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
-static inline unsigned long __raw_local_save_flags(void)
+static inline unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
-static inline void raw_local_irq_restore(unsigned long f)
+static inline void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
-static inline void raw_local_irq_disable(void)
+static inline void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
-static inline void raw_local_irq_enable(void)
+static inline void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
-static inline unsigned long __raw_local_irq_save(void)
+static inline unsigned long arch_local_irq_save(void)
{
unsigned long f;
- f = __raw_local_save_flags();
- raw_local_irq_disable();
+ f = arch_local_save_flags();
+ arch_local_irq_disable();
return f;
}
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef5532341..b82bac975250 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
*/
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
- void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 404a880ea325..ca0437c714b2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
#include <linux/string.h>
#include <asm/scatterlist.h>
#include <asm/io.h>
+#include <asm/x86_init.h>
#ifdef __KERNEL__
@@ -27,6 +28,9 @@ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
int node);
extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+#ifdef CONFIG_PCI
+
+#ifdef CONFIG_PCI_DOMAINS
static inline int pci_domain_nr(struct pci_bus *bus)
{
struct pci_sysdata *sd = bus->sysdata;
@@ -37,13 +41,12 @@ static inline int pci_proc_domain(struct pci_bus *bus)
{
return pci_domain_nr(bus);
}
-
+#endif
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
-#ifdef CONFIG_PCI
extern unsigned int pcibios_assign_all_busses(void);
extern int pci_legacy_init(void);
# ifdef CONFIG_ACPI
@@ -92,8 +95,36 @@ static inline void early_quirks(void) { }
extern void pci_iommu_alloc(void);
-/* MSI arch hook */
-#define arch_setup_msi_irqs arch_setup_msi_irqs
+#ifdef CONFIG_PCI_MSI
+/* MSI arch specific hooks */
+static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+static inline void x86_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+#define arch_setup_msi_irqs x86_setup_msi_irqs
+#define arch_teardown_msi_irqs x86_teardown_msi_irqs
+#define arch_teardown_msi_irq x86_teardown_msi_irq
+/* implemented in arch/x86/kernel/apic/io_apic. */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+/* default to the implementation in drivers/lib/msi.c */
+#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+void default_teardown_msi_irqs(struct pci_dev *dev);
+#else
+#define native_setup_msi_irqs NULL
+#define native_teardown_msi_irq NULL
+#define default_teardown_msi_irqs NULL
+#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index cd2a31dc5fb8..704526734bef 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -30,6 +30,7 @@
#define PCI_HAS_IO_ECS 0x40000
#define PCI_NOASSIGN_ROMS 0x80000
#define PCI_ROOT_NO_CRS 0x100000
+#define PCI_NOASSIGN_BARS 0x200000
extern unsigned int pci_probe;
extern unsigned long pirq_table_addr;
@@ -46,6 +47,7 @@ enum pci_bf_sort_state {
extern unsigned int pcibios_max_latency;
void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
/* pci-pc.c */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910d..f899e01a8ac9 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -47,6 +47,20 @@
#ifdef CONFIG_SMP
#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
#define __my_cpu_offset percpu_read(this_cpu_off)
+
+/*
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
+ */
+#define __this_cpu_ptr(ptr) \
+({ \
+ unsigned long tcp_ptr__; \
+ __verify_pcpu_ptr(ptr); \
+ asm volatile("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
+ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+})
#else
#define __percpu_arg(x) "%P" #x
#endif
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 254883d0c7e0..550e26b1dbb3 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -68,8 +68,9 @@ union cpuid10_eax {
union cpuid10_edx {
struct {
- unsigned int num_counters_fixed:4;
- unsigned int reserved:28;
+ unsigned int num_counters_fixed:5;
+ unsigned int bit_width_fixed:8;
+ unsigned int reserved:19;
} split;
unsigned int full;
};
@@ -110,17 +111,18 @@ union cpuid10_edx {
#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
/* IbsFetchCtl bits/masks */
-#define IBS_FETCH_RAND_EN (1ULL<<57)
-#define IBS_FETCH_VAL (1ULL<<49)
-#define IBS_FETCH_ENABLE (1ULL<<48)
-#define IBS_FETCH_CNT 0xFFFF0000ULL
-#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT 0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
/* IbsOpCtl bits */
-#define IBS_OP_CNT_CTL (1ULL<<19)
-#define IBS_OP_VAL (1ULL<<18)
-#define IBS_OP_ENABLE (1ULL<<17)
-#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#ifdef CONFIG_PERF_EVENTS
extern void init_hw_perf_events(void);
@@ -140,6 +142,19 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
extern unsigned long perf_misc_flags(struct pt_regs *regs);
#define perf_misc_flags(regs) perf_misc_flags(regs)
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ip = (__ip); \
+ (regs)->bp = caller_frame_pointer(); \
+ (regs)->cs = __KERNEL_CS; \
+ regs->flags = 0; \
+}
+
#else
static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 64a8ebff06fc..a70cd216be5d 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -19,7 +19,6 @@
#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
#define ARCH_P4_MAX_CCCR (18)
-#define ARCH_P4_MAX_COUNTER (ARCH_P4_MAX_CCCR / 2)
#define P4_ESCR_EVENT_MASK 0x7e000000U
#define P4_ESCR_EVENT_SHIFT 25
@@ -37,19 +36,6 @@
#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
-/* Non HT mask */
-#define P4_ESCR_MASK \
- (P4_ESCR_EVENT_MASK | \
- P4_ESCR_EVENTMASK_MASK | \
- P4_ESCR_TAG_MASK | \
- P4_ESCR_TAG_ENABLE | \
- P4_ESCR_T0_OS | \
- P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT \
- (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
#define P4_CCCR_OVF 0x80000000U
#define P4_CCCR_CASCADE 0x40000000U
#define P4_CCCR_OVF_PMI_T0 0x04000000U
@@ -71,27 +57,6 @@
#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
-/* Custom bits in reerved CCCR area */
-#define P4_CCCR_CACHE_OPS_MASK 0x0000003fU
-
-
-/* Non HT mask */
-#define P4_CCCR_MASK \
- (P4_CCCR_OVF | \
- P4_CCCR_CASCADE | \
- P4_CCCR_OVF_PMI_T0 | \
- P4_CCCR_FORCE_OVF | \
- P4_CCCR_EDGE | \
- P4_CCCR_THRESHOLD_MASK | \
- P4_CCCR_COMPLEMENT | \
- P4_CCCR_COMPARE | \
- P4_CCCR_ESCR_SELECT_MASK | \
- P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT \
- (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
#define P4_GEN_ESCR_EMASK(class, name, bit) \
class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
#define P4_ESCR_EMASK_BIT(class, name) class##__##name
@@ -106,8 +71,7 @@
* ESCR and CCCR but rather an only packed value should
* be unpacked and written to a proper addresses
*
- * the base idea is to pack as much info as
- * possible
+ * the base idea is to pack as much info as possible
*/
#define p4_config_pack_escr(v) (((u64)(v)) << 32)
#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
@@ -130,11 +94,31 @@
t; \
})
-#define p4_config_unpack_cache_event(v) (((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
-
#define P4_CONFIG_HT_SHIFT 63
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR \
+ P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR \
+ P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
+ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
@@ -214,6 +198,12 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
return escr;
}
+/*
+ * This are the events which should be used in "Event Select"
+ * field of ESCR register, they are like unique keys which allow
+ * the kernel to determinate which CCCR and COUNTER should be
+ * used to track an event
+ */
enum P4_EVENTS {
P4_EVENT_TC_DELIVER_MODE,
P4_EVENT_BPU_FETCH_REQUEST,
@@ -561,7 +551,7 @@ enum P4_EVENT_OPCODES {
* a caller should use P4_ESCR_EMASK_NAME helper to
* pick the EventMask needed, for example
*
- * P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
+ * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
*/
enum P4_ESCR_EMASKS {
P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
@@ -753,43 +743,50 @@ enum P4_ESCR_EMASKS {
P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
};
-/* P4 PEBS: stale for a while */
-#define P4_PEBS_METRIC_MASK 0x00001fffU
-#define P4_PEBS_UOB_TAG 0x01000000U
-#define P4_PEBS_ENABLE 0x02000000U
-
-/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
-#define P4_PEBS__1stl_cache_load_miss_retired 0x3000001
-#define P4_PEBS__2ndl_cache_load_miss_retired 0x3000002
-#define P4_PEBS__dtlb_load_miss_retired 0x3000004
-#define P4_PEBS__dtlb_store_miss_retired 0x3000004
-#define P4_PEBS__dtlb_all_miss_retired 0x3000004
-#define P4_PEBS__tagged_mispred_branch 0x3018000
-#define P4_PEBS__mob_load_replay_retired 0x3000200
-#define P4_PEBS__split_load_retired 0x3000400
-#define P4_PEBS__split_store_retired 0x3000400
-
-#define P4_VERT__1stl_cache_load_miss_retired 0x0000001
-#define P4_VERT__2ndl_cache_load_miss_retired 0x0000001
-#define P4_VERT__dtlb_load_miss_retired 0x0000001
-#define P4_VERT__dtlb_store_miss_retired 0x0000002
-#define P4_VERT__dtlb_all_miss_retired 0x0000003
-#define P4_VERT__tagged_mispred_branch 0x0000010
-#define P4_VERT__mob_load_replay_retired 0x0000001
-#define P4_VERT__split_load_retired 0x0000001
-#define P4_VERT__split_store_retired 0x0000002
-
-enum P4_CACHE_EVENTS {
- P4_CACHE__NONE,
-
- P4_CACHE__1stl_cache_load_miss_retired,
- P4_CACHE__2ndl_cache_load_miss_retired,
- P4_CACHE__dtlb_load_miss_retired,
- P4_CACHE__dtlb_store_miss_retired,
- P4_CACHE__itlb_reference_hit,
- P4_CACHE__itlb_reference_miss,
-
- P4_CACHE__MAX
+/*
+ * P4 PEBS specifics (Replay Event only)
+ *
+ * Format (bits):
+ * 0-6: metric from P4_PEBS_METRIC enum
+ * 7 : reserved
+ * 8 : reserved
+ * 9-11 : reserved
+ *
+ * Note we have UOP and PEBS bits reserved for now
+ * just in case if we will need them once
+ */
+#define P4_PEBS_CONFIG_ENABLE (1 << 7)
+#define P4_PEBS_CONFIG_UOP_TAG (1 << 8)
+#define P4_PEBS_CONFIG_METRIC_MASK 0x3f
+#define P4_PEBS_CONFIG_MASK 0xff
+
+/*
+ * mem: Only counters MSR_IQ_COUNTER4 (16) and
+ * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
+ */
+#define P4_PEBS_ENABLE 0x02000000U
+#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U
+
+#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
+#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
+
+#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
+
+enum P4_PEBS_METRIC {
+ P4_PEBS_METRIC__none,
+
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_store_miss_retired,
+ P4_PEBS_METRIC__dtlb_all_miss_retired,
+ P4_PEBS_METRIC__tagged_mispred_branch,
+ P4_PEBS_METRIC__mob_load_replay_retired,
+ P4_PEBS_METRIC__split_load_retired,
+ P4_PEBS_METRIC__split_store_retired,
+
+ P4_PEBS_METRIC__max
};
#endif /* PERF_EVENT_P4_H */
+
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..ada823a13c7c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
extern spinlock_t pgd_lock;
extern struct list_head pgd_list;
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
pte_update(mm, addr, ptep);
}
+#define flush_tlb_fix_spurious_fault(vma, address)
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 2984a25ff383..0c92113c4cb6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -26,6 +26,7 @@ struct mm_struct;
struct vm_area_struct;
extern pgd_t swapper_pg_dir[1024];
+extern pgd_t initial_page_table[1024];
static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
@@ -48,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
#endif
#if defined(CONFIG_HIGHPTE)
-#define __KM_PTE \
- (in_nmi() ? KM_NMI_PTE : \
- in_irq() ? KM_IRQ_PTE : \
- KM_PTE0)
#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
pte_index((address)))
-#define pte_offset_map_nested(dir, address) \
- ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
- pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
-#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#define pte_unmap(pte) kunmap_atomic((pte))
#else
#define pte_offset_map(dir, address) \
((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
-#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
#define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
#endif
/* Clear a kernel PTE and flush it from the TLB */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 181be528c612..f86da20347f2 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
@@ -125,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* x86-64 always has all page tables mapped. */
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
-#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
-#define pte_unmap(pte) /* NOP */
-#define pte_unmap_nested(pte) /* NOP */
+#define pte_unmap(pte) ((void)(pte))/* NOP */
#define update_mmu_cache(vma, address, ptep) do { } while (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7e5c6a60b8ee..cae9c3cb95cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
u16 phys_proc_id;
/* Core id: */
u16 cpu_core_id;
+ /* Compute unit id */
+ u8 compute_unit_id;
/* Index into per_cpu list: */
u16 cpu_index;
#endif
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features;
static inline void set_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features |= mask;
cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
static inline void clear_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;
mmu_cr4_features &= ~mask;
cr4 = read_cr4();
@@ -762,29 +764,7 @@ extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
extern unsigned long idle_halt;
extern unsigned long idle_nomwait;
-
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt. Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions. wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
- mb();
- /* check for clflush to determine if wbinvd is legal */
- if (cpu_has_clflush)
- asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
- else
- while (1)
- halt();
-}
+extern bool c1e_detected;
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
@@ -1025,4 +1005,24 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
return ratio;
}
+/*
+ * AMD errata checking
+ */
+#ifdef CONFIG_CPU_SUP_AMD
+extern const int amd_erratum_383[];
+extern const int amd_erratum_400[];
+extern bool cpu_has_amd_erratum(const int *);
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+#else
+#define cpu_has_amd_erratum(x) (false)
+#endif /* CONFIG_CPU_SUP_AMD */
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6b..7f7e577a0e39 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 64cf2d24fad1..6c7fc25f2c34 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -84,5 +84,7 @@
#define REQUIRED_MASK5 0
#define REQUIRED_MASK6 0
#define REQUIRED_MASK7 0
+#define REQUIRED_MASK8 0
+#define REQUIRED_MASK9 0
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 606ede126972..d1e41b0f9b60 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem)
{
asm volatile("# beginning down_read\n\t"
LOCK_PREFIX _ASM_INC "(%1)\n\t"
- /* adds 0x00000001, returns the old value */
+ /* adds 0x00000001 */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
"1:\n\t"
@@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
{
rwsem_count_t tmp;
-
- tmp = RWSEM_ACTIVE_WRITE_BIAS;
asm volatile("# beginning down_write\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
- /* subtract 0x0000ffff, returns the old value */
+ /* adds 0xffff0001, returns the old value */
" test %1,%1\n\t"
/* was the count 0 before? */
" jz 1f\n"
@@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
"1:\n"
"# ending down_write"
: "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
+ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
: "memory", "cc");
}
@@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
*/
static inline void __up_read(struct rw_semaphore *sem)
{
- rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
+ rwsem_count_t tmp;
asm volatile("# beginning __up_read\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
/* subtracts 1, returns the old value */
" jns 1f\n\t"
- " call call_rwsem_wake\n"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
"1:\n"
"# ending __up_read\n"
: "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (tmp)
+ : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS)
: "memory", "cc");
}
@@ -216,10 +214,9 @@ static inline void __up_write(struct rw_semaphore *sem)
rwsem_count_t tmp;
asm volatile("# beginning __up_write\n\t"
LOCK_PREFIX " xadd %1,(%2)\n\t"
- /* tries to transition
- 0xffff0001 -> 0x00000000 */
- " jz 1f\n"
- " call call_rwsem_wake\n"
+ /* subtracts 0xffff0001, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
"1:\n\t"
"# ending __up_write\n"
: "+m" (sem->count), "=d" (tmp)
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index fb0b1874396f..4240878b9d76 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -3,7 +3,6 @@
#include <asm-generic/scatterlist.h>
-#define ISA_DMA_THRESHOLD (0x00ffffff)
#define ARCH_HAS_SG_CHAIN
#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 14e0ed86a6f9..231f1c1d6607 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -73,31 +73,31 @@
#define GDT_ENTRY_DEFAULT_USER_DS 15
-#define GDT_ENTRY_KERNEL_BASE 12
+#define GDT_ENTRY_KERNEL_BASE (12)
-#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0)
-#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1)
-#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
-#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5)
-#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
-#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11)
-#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
-#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
#ifdef CONFIG_SMP
#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
#else
#define __KERNEL_PERCPU 0
#endif
-#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16)
+#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
#ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8)
+#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
#else
#define __KERNEL_STACK_CANARY 0
#endif
@@ -182,10 +182,10 @@
#endif
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 86b1506f4179..d6763b139a84 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align);
* executable.)
*/
#define RESERVE_BRK(name,sz) \
- static void __section(.discard) __used \
+ static void __section(.discard.text) __used \
__brk_reservation_fn_##name##__(void) { \
asm volatile ( \
".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
: : "i" (sz)); \
}
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries) \
+ type *name; \
+ RESERVE_BRK(name, sizeof(type) * entries)
+
#ifdef __i386__
void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4cfc90824068..4c2f63c7fc1b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -50,7 +50,7 @@ struct smp_ops {
void (*smp_prepare_cpus)(unsigned max_cpus);
void (*smp_cpus_done)(unsigned max_cpus);
- void (*smp_send_stop)(void);
+ void (*stop_other_cpus)(int wait);
void (*smp_send_reschedule)(int cpu);
int (*cpu_up)(unsigned cpu);
@@ -73,7 +73,12 @@ extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
{
- smp_ops.smp_send_stop();
+ smp_ops.stop_other_cpus(0);
+}
+
+static inline void stop_other_cpus(void)
+{
+ smp_ops.stop_other_cpus(1);
}
static inline void smp_prepare_boot_cpu(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 4dab78edbad9..2b16a2ad23dc 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -1,6 +1,13 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H
+#include <linux/uaccess.h>
+
extern int kstack_depth_to_print;
struct thread_info;
@@ -42,4 +49,46 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data);
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+
+static inline unsigned long caller_frame_pointer(void)
+{
+ struct stack_frame *frame;
+
+ get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+ frame = frame->next_frame;
+#endif
+
+ return (unsigned long)frame;
+}
+
#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 8085277e1b8b..977f1761a25d 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -5,17 +5,26 @@
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
-extern int __init pci_swiotlb_detect(void);
+extern int __init pci_swiotlb_detect_override(void);
+extern int __init pci_swiotlb_detect_4gb(void);
extern void __init pci_swiotlb_init(void);
+extern void __init pci_swiotlb_late_init(void);
#else
#define swiotlb 0
-static inline int pci_swiotlb_detect(void)
+static inline int pci_swiotlb_detect_override(void)
+{
+ return 0;
+}
+static inline int pci_swiotlb_detect_4gb(void)
{
return 0;
}
static inline void pci_swiotlb_init(void)
{
}
+static inline void pci_swiotlb_late_init(void)
+{
+}
#endif
static inline void dma_mark_clean(void *addr, size_t size) {}
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3ad421784ae7..cb238526a9f1 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -18,13 +18,13 @@
#include <asm/ia32.h>
/* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
-asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, char __user *,
+asmlinkage long sys32_fstatat(unsigned int, const char __user *,
struct stat64 __user *, int);
struct mmap_arg_struct32;
asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
@@ -49,12 +49,12 @@ asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
asmlinkage long sys32_personality(unsigned long);
asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
-asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
compat_uptr_t __user *, struct pt_regs *);
asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
@@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
/* ia32/ipc32.c */
asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+
+asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int,
+ const char __user *);
#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 5c044b43e9a7..f1d8b441fc77 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,8 +23,9 @@ long sys_iopl(unsigned int, struct pt_regs *);
/* kernel/process.c */
int sys_fork(struct pt_regs *);
int sys_vfork(struct pt_regs *);
-long sys_execve(char __user *, char __user * __user *,
- char __user * __user *, struct pt_regs *);
+long sys_execve(const char __user *,
+ const char __user *const __user *,
+ const char __user *const __user *, struct pt_regs *);
long sys_clone(unsigned long, unsigned long, void __user *,
void __user *, struct pt_regs *);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index e7f4d33c55ed..33ecc3ea8782 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,4 +457,11 @@ static __always_inline void rdtsc_barrier(void)
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
+/*
+ * We handle most unaligned accesses in hardware. On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN 0
#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7de..169be8938b96 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
-extern void zap_low_mappings(bool early);
-
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index cb507bb05d79..f4500fb3b485 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -20,7 +20,7 @@ extern unsigned long initial_gs;
extern unsigned long setup_trampoline(void);
extern void __init reserve_trampoline_memory(void);
#else
-static inline void reserve_trampoline_memory(void) {};
+static inline void reserve_trampoline_memory(void) {}
#endif /* CONFIG_X86_TRAMPOLINE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index c0427295e8f5..1ca132fc0d03 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -59,5 +59,7 @@ extern void check_tsc_sync_source(int cpu);
extern void check_tsc_sync_target(void);
extern int notsc_setup(char *);
+extern void save_sched_clock_state(void);
+extern void restore_sched_clock_state(void);
#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index beb9b5f8f8a4..b766a5e8ba0e 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,13 @@
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
+#define __NR_fanotify_init 338
+#define __NR_fanotify_mark 339
+#define __NR_prlimit64 340
#ifdef __KERNEL__
-#define NR_syscalls 338
+#define NR_syscalls 341
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index ff4307b0e81e..363e9b8a715b 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,12 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_recvmmsg 299
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_fanotify_init 300
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark 301
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
+#define __NR_prlimit64 302
+__SYSCALL(__NR_prlimit64, sys_prlimit64)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index aa558ac0306e..42d412fd8b02 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -34,6 +34,7 @@
*/
#define UV_ITEMS_PER_DESCRIPTOR 8
+/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
@@ -45,10 +46,26 @@
#define UV_DESC_BASE_PNODE_SHIFT 49
#define UV_PAYLOADQ_PNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
+#define UV_BAU_TUNABLES_DIR "sgi_uv"
+#define UV_BAU_TUNABLES_FILE "bau_tunables"
+#define WHITESPACE " \t\n"
#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
+/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+/* [30:28] URGENCY_7 an index into a table of times */
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
+
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+/* [45:40] BAU - BAU transaction timeout select - a multiplier */
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
/*
* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -59,24 +76,21 @@
#define DESC_STATUS_SOURCE_TIMEOUT 3
/*
- * source side threshholds at which message retries print a warning
- */
-#define SOURCE_TIMEOUT_LIMIT 20
-#define DESTINATION_TIMEOUT_LIMIT 20
-
-/*
- * misc. delays, in microseconds
+ * delay for 'plugged' timeout retries, in microseconds
*/
-#define THROTTLE_DELAY 10
-#define TIMEOUT_DELAY 10
-#define BIOS_TO 1000
-/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+#define PLUGGED_DELAY 10
/*
* threshholds at which to use IPI to free resources
*/
+/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
#define PLUGSB4RESET 100
-#define TIMEOUTSB4RESET 100
+/* after this many consecutive timeouts, use IPI to release resources */
+#define TIMEOUTSB4RESET 1
+/* at this number uses of IPI to release resources, giveup the request */
+#define IPI_RESET_LIMIT 1
+/* after this # consecutive successes, bump up the throttle if it was lowered */
+#define COMPLETE_THRESHOLD 5
/*
* number of entries in the destination side payload queue
@@ -96,6 +110,13 @@
#define FLUSH_COMPLETE 4
/*
+ * tuning the action when the numalink network is extremely delayed
+ */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+
+/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
* If the 'multilevel' flag in the header portion of the descriptor
* has been set to 0, then endpoint multi-unicast mode is selected.
@@ -300,37 +321,16 @@ struct bau_payload_queue_entry {
/* bytes 24-31 */
};
-/*
- * one per-cpu; to locate the software tables
- */
-struct bau_control {
- struct bau_desc *descriptor_base;
+struct msg_desc {
+ struct bau_payload_queue_entry *msg;
+ int msg_slot;
+ int sw_ack_slot;
struct bau_payload_queue_entry *va_queue_first;
struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *bau_msg_head;
- struct bau_control *uvhub_master;
- struct bau_control *socket_master;
- unsigned long timeout_interval;
- atomic_t active_descriptor_count;
- int max_concurrent;
- int max_concurrent_constant;
- int retry_message_scans;
- int plugged_tries;
- int timeout_tries;
- int ipi_attempts;
- int conseccompletes;
- short cpu;
- short uvhub_cpu;
- short uvhub;
- short cpus_in_socket;
- short cpus_in_uvhub;
- unsigned short message_number;
- unsigned short uvhub_quiesce;
- short socket_acknowledge_count[DEST_Q_SIZE];
- cycles_t send_message;
- spinlock_t masks_lock;
- spinlock_t uvhub_lock;
- spinlock_t queue_lock;
+};
+
+struct reset_args {
+ int sender;
};
/*
@@ -344,18 +344,25 @@ struct ptc_stats {
unsigned long s_dtimeout; /* destination side timeouts */
unsigned long s_time; /* time spent in sending side */
unsigned long s_retriesok; /* successful retries */
- unsigned long s_ntargcpu; /* number of cpus targeted */
- unsigned long s_ntarguvhub; /* number of uvhubs targeted */
- unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
- unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
- unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
- unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
- unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
+ unsigned long s_ntargcpu; /* total number of cpu's targeted */
+ unsigned long s_ntargself; /* times the sending cpu was targeted */
+ unsigned long s_ntarglocals; /* targets of cpus on the local blade */
+ unsigned long s_ntargremotes; /* targets of cpus on remote blades */
+ unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+ unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
+ unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
+ unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
+ unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
+ unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
+ unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
+ unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
unsigned long s_resets_plug; /* ipi-style resets from plug state */
unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
unsigned long s_busy; /* status stayed busy past s/w timer */
unsigned long s_throttles; /* waits in throttle */
unsigned long s_retry_messages; /* retry broadcasts */
+ unsigned long s_bau_reenabled; /* for bau enable/disable */
+ unsigned long s_bau_disabled; /* for bau enable/disable */
/* destination statistics */
unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
@@ -370,6 +377,52 @@ struct ptc_stats {
unsigned long d_rcanceled; /* number of messages canceled by resets */
};
+/*
+ * one per-cpu; to locate the software tables
+ */
+struct bau_control {
+ struct bau_desc *descriptor_base;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+ struct bau_payload_queue_entry *bau_msg_head;
+ struct bau_control *uvhub_master;
+ struct bau_control *socket_master;
+ struct ptc_stats *statp;
+ unsigned long timeout_interval;
+ unsigned long set_bau_on_time;
+ atomic_t active_descriptor_count;
+ int plugged_tries;
+ int timeout_tries;
+ int ipi_attempts;
+ int conseccompletes;
+ int baudisabled;
+ int set_bau_off;
+ short cpu;
+ short uvhub_cpu;
+ short uvhub;
+ short cpus_in_socket;
+ short cpus_in_uvhub;
+ unsigned short message_number;
+ unsigned short uvhub_quiesce;
+ short socket_acknowledge_count[DEST_Q_SIZE];
+ cycles_t send_message;
+ spinlock_t uvhub_lock;
+ spinlock_t queue_lock;
+ /* tunables */
+ int max_bau_concurrent;
+ int max_bau_concurrent_constant;
+ int plugged_delay;
+ int plugsb4reset;
+ int timeoutsb4reset;
+ int ipi_reset_limit;
+ int complete_threshold;
+ int congested_response_us;
+ int congested_reps;
+ int congested_period;
+ cycles_t period_time;
+ long period_requests;
+};
+
static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
{
return constant_test_bit(uvhub, &dstp->bits[0]);
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index bf6b88ef8eeb..e969f691cbfd 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
*
* SGI UV architectural definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_HUB_H
@@ -77,7 +77,8 @@
*
* 1111110000000000
* 5432109876543210
- * pppppppppplc0cch
+ * pppppppppplc0cch Nehalem-EX
+ * ppppppppplcc0cch Westmere-EX
* sssssssssss
*
* p = pnode bits
@@ -148,12 +149,25 @@ struct uv_hub_info_s {
unsigned char m_val;
unsigned char n_val;
struct uv_scir_s scir;
+ unsigned char apic_pnode_shift;
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+union uvh_apicid {
+ unsigned long v;
+ struct uvh_apicid_s {
+ unsigned long local_apic_mask : 24;
+ unsigned long local_apic_shift : 5;
+ unsigned long unused1 : 3;
+ unsigned long pnode_mask : 24;
+ unsigned long pnode_shift : 5;
+ unsigned long unused2 : 3;
+ } s;
+};
+
/*
* Local & Global MMR space macros.
* Note: macros are intended to be used ONLY by inline functions
@@ -182,6 +196,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
(((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+#define UVH_APICID 0x002D0E00L
#define UV_APIC_PNODE_SHIFT 6
/* Local Bus from cpu's perspective */
@@ -280,7 +295,7 @@ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
*/
static inline int uv_apicid_to_pnode(int apicid)
{
- return (apicid >> UV_APIC_PNODE_SHIFT);
+ return (apicid >> uv_hub_info->apic_pnode_shift);
}
/*
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include <linux/types.h>
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE 0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR 3
-#define VMI_API_REV_MINOR 0
-
-#define VMI_CALL_CPUID 0
-#define VMI_CALL_WRMSR 1
-#define VMI_CALL_RDMSR 2
-#define VMI_CALL_SetGDT 3
-#define VMI_CALL_SetLDT 4
-#define VMI_CALL_SetIDT 5
-#define VMI_CALL_SetTR 6
-#define VMI_CALL_GetGDT 7
-#define VMI_CALL_GetLDT 8
-#define VMI_CALL_GetIDT 9
-#define VMI_CALL_GetTR 10
-#define VMI_CALL_WriteGDTEntry 11
-#define VMI_CALL_WriteLDTEntry 12
-#define VMI_CALL_WriteIDTEntry 13
-#define VMI_CALL_UpdateKernelStack 14
-#define VMI_CALL_SetCR0 15
-#define VMI_CALL_SetCR2 16
-#define VMI_CALL_SetCR3 17
-#define VMI_CALL_SetCR4 18
-#define VMI_CALL_GetCR0 19
-#define VMI_CALL_GetCR2 20
-#define VMI_CALL_GetCR3 21
-#define VMI_CALL_GetCR4 22
-#define VMI_CALL_WBINVD 23
-#define VMI_CALL_SetDR 24
-#define VMI_CALL_GetDR 25
-#define VMI_CALL_RDPMC 26
-#define VMI_CALL_RDTSC 27
-#define VMI_CALL_CLTS 28
-#define VMI_CALL_EnableInterrupts 29
-#define VMI_CALL_DisableInterrupts 30
-#define VMI_CALL_GetInterruptMask 31
-#define VMI_CALL_SetInterruptMask 32
-#define VMI_CALL_IRET 33
-#define VMI_CALL_SYSEXIT 34
-#define VMI_CALL_Halt 35
-#define VMI_CALL_Reboot 36
-#define VMI_CALL_Shutdown 37
-#define VMI_CALL_SetPxE 38
-#define VMI_CALL_SetPxELong 39
-#define VMI_CALL_UpdatePxE 40
-#define VMI_CALL_UpdatePxELong 41
-#define VMI_CALL_MachineToPhysical 42
-#define VMI_CALL_PhysicalToMachine 43
-#define VMI_CALL_AllocatePage 44
-#define VMI_CALL_ReleasePage 45
-#define VMI_CALL_InvalPage 46
-#define VMI_CALL_FlushTLB 47
-#define VMI_CALL_SetLinearMapping 48
-
-#define VMI_CALL_SetIOPLMask 61
-#define VMI_CALL_SetInitialAPState 62
-#define VMI_CALL_APICWrite 63
-#define VMI_CALL_APICRead 64
-#define VMI_CALL_IODelay 65
-#define VMI_CALL_SetLazyMode 73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
-#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT 0x01
-#define VMI_PAGE_PD 0x02
-#define VMI_PAGE_PDP 0x04
-#define VMI_PAGE_PML4 0x08
-
-#define VMI_PAGE_NORMAL 0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK 0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB 0x01
-#define VMI_FLUSH_GLOBAL 0x02
-
-/*
- *---------------------------------------------------------------------
- *
- * VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE 0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP 3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
- unsigned char *eip;
- unsigned char type;
- unsigned char reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- * Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
- u16 rom_signature; /* option ROM signature */
- u8 rom_length; /* ROM length in 512 byte chunks */
- u8 rom_entry[4]; /* 16-bit code entry point */
- u8 rom_pad0; /* 4-byte align pad */
- u32 vrom_signature; /* VROM identification signature */
- u8 api_version_min;/* Minor version of API */
- u8 api_version_maj;/* Major version of API */
- u8 jump_slots; /* Number of jump slots */
- u8 reserved1; /* Reserved for expansion */
- u32 virtual_top; /* Hypervisor virtual address start */
- u16 reserved2; /* Reserved for expansion */
- u16 license_offs; /* Offset to License string */
- u16 pci_header_offs;/* Offset to PCI OPROM header */
- u16 pnp_header_offs;/* Offset to PnP OPROM header */
- u32 rom_pad3; /* PnP reserverd / VMI reserved */
- u8 reserved[96]; /* Reserved for headers */
- char vmi_init[8]; /* VMI_Init jump point */
- char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
- char sig[4];
- char rev;
- char size;
- short next;
- short res;
- long devID;
- unsigned short manufacturer_offset;
- unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
- char sig[4];
- short vendorID;
- short deviceID;
- short vpdData;
- short size;
- char rev;
- char class;
- char subclass;
- char interface;
- short chunks;
- char rom_version_min;
- char rom_version_maj;
- char codetype;
- char lastRom;
- short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
- u32 cr0;
- u32 cr2;
- u32 cr3;
- u32 cr4;
-
- u64 efer;
-
- u32 eip;
- u32 eflags;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esp;
- u32 ebp;
- u32 esi;
- u32 edi;
- u16 cs;
- u16 ss;
- u16 ds;
- u16 es;
- u16 fs;
- u16 gs;
- u16 ldtr;
-
- u16 gdtr_limit;
- u32 gdtr_base;
- u32 idtr_base;
- u16 idtr_limit;
-};
-
-#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency 66
-#define VMI_CALL_GetCycleCounter 67
-#define VMI_CALL_SetAlarm 68
-#define VMI_CALL_CancelAlarm 69
-#define VMI_CALL_GetWallclockTime 70
-#define VMI_CALL_WallclockUpdated 71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
- u64 (*get_cycle_frequency)(void);
- u64 (*get_cycle_counter)(int);
- u64 (*get_wallclock)(void);
- int (*wallclock_updated)(void);
- void (*set_alarm)(u32 flags, u64 expiry, u64 period);
- void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready. The vcpu is in the 'running' state if it
- * is executing. When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it). Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'. Stolen time is the time in which the vcpu is in the
- * 'ready' state. Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL 0
-#define VMI_CYCLES_AVAILABLE 1
-#define VMI_CYCLES_STOLEN 2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS 2
-
-#define VMI_ALARM_COUNTER_MASK 0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-
-#define VMI_ALARM_IS_ONESHOT 0x00000000
-#define VMI_ALARM_IS_PERIODIC 0x00000100
-
-#define CONFIG_VMI_ALARM_HZ 100
-
-#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
#define EXIT_REASON_IO_INSTRUCTION 30
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -266,6 +267,7 @@ enum vmcs_field {
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
/*
* Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index baa579c8e038..64642ad019fb 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -154,9 +154,18 @@ struct x86_platform_ops {
int (*i8042_detect)(void);
};
+struct pci_dev;
+
+struct x86_msi_ops {
+ int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
+ void (*teardown_msi_irq)(unsigned int irq);
+ void (*teardown_msi_irqs)(struct pci_dev *dev);
+};
+
extern struct x86_init_ops x86_init;
extern struct x86_cpuinit_ops x86_cpuinit;
extern struct x86_platform_ops x86_platform;
+extern struct x86_msi_ops x86_msi;
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9c371e4a9fa6..a3c28ae4025b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[];
(type)__res; \
})
+static inline long
+privcmd_call(unsigned call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ __HYPERCALL_DECLS;
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+ asm volatile("call *%[call]"
+ : __HYPERCALL_5PARAM
+ : [call] "a" (&hypercall_page[call])
+ : __HYPERCALL_CLOBBER5);
+
+ return (long)__res;
+}
+
static inline int
HYPERVISOR_set_trap_table(struct trap_info *table)
{
@@ -417,6 +434,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
return _hypercall2(int, nmi_op, op, arg);
}
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 018a0a400799..dd8c1414b3d5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -37,14 +37,21 @@ typedef struct xpaddr {
extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
+ unsigned long mfn;
+
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+ mfn = get_phys_to_machine(pfn);
+
+ if (mfn != INVALID_P2M_ENTRY)
+ mfn &= ~FOREIGN_FRAME_BIT;
+
+ return mfn;
}
static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -112,13 +119,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
*/
static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
{
- extern unsigned long max_mapnr;
unsigned long pfn = mfn_to_pfn(mfn);
- if ((pfn < max_mapnr)
- && !xen_feature(XENFEAT_auto_translated_physmap)
- && (get_phys_to_machine(pfn) != mfn))
- return max_mapnr; /* force !pfn_valid() */
- /* XXX fixme; not true with sparsemem */
+ if (get_phys_to_machine(pfn) != mfn)
+ return -1; /* force !pfn_valid() */
return pfn;
}
@@ -163,6 +166,7 @@ static inline pte_t __pte_ma(pteval_t x)
#define pgd_val_ma(x) ((x).pgd)
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
xmaddr_t arbitrary_virt_to_machine(void *address);
unsigned long arbitrary_virt_to_mfn(void *vaddr);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 000000000000..2329b3eaf8d3
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_XEN)
+extern int __init pci_xen_init(void);
+extern int __init pci_xen_hvm_init(void);
+#define pci_xen 1
+#else
+#define pci_xen 0
+#define pci_xen_init (0)
+static inline int pci_xen_hvm_init(void)
+{
+ return -1;
+}
+#endif
+#if defined(CONFIG_XEN_DOM0)
+void __init xen_setup_pirqs(void);
+#else
+static inline void __init xen_setup_pirqs(void)
+{
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+/* The drivers/pci/xen-pcifront.c sets this structure to
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+ int (*enable_msi)(struct pci_dev *dev, int **vectors);
+ void (*disable_msi)(struct pci_dev *dev);
+ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+ void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+ int **vectors)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+ return xen_pci_frontend->enable_msi(dev, vectors);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+ xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+ int **vectors, int nvec)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+ return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+ xen_pci_frontend->disable_msix(dev);
+}
+#endif /* CONFIG_PCI_XEN */
+#endif /* CONFIG_PCI_MSI */
+
+#endif /* _ASM_X86_XEN_PCI_H */
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 000000000000..1be1ab7d6a41
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_SWIOTLB_XEN_H
+#define _ASM_X86_SWIOTLB_XEN_H
+
+#ifdef CONFIG_SWIOTLB_XEN
+extern int xen_swiotlb;
+extern int __init pci_xen_swiotlb_detect(void);
+extern void __init pci_xen_swiotlb_init(void);
+#else
+#define xen_swiotlb (0)
+static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
+static inline void __init pci_xen_swiotlb_init(void) { }
+#endif
+
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..c6ce2452f10c 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -3,7 +3,8 @@
#include <linux/types.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+
+#define XSTATE_CPUID 0x0000000d
#define XSTATE_FP 0x1
#define XSTATE_SSE 0x2
@@ -13,6 +14,12 @@
#define FXSAVE_SIZE 512
+#define XSAVE_HDR_SIZE 64
+#define XSAVE_HDR_OFFSET FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE 256
+#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
/*
* These are the features that the OS can handle currently.
*/
@@ -26,10 +33,8 @@
extern unsigned int xstate_size;
extern u64 pcntxt_mask;
-extern struct xsave_struct *init_xstate_buf;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern void xsave_cntxt_init(void);
extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
@@ -59,6 +64,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu)
static inline int xsave_user(struct xsave_struct __user *buf)
{
int err;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ err = __clear_user(&buf->xsave_hdr,
+ sizeof(struct xsave_hdr_struct));
+ if (unlikely(err))
+ return -EFAULT;
+
__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -111,12 +126,25 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
: "memory");
}
+static inline void xsave_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
static inline void fpu_xsave(struct fpu *fpu)
{
/* This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
- : : "D" (&(fpu->state->xsave)),
- "a" (-1), "d"(-1) : "memory");
+ alternative_input(
+ ".byte " REX_PREFIX "0x0f,0xae,0x27",
+ ".byte " REX_PREFIX "0x0f,0xae,0x37",
+ X86_FEATURE_XSAVEOPT,
+ [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
+ "memory");
}
#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..9e13763b6092 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
@@ -32,8 +34,8 @@ GCOV_PROFILE_paravirt.o := n
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
-obj-y += setup.o x86_init.o i8259.o irqinit.o
-obj-$(CONFIG_X86_VISWS) += visws_quirks.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
@@ -42,6 +44,7 @@ obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
@@ -54,7 +57,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
-obj-$(CONFIG_SFI) += sfi.o
obj-y += reboot.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
@@ -78,20 +80,19 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
-obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -100,12 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-obj-$(CONFIG_SCx200) += scx200.o
-scx200-y += scx200_32.o
-
-obj-$(CONFIG_OLPC) += olpc.o
-obj-$(CONFIG_X86_MRST) += mrst.o
-
microcode-y := microcode_core.o
microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -118,8 +113,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce0..71232b941b6c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -513,35 +513,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
return 0;
}
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
-
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (trigger == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ eisa_set_level_irq(gsi);
#endif
+ return gsi;
+}
+
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
- }
+ gsi = mp_register_gsi(dev, gsi, trigger, polarity);
#endif
+
+ return gsi;
+}
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ unsigned int irq;
+ unsigned int plat_gsi = gsi;
+
+ plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
irq = gsi_to_irq(plat_gsi);
return irq;
}
+void __init acpi_set_irq_model_pic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+ __acpi_register_gsi = acpi_register_gsi_pic;
+ acpi_ioapic = 0;
+}
+
+void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+
/*
* ACPI based hotplug support for CPU
*/
@@ -1259,8 +1286,7 @@ static void __init acpi_process_madt(void)
*/
error = acpi_parse_madt_ioapic_entries();
if (!error) {
- acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_ioapic = 1;
+ acpi_set_irq_model_ioapic();
smp_found_config = 1;
}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
+#include <asm/mwait.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -61,20 +62,10 @@ struct cstate_entry {
unsigned int ecx;
} states[ACPI_PROCESSOR_MAX_POWER];
};
-static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
movl %eax, %ecx
orl %edx, %ecx
jz 1f
- movl $0xc0000080, %ecx
+ movl $MSR_EFER, %ecx
wrmsr
1:
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fcc3c61fdecc..69fd72aa5594 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,16 +2,21 @@
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
#include <asm/desc.h>
+#ifdef CONFIG_X86_32
+#include <asm/pgtable.h>
+#endif
+
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -90,7 +95,7 @@ int acpi_save_state_mem(void)
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ header->pmode_cr3 = (u32)__pa(&initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
@@ -125,7 +130,7 @@ void acpi_restore_state_mem(void)
*/
void __init acpi_reserve_wakeup_memory(void)
{
- unsigned long mem;
+ phys_addr_t mem;
if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
@@ -133,15 +138,15 @@ void __init acpi_reserve_wakeup_memory(void)
return;
}
- mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
+ mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
- if (mem == -1L) {
+ if (mem == MEMBLOCK_ERROR) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
return;
}
acpi_realmode = (unsigned long) phys_to_virt(mem);
acpi_wakeup_address = mem;
- reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
+ memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..5079f24c955a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
#ifdef CONFIG_X86_64
@@ -521,7 +522,7 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected again NMI or MCE
* handlers seeing an inconsistent instruction while you patch.
*/
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
@@ -636,7 +637,33 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
tpp.len = len;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
- stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ /* Use __stop_machine() because the caller already got online_cpus. */
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
return addr;
}
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+
+#ifdef CONFIG_X86_64
+unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
+#else
+unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
+#endif
+
+void __init arch_init_ideal_nop5(void)
+{
+ /*
+ * There is no good nop for all x86 archs. This selection
+ * algorithm should be unified with the one in find_nop_table(),
+ * but this should be good enough for now.
+ *
+ * For cases other than the ones below, use the safe (as in
+ * always functional) defaults above.
+ */
+#ifdef CONFIG_X86_64
+ /* Don't use these on 32 bits due to broken virtualizers */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ memcpy(ideal_nop5, p6_nops[5], 5);
+#endif
+}
+#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..d2fdb0826df2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
size_t size,
int dir)
{
+ dma_addr_t flush_addr;
dma_addr_t i, start;
unsigned int pages;
@@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
(dma_addr + size > dma_dom->aperture_size))
return;
+ flush_addr = dma_addr;
pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
dma_addr &= PAGE_MASK;
start = dma_addr;
@@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, dma_addr, size);
+ iommu_flush_pages(&dma_dom->domain, flush_addr, size);
dma_dom->need_flush = false;
}
}
@@ -2572,6 +2574,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
unsigned long cap)
{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return 1;
+ }
+
return 0;
}
@@ -2609,8 +2616,7 @@ int __init amd_iommu_init_passthrough(void)
pt_domain->mode |= PAGE_MODE_NONE;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+ for_each_pci_dev(dev) {
if (!check_device(&dev->dev))
continue;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3cc63e2b8dd4..6e11c8134158 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
* Author: Joerg Roedel <joerg.roedel@amd.com>
* Leo Duran <leo.duran@amd.com>
*
@@ -31,7 +31,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/x86_init.h>
-
+#include <asm/iommu_table.h>
/*
* definitions for the ACPI scanning code
*/
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
return 1UL << shift;
}
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+ pci_read_config_dword(iommu->dev, 0xfc, &val);
+ return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+ pci_write_config_dword(iommu->dev, 0xfc, val);
+ pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+ u32 val;
+
+ pci_write_config_dword(iommu->dev, 0xf0, address);
+ pci_read_config_dword(iommu->dev, 0xf4, &val);
+ return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+ pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+ pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
/****************************************************************************
*
* AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
u32 range, misc;
+ int i, j;
pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
&iommu->cap);
@@ -632,6 +666,30 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
iommu->last_device = calc_devid(MMIO_GET_BUS(range),
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * Some rd890 systems may not be fully reconfigured by the BIOS, so
+ * it's necessary for us to store this information so it can be
+ * reprogrammed on resume
+ */
+
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ &iommu->stored_addr_lo);
+ pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ &iommu->stored_addr_hi);
+
+ /* Low bit locks writes to configuration space */
+ iommu->stored_addr_lo &= ~1;
+
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+ for (i = 0; i < 0x83; i++)
+ iommu->stored_l2[i] = iommu_read_l2(iommu, i);
}
/*
@@ -649,29 +707,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
struct ivhd_entry *e;
/*
- * First set the recommended feature enable bits from ACPI
- * into the IOMMU control registers
- */
- h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- h->flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
+ * First save the recommended feature enable bits from ACPI
*/
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+ iommu->acpi_flags = h->flags;
/*
* Done. Now parse the device entries
@@ -1116,6 +1154,79 @@ static void init_device_table(void)
}
}
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+ iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
+{
+ int i, j;
+ u32 ioc_feature_control;
+ struct pci_dev *pdev = NULL;
+
+ /* RD890 BIOSes may not have completely reconfigured the iommu */
+ if (!is_rd890_iommu(iommu->dev))
+ return;
+
+ /*
+ * First, we need to ensure that the iommu is enabled. This is
+ * controlled by a register in the northbridge
+ */
+ pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+ if (!pdev)
+ return;
+
+ /* Select Northbridge indirect register 0x75 and enable writing */
+ pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+ pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+ /* Enable the iommu */
+ if (!(ioc_feature_control & 0x1))
+ pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+ pci_dev_put(pdev);
+
+ /* Restore the iommu BAR */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo);
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+ iommu->stored_addr_hi);
+
+ /* Restore the l1 indirect regs for each of the 6 l1s */
+ for (i = 0; i < 6; i++)
+ for (j = 0; j < 0x12; j++)
+ iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+ /* Restore the l2 indirect regs */
+ for (i = 0; i < 0x83; i++)
+ iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+ /* Lock PCI setup registers */
+ pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+ iommu->stored_addr_lo | 1);
+}
+
/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
@@ -1126,6 +1237,7 @@ static void enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable(iommu);
+ iommu_init_flags(iommu);
iommu_set_device_table(iommu);
iommu_enable_command_buffer(iommu);
iommu_enable_event_buffer(iommu);
@@ -1150,6 +1262,11 @@ static void disable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_apply_resume_quirks(iommu);
+
/* re-load the hardware */
enable_iommus();
@@ -1382,13 +1499,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
return 0;
}
-void __init amd_iommu_detect(void)
+int __init amd_iommu_detect(void)
{
if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return;
+ return -ENODEV;
if (amd_iommu_disabled)
- return;
+ return -ENODEV;
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
@@ -1397,7 +1514,9 @@ void __init amd_iommu_detect(void)
/* Make sure ACS will be enabled */
pci_request_acs();
+ return 1;
}
+ return -ENODEV;
}
/****************************************************************************
@@ -1428,3 +1547,8 @@ static int __init parse_amd_iommu_options(char *str)
__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
+
+IOMMU_INIT_FINISH(amd_iommu_detect,
+ gart_iommu_hole_init,
+ 0,
+ 0);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
index 0f7bc20cfcde..8f6463d8ed0d 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
+#include <asm/amd_nb.h>
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
-struct pci_dev **k8_northbridges;
+struct k8_northbridge_info k8_northbridges;
EXPORT_SYMBOL(k8_northbridges);
static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
int i;
struct pci_dev *dev;
- if (num_k8_northbridges)
+ if (k8_northbridges.num)
return 0;
dev = NULL;
while ((dev = next_k8_northbridge(dev)) != NULL)
- num_k8_northbridges++;
+ k8_northbridges.num++;
+
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ k8_northbridges.gart_supported = 1;
- k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
- GFP_KERNEL);
- if (!k8_northbridges)
+ k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
+ sizeof(void *), GFP_KERNEL);
+ if (!k8_northbridges.nb_misc)
return -ENOMEM;
- if (!num_k8_northbridges) {
- k8_northbridges[0] = NULL;
+ if (!k8_northbridges.num) {
+ k8_northbridges.nb_misc[0] = NULL;
return 0;
}
- flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges);
- return -ENOMEM;
+ if (k8_northbridges.gart_supported) {
+ flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
+ GFP_KERNEL);
+ if (!flush_words) {
+ kfree(k8_northbridges.nb_misc);
+ return -ENOMEM;
+ }
}
dev = NULL;
i = 0;
while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges[i] = dev;
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
+ k8_northbridges.nb_misc[i] = dev;
+ if (k8_northbridges.gart_supported)
+ pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
}
- k8_northbridges[i] = NULL;
+ k8_northbridges.nb_misc[i] = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
+ if (!k8_northbridges.gart_supported)
+ return;
+
/* Avoid races between AGP and IOMMU. In theory it's not needed
but I'm not sure if the hardware won't lose flush requests
when another is pending. This whole thing is so expensive anyways
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- pci_write_config_dword(k8_northbridges[i], 0x9c,
+ for (i = 0; i < k8_northbridges.num; i++) {
+ pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
flush_words[i]|1);
flushed++;
}
- for (i = 0; i < num_k8_northbridges; i++) {
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
- pci_read_config_dword(k8_northbridges[i],
+ pci_read_config_dword(k8_northbridges.nb_misc[i],
0x9c, &w);
if (!(w & 1))
break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..92543c73cf8e 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
+#include <asm/mrst.h>
#define APBT_MASK CLOCKSOURCE_MASK(32)
#define APBT_SHIFT 22
-#define APBT_CLOCKEVENT_RATING 150
+#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
#define APBT_MIN_DELTA_USEC 200
@@ -83,8 +84,6 @@ struct apbt_dev {
char name[10];
};
-int disable_apbt_percpu __cpuinitdata;
-
static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
};
/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp("apbt_only", arg) == 0)
- disable_apbt_percpu = 0;
- else if (strcmp("lapic_and_apbt", arg) == 0)
- disable_apbt_percpu = 1;
- else {
- pr_warning("X86 MRST timer option %s not recognised"
- " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
- arg);
- return -EINVAL;
- }
- return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
* start count down from 0xffff_ffff. this is done by toggling the enable bit
* then load initial load count to ~0.
*/
@@ -255,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
apbt_start_counter(phy_cs_timer_id);
}
-/* Setup IRQ routing via IOAPIC */
-#ifdef CONFIG_SMP
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
- struct irq_chip *chip;
- struct irq_desc *desc;
-
- /* timer0 irq has been setup early */
- if (adev->irq == 0)
- return;
- desc = irq_to_desc(adev->irq);
- chip = get_irq_chip(adev->irq);
- disable_irq(adev->irq);
- desc->status |= IRQ_MOVE_PCNTXT;
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
- set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
- enable_irq(adev->irq);
- if (system_state == SYSTEM_BOOTING)
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
-}
-#endif
-
static void apbt_enable_int(int n)
{
unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -335,7 +283,7 @@ static int __init apbt_clockevent_register(void)
adev->num = smp_processor_id();
memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
global_clock_event = &adev->evt;
printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -358,6 +306,27 @@ static int __init apbt_clockevent_register(void)
}
#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+
+ if (system_state == SYSTEM_BOOTING) {
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+ if (request_irq(adev->irq, apbt_interrupt_handler,
+ IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ adev->name, adev)) {
+ printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+ adev->num);
+ }
+ } else
+ enable_irq(adev->irq);
+}
+
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
@@ -367,7 +336,7 @@ void apbt_setup_secondary_clock(void)
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
- if (cpu == boot_cpu_id)
+ if (!cpu)
return;
/*
* We need to calculate the scaled math multiplication factor for
@@ -413,23 +382,25 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_DEAD:
+ disable_irq(adev->irq);
apbt_disable_int(cpu);
- if (system_state == SYSTEM_RUNNING)
+ if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
- else if (adev) {
+ } else if (adev) {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
free_irq(adev->irq, adev);
}
break;
default:
- pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+ pr_debug("APBT notified %lu, no action\n", action);
}
return NOTIFY_OK;
}
static __init int apbt_late_init(void)
{
- if (disable_apbt_percpu || !apb_timer_block_enabled)
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ !apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +421,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
int timer_num;
struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+ BUG_ON(!apbt_virt_address);
+
timer_num = adev->num;
pr_debug("%s CPU %d timer %d mode=%d\n",
__func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -573,7 +546,7 @@ bad_count:
pr_debug("APB CS going back %lx:%lx:%lx ",
t2, last_read, t2 - last_read);
bad_count_x3:
- pr_debug(KERN_INFO "tripple check enforced\n");
+ pr_debug("triple check enforced\n");
t0 = apbt_readl(phy_cs_timer_id,
APBTMR_N_CURRENT_VALUE);
udelay(1);
@@ -676,7 +649,7 @@ void __init apbt_time_init(void)
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..b3a16e8f0703 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/x86_init.h>
int gart_iommu_aperture;
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 agp_aper_order = 0;
int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
return;
/* This is mostly duplicate of iommu_hole_init */
- agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+ search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void)
static int __initdata printed_gart_size_msg;
-void __init gart_iommu_hole_init(void)
+int __init gart_iommu_hole_init(void)
{
u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void)
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
- return;
+ return -ENODEV;
printk(KERN_INFO "Checking aperture...\n");
@@ -463,8 +463,9 @@ out:
unsigned long n = (32 * 1024 * 1024) << last_aper_order;
insert_aperture_resource((u32)last_aper_base, n);
+ return 1;
}
- return;
+ return 0;
}
if (!fallback_aper_force) {
@@ -500,13 +501,18 @@ out:
panic("Not enough memory for aperture");
}
} else {
- return;
+ return 0;
}
/* Fix up the north bridges */
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = DISTLBWALKPRB | aper_order << 1;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -515,13 +521,12 @@ out:
if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
set_up_gart_resume(aper_order, aper_alloc);
+
+ return 1;
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6cab..850657d1b0ed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -52,6 +52,7 @@
#include <asm/mce.h>
#include <asm/kvm_para.h>
#include <asm/tsc.h>
+#include <asm/atomic.h>
unsigned int num_processors;
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
*
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
*
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determines the offsets, it is not
+ * necessarily a BIOS bug.
*/
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
}
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
+ unsigned int rsvd; /* 0: uninitialized */
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ do {
+ if (rsvd &&
+ !eilvt_entry_is_changeable(rsvd, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+ } while (rsvd != new);
+
+ return new;
}
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
+ "vector 0x%x was already reserved by another core, "
+ "APIC%lX=0x%x\n",
+ smp_processor_id(), new, reserved, reg, old);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
+ "register already in use, APIC%lX=0x%x\n",
+ smp_processor_id(), new, reg, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
/*
* Program the next event, relative to now
@@ -460,7 +510,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
}
/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
static void __cpuinit setup_APIC_timer(void)
@@ -1606,7 +1656,7 @@ void __init init_apic_mappings(void)
* acpi lapic path already maps that address in
* acpi_register_lapic_address()
*/
- if (!acpi_lapic)
+ if (!acpi_lapic && !smp_found_config)
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
@@ -1665,10 +1715,7 @@ int __init APIC_init_uniprocessor(void)
}
#endif
-#ifndef CONFIG_SMP
- enable_IR_x2apic();
default_setup_apic_routing();
-#endif
verify_local_APIC();
connect_bsp_APIC();
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
* GSI override for ES7000 platforms.
*/
-static unsigned int base;
static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+ return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(to_cpumask(backtrace_mask)))
+ break;
+ mdelay(1);
+ }
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+ int cpu = smp_processor_id();
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+ arch_spin_lock(&lock);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ dump_stack();
+ arch_spin_unlock(&lock);
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ return NOTIFY_STOP;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+ .notifier_call = arch_trigger_all_cpu_backtrace_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+ register_die_notifier(&backtrace_notifier);
+ return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..7cc0a721f628 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -131,13 +131,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static struct irq_pin_list *alloc_irq_pin_list(int node)
{
- struct irq_pin_list *pin;
-
- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-
- return pin;
+ return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -150,10 +146,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- int count;
- int node;
- int i;
+ int count, node, i;
if (!legacy_pic->nr_legacy_irqs) {
nr_irqs_gsi = 0;
@@ -162,13 +155,15 @@ int __init arch_early_irq_init(void)
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
- node= cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
+
+ /* Make sure the legacy interrupts are marked in the bitmap */
+ irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
for (i = 0; i < count; i++) {
- desc = irq_to_desc(i);
- desc->chip_data = &cfg[i];
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+ set_irq_chip_data(i, &cfg[i]);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
/*
* For legacy IRQ's, start with assigning irq0 to irq15 to
* IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,165 +178,88 @@ int __init arch_early_irq_init(void)
}
#ifdef CONFIG_SPARSE_IRQ
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct irq_cfg *irq_cfg(unsigned int irq)
{
- struct irq_cfg *cfg = NULL;
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
- if (desc)
- cfg = desc->chip_data;
-
- return cfg;
+ return get_irq_chip_data(irq);
}
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
{
struct irq_cfg *cfg;
- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
- if (cfg) {
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
- kfree(cfg);
- cfg = NULL;
- } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
- GFP_ATOMIC, node)) {
- free_cpumask_var(cfg->domain);
- kfree(cfg);
- cfg = NULL;
- }
- }
-
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
}
-int arch_init_chip_data(struct irq_desc *desc, int node)
-{
- struct irq_cfg *cfg;
-
- cfg = desc->chip_data;
- if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(node);
- if (!desc->chip_data) {
- printk(KERN_ERR "can not alloc irq_cfg\n");
- BUG_ON(1);
- }
- }
-
- return 0;
-}
-
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
{
- struct irq_pin_list *old_entry, *head, *tail, *entry;
-
- cfg->irq_2_pin = NULL;
- old_entry = old_cfg->irq_2_pin;
- if (!old_entry)
- return;
-
- entry = get_one_free_irq_2_pin(node);
- if (!entry)
+ if (!cfg)
return;
+ set_irq_chip_data(at, NULL);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
+}
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- head = entry;
- tail = entry;
- old_entry = old_entry->next;
- while (old_entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- entry = head;
- while (entry) {
- head = entry->next;
- kfree(entry);
- entry = head;
- }
- /* still use the old one */
- return;
- }
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- tail->next = entry;
- tail = entry;
- old_entry = old_entry->next;
- }
+#else
- tail->next = NULL;
- cfg->irq_2_pin = head;
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
{
- struct irq_pin_list *entry, *next;
-
- if (old_cfg->irq_2_pin == cfg->irq_2_pin)
- return;
+ return irq_cfgx + irq;
+}
- entry = old_cfg->irq_2_pin;
+static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
- while (entry) {
- next = entry->next;
- kfree(entry);
- entry = next;
- }
- old_cfg->irq_2_pin = NULL;
-}
+#endif
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
{
+ int res = irq_alloc_desc_at(at, node);
struct irq_cfg *cfg;
- struct irq_cfg *old_cfg;
-
- cfg = get_one_free_irq_cfg(node);
-
- if (!cfg)
- return;
- desc->chip_data = cfg;
-
- old_cfg = old_desc->chip_data;
-
- memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
-
- init_copy_irq_2_pin(old_cfg, cfg, node);
-}
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = get_irq_chip_data(at);
+ if (cfg)
+ return cfg;
+ }
-static void free_irq_cfg(struct irq_cfg *old_cfg)
-{
- kfree(old_cfg);
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ set_irq_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
}
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+static int alloc_irq_from(unsigned int from, int node)
{
- struct irq_cfg *old_cfg, *cfg;
-
- old_cfg = old_desc->chip_data;
- cfg = desc->chip_data;
-
- if (old_cfg == cfg)
- return;
-
- if (old_cfg) {
- free_irq_2_pin(old_cfg, cfg);
- free_irq_cfg(old_cfg);
- old_desc->chip_data = NULL;
- }
+ return irq_alloc_desc_from(from, node);
}
-/* end for move_irq_desc */
-#else
-struct irq_cfg *irq_cfg(unsigned int irq)
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ free_irq_cfg(at, cfg);
+ irq_free_desc(at);
}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -446,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -476,7 +394,7 @@ static void ioapic_mask_entry(int apic, int pin)
* fast in the common case, and fast for shared ISA-space IRQs.
*/
static int
-add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
+__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list **last, *entry;
@@ -488,7 +406,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
last = &entry->next;
}
- entry = get_one_free_irq_2_pin(node);
+ entry = alloc_irq_pin_list(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
node, apic, pin);
@@ -503,7 +421,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
- if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+ if (__add_pin_to_irq_node(cfg, node, apic, pin))
panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
}
@@ -566,11 +484,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
static void io_apic_sync(struct irq_pin_list *entry)
{
/*
@@ -582,44 +495,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic(struct irq_cfg *cfg)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void mask_ioapic_irq(struct irq_data *data)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- BUG_ON(!cfg);
+ mask_ioapic(data->chip_data);
+}
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void unmask_ioapic(struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void mask_IO_APIC_irq(unsigned int irq)
+static void unmask_ioapic_irq(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- mask_IO_APIC_irq_desc(desc);
-}
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(data->chip_data);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -689,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void)
struct IO_APIC_route_entry **ioapic_entries;
ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_ATOMIC);
+ GFP_KERNEL);
if (!ioapic_entries)
return 0;
for (apic = 0; apic < nr_ioapics; apic++) {
ioapic_entries[apic] =
kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_ATOMIC);
+ nr_ioapic_registers[apic], GFP_KERNEL);
if (!ioapic_entries[apic])
goto nomem;
}
@@ -1254,7 +1160,6 @@ void __setup_vector_irq(int cpu)
/* Initialize vector_irq on a new cpu */
int irq, vector;
struct irq_cfg *cfg;
- struct irq_desc *desc;
/*
* vector_lock will make sure that we don't run into irq vector
@@ -1263,9 +1168,10 @@ void __setup_vector_irq(int cpu)
*/
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
-
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
+ if (!cfg)
+ continue;
/*
* If it is a legacy IRQ handled by the legacy PIC, this cpu
* will be part of the irq_cfg's domain.
@@ -1322,17 +1228,17 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
{
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
- desc->status |= IRQ_LEVEL;
+ irq_set_status_flags(irq, IRQ_LEVEL);
else
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
- if (irq_remapped(irq)) {
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
if (trigger)
set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
handle_fasteoi_irq,
@@ -1353,10 +1259,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
handle_edge_irq, "edge");
}
-int setup_ioapic_entry(int apic_id, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin)
+static int setup_ioapic_entry(int apic_id, int irq,
+ struct IO_APIC_route_entry *entry,
+ unsigned int destination, int trigger,
+ int polarity, int vector, int pin)
{
/*
* add it to the IO-APIC irq-routing table:
@@ -1377,21 +1283,7 @@ int setup_ioapic_entry(int apic_id, int irq,
if (index < 0)
panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
- memset(&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- /*
- * Trigger mode in the IRTE will always be edge, and the
- * actual level or edge trigger will be setup in the IO-APIC
- * RTE. This will help simplify level triggered irq migration.
- * For more details, see the comments above explainig IO-APIC
- * irq migration in the presence of interrupt-remapping.
- */
- irte.trigger_mode = 0;
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = vector;
- irte.dest_id = IRTE_DEST(destination);
+ prepare_irte(&irte, vector, destination);
/* Set source-id of interrupt request */
set_ioapic_sid(&irte, apic_id);
@@ -1426,18 +1318,14 @@ int setup_ioapic_entry(int apic_id, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
+static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
+ struct irq_cfg *cfg, int trigger, int polarity)
{
- struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
-
- cfg = desc->chip_data;
-
/*
* For legacy irqs, cfg->domain starts with cpu 0 for legacy
* controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1466,9 +1354,9 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
return;
}
- ioapic_register_intr(irq, desc, trigger);
+ ioapic_register_intr(irq, trigger);
if (irq < legacy_pic->nr_legacy_irqs)
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
ioapic_write_entry(apic_id, pin, entry);
}
@@ -1479,11 +1367,9 @@ static struct {
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
- int notcon = 0;
- struct irq_desc *desc;
+ int apic_id, pin, idx, irq, notcon = 0;
+ int node = cpu_to_node(0);
struct irq_cfg *cfg;
- int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1520,19 +1406,17 @@ static void __init setup_IO_APIC_irqs(void)
apic->multi_timer_check(apic_id, irq))
continue;
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
continue;
- }
- cfg = desc->chip_data;
+
add_pin_to_irq_node(cfg, node, apic_id, pin);
/*
* don't mark it in pin_programmed, so later acpi could
* set it correctly when irq < 16
*/
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
+ irq_polarity(idx));
}
if (notcon)
@@ -1547,9 +1431,7 @@ static void __init setup_IO_APIC_irqs(void)
*/
void setup_IO_APIC_irq_extra(u32 gsi)
{
- int apic_id = 0, pin, idx, irq;
- int node = cpu_to_node(boot_cpu_id);
- struct irq_desc *desc;
+ int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
struct irq_cfg *cfg;
/*
@@ -1565,18 +1447,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
return;
irq = pin_2_irq(idx, apic_id, pin);
-#ifdef CONFIG_SPARSE_IRQ
- desc = irq_to_desc(irq);
- if (desc)
+
+ /* Only handle the non legacy irqs on secondary ioapics */
+ if (apic_id == 0 || irq < NR_IRQS_LEGACY)
return;
-#endif
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
return;
- }
- cfg = desc->chip_data;
add_pin_to_irq_node(cfg, node, apic_id, pin);
if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
@@ -1586,7 +1465,7 @@ void setup_IO_APIC_irq_extra(u32 gsi)
}
set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ setup_ioapic_irq(apic_id, pin, irq, cfg,
irq_trigger(idx), irq_polarity(idx));
}
@@ -1637,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
- struct irq_desc *desc;
unsigned int irq;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
@@ -1724,10 +1602,12 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_desc(irq, desc) {
+ for_each_active_irq(irq) {
struct irq_pin_list *entry;
- cfg = desc->chip_data;
+ cfg = get_irq_chip_data(irq);
+ if (!cfg)
+ continue;
entry = cfg->irq_2_pin;
if (!entry)
continue;
@@ -2232,29 +2112,26 @@ static int __init timer_irq_works(void)
* an edge even if it isn't on the 8259A...
*/
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
{
- int was_pending = 0;
+ int was_pending = 0, irq = data->irq;
unsigned long flags;
- struct irq_cfg *cfg;
raw_spin_lock_irqsave(&ioapic_lock, flags);
if (irq < legacy_pic->nr_legacy_irqs) {
- legacy_pic->chip->mask(irq);
+ legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- cfg = irq_cfg(irq);
- __unmask_IO_APIC_irq(cfg);
+ __unmask_ioapic(data->chip_data);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-static int ioapic_retrigger_irq(unsigned int irq)
+static int ioapic_retrigger_irq(struct irq_data *data)
{
-
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_cfg *cfg = data->chip_data;
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2305,7 +2182,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
- if (!irq_remapped(irq))
+ if (!irq_remapped(cfg))
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2315,65 +2192,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
/*
- * Either sets desc->affinity to a valid value, and returns
+ * Either sets data->affinity to a valid value, and returns
* ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves desc->affinity untouched.
+ * leaves data->affinity untouched.
*/
-unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
- unsigned int *dest_id)
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
{
- struct irq_cfg *cfg;
- unsigned int irq;
+ struct irq_cfg *cfg = data->chip_data;
if (!cpumask_intersects(mask, cpu_online_mask))
return -1;
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
+ if (assign_irq_vector(data->irq, data->chip_data, mask))
return -1;
- cpumask_copy(desc->affinity, mask);
+ cpumask_copy(data->affinity, mask);
- *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+ *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
return 0;
}
static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ unsigned int dest, irq = data->irq;
unsigned long flags;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
+ int ret;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = set_desc_affinity(desc, mask, &dest);
+ ret = __ioapic_set_affinity(data, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return ret;
}
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
#ifdef CONFIG_INTR_REMAP
/*
@@ -2388,24 +2246,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
* the interrupt-remapping table entry.
*/
static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return ret;
+ return -EINVAL;
- irq = desc->irq;
if (get_irte(irq, &irte))
- return ret;
+ return -EBUSY;
- cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return ret;
+ return -EBUSY;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2420,29 +2275,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
- cpumask_copy(desc->affinity, mask);
-
+ cpumask_copy(data->affinity, mask);
return 0;
}
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
- const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
#else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
return 0;
}
@@ -2504,10 +2344,8 @@ unlock:
irq_exit();
}
-static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
{
- struct irq_desc *desc = *descp;
- struct irq_cfg *cfg = desc->chip_data;
unsigned me;
if (likely(!cfg->move_in_progress))
@@ -2519,31 +2357,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
send_cleanup_vector(cfg);
}
-static void irq_complete_move(struct irq_desc **descp)
+static void irq_complete_move(struct irq_cfg *cfg)
{
- __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}
void irq_force_complete_move(int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
if (!cfg)
return;
- __irq_complete_move(&desc, cfg->vector);
+ __irq_complete_move(cfg, cfg->vector);
}
#else
-static inline void irq_complete_move(struct irq_desc **descp) {}
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
#endif
-static void ack_apic_edge(unsigned int irq)
+static void ack_apic_edge(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- irq_complete_move(&desc);
- move_native_irq(irq);
+ irq_complete_move(data->chip_data);
+ move_native_irq(data->irq);
ack_APIC_irq();
}
@@ -2565,10 +2400,12 @@ atomic_t irq_mis_count;
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
if (mp_ioapics[entry->apic].apicver >= 0x20) {
/*
@@ -2577,7 +2414,7 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
* intr-remapping table entry. Hence for the io-apic
* EOI we use the pin number.
*/
- if (irq_remapped(irq))
+ if (irq_remapped(cfg))
io_apic_eoi(entry->apic, entry->pin);
else
io_apic_eoi(entry->apic, cfg->vector);
@@ -2586,36 +2423,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
__unmask_and_level_IO_APIC_irq(entry);
}
}
-}
-
-static void eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level(struct irq_data *data)
{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, do_unmask_irq = 0, irq = data->irq;
struct irq_desc *desc = irq_to_desc(irq);
unsigned long v;
- int i;
- struct irq_cfg *cfg;
- int do_unmask_irq = 0;
- irq_complete_move(&desc);
+ irq_complete_move(cfg);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq_desc(desc);
+ mask_ioapic(cfg);
}
#endif
@@ -2651,7 +2474,6 @@ static void ack_apic_level(unsigned int irq)
* we use the above logic (mask+edge followed by unmask+level) from
* Manfred Spraul to clear the remote IRR.
*/
- cfg = desc->chip_data;
i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
@@ -2671,7 +2493,7 @@ static void ack_apic_level(unsigned int irq)
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(irq, cfg);
}
/* Now we can move and renable the irq */
@@ -2702,61 +2524,57 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- cfg = desc->chip_data;
if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
}
#ifdef CONFIG_INTR_REMAP
-static void ir_ack_apic_edge(unsigned int irq)
+static void ir_ack_apic_edge(struct irq_data *data)
{
ack_APIC_irq();
}
-static void ir_ack_apic_level(unsigned int irq)
+static void ir_ack_apic_level(struct irq_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
ack_APIC_irq();
- eoi_ioapic_irq(desc);
+ eoi_ioapic_irq(data->irq, data->chip_data);
}
#endif /* CONFIG_INTR_REMAP */
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = ack_apic_edge,
+ .irq_eoi = ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .irq_set_affinity = ioapic_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
- .eoi = ir_ack_apic_level,
+ .irq_ack = ir_ack_apic_edge,
+ .irq_eoi = ir_ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
+ .irq_set_affinity = ir_ioapic_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static inline void init_IO_APIC_traps(void)
{
- int irq;
- struct irq_desc *desc;
struct irq_cfg *cfg;
+ unsigned int irq;
/*
* NOTE! The local APIC isn't very good at handling
@@ -2769,8 +2587,8 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
+ for_each_active_irq(irq) {
+ cfg = get_irq_chip_data(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
@@ -2781,7 +2599,7 @@ static inline void init_IO_APIC_traps(void)
legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
+ set_irq_chip(irq, &no_irq_chip);
}
}
}
@@ -2790,7 +2608,7 @@ static inline void init_IO_APIC_traps(void)
* The local APIC irq-chip implementation:
*/
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2798,7 +2616,7 @@ static void mask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
{
unsigned long v;
@@ -2806,21 +2624,21 @@ static void unmask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
{
ack_APIC_irq();
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
{
- desc->status &= ~IRQ_LEVEL;
+ irq_clear_status_flags(irq, IRQ_LEVEL);
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -2923,9 +2741,8 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_desc *desc = irq_to_desc(0);
- struct irq_cfg *cfg = desc->chip_data;
- int node = cpu_to_node(boot_cpu_id);
+ struct irq_cfg *cfg = get_irq_chip_data(0);
+ int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2935,7 +2752,7 @@ static inline void __init check_timer(void)
/*
* get/set the timer IRQ vector:
*/
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
assign_irq_vector(0, cfg, apic->target_cpus());
/*
@@ -2994,7 +2811,7 @@ static inline void __init check_timer(void)
add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
- /* for edge trigger, setup_IO_APIC_irq already
+ /* for edge trigger, setup_ioapic_irq already
* leave it unmasked.
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
@@ -3002,12 +2819,12 @@ static inline void __init check_timer(void)
int idx;
idx = find_irq_entry(apic1, pin1, mp_INT);
if (idx != -1 && irq_trigger(idx))
- unmask_IO_APIC_irq_desc(desc);
+ unmask_ioapic(cfg);
}
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
@@ -3030,14 +2847,14 @@ static inline void __init check_timer(void)
*/
replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
setup_nmi();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
goto out;
}
@@ -3045,7 +2862,7 @@ static inline void __init check_timer(void)
* Cleanup, just in case ...
*/
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
@@ -3062,16 +2879,16 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0, desc);
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -3237,49 +3054,42 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want, int node)
+unsigned int create_irq_nr(unsigned int from, int node)
{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
+ struct irq_cfg *cfg;
unsigned long flags;
- struct irq_cfg *cfg_new = NULL;
- struct irq_desc *desc_new = NULL;
-
- irq = 0;
- if (irq_want < nr_irqs_gsi)
- irq_want = nr_irqs_gsi;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_node(new, node);
- if (!desc_new) {
- printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
- }
- cfg_new = desc_new->chip_data;
-
- if (cfg_new->vector != 0)
- continue;
+ unsigned int ret = 0;
+ int irq;
- desc_new = move_irq_desc(desc_new, node);
- cfg_new = desc_new->chip_data;
+ if (from < nr_irqs_gsi)
+ from = nr_irqs_gsi;
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
+ irq = alloc_irq_from(from, node);
+ if (irq < 0)
+ return 0;
+ cfg = alloc_irq_cfg(irq, node);
+ if (!cfg) {
+ free_irq_at(irq, NULL);
+ return 0;
}
- raw_spin_unlock_irqrestore(&vector_lock, flags);
- if (irq > 0)
- dynamic_irq_init_keep_chip_data(irq);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ ret = irq;
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
- return irq;
+ if (ret) {
+ set_irq_chip_data(irq, cfg);
+ irq_clear_status_flags(irq, IRQ_NOREQUEST);
+ } else {
+ free_irq_at(irq, cfg);
+ }
+ return ret;
}
int create_irq(void)
{
- int node = cpu_to_node(boot_cpu_id);
+ int node = cpu_to_node(0);
unsigned int irq_want;
int irq;
@@ -3294,14 +3104,17 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
unsigned long flags;
- dynamic_irq_cleanup_keep_chip_data(irq);
+ irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
- free_irte(irq);
+ if (irq_remapped(cfg))
+ free_irte(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, get_irq_chip_data(irq));
+ __clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_irq_at(irq, cfg);
}
/*
@@ -3325,7 +3138,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
- if (irq_remapped(irq)) {
+ if (irq_remapped(get_irq_chip_data(irq))) {
struct irte irte;
int ir_index;
u16 sub_handle;
@@ -3333,14 +3146,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
ir_index = map_irq_to_irte_handle(irq, &sub_handle);
BUG_ON(ir_index == -1);
- memset (&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
+ prepare_irte(&irte, cfg->vector, dest);
/* Set source-id of interrupt request */
if (pdev)
@@ -3385,26 +3191,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
}
#ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- read_msi_msg_desc(desc, &msg);
+ __get_cached_msi_msg(data->msi_desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg_desc(desc, &msg);
+ __write_msi_msg(data->msi_desc, &msg);
return 0;
}
@@ -3414,17 +3218,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* done in the process context using interrupt-remapping hardware.
*/
static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct irte irte;
if (get_irte(irq, &irte))
return -1;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
irte.vector = cfg->vector;
@@ -3454,27 +3258,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
* which implement the MSI or MSI-X Capability Structure.
*/
static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
+ .irq_set_affinity = msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip msi_ir_chip = {
- .name = "IR-PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
+ .name = "IR-PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
/*
@@ -3506,8 +3310,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
- int ret;
struct msi_msg msg;
+ int ret;
ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
@@ -3516,12 +3320,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
+ if (irq_remapped(get_irq_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
} else
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
@@ -3531,15 +3331,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
return 0;
}
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- unsigned int irq;
- int ret, sub_handle;
+ int node, ret, sub_handle, index = 0;
+ unsigned int irq, irq_want;
struct msi_desc *msidesc;
- unsigned int irq_want;
struct intel_iommu *iommu = NULL;
- int index = 0;
- int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3592,25 +3389,24 @@ error:
return ret;
}
-void arch_teardown_msi_irq(unsigned int irq)
+void native_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
}
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
struct msi_msg msg;
- unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
dmar_msi_read(irq, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
@@ -3626,14 +3422,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
- .ack = ack_apic_edge,
+ .name = "DMAR_MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = dmar_msi_set_affinity,
+ .irq_set_affinity = dmar_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_dmar_msi(unsigned int irq)
@@ -3654,26 +3450,24 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- hpet_msi_read(irq, &msg);
+ hpet_msi_read(data->handler_data, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- hpet_msi_write(irq, &msg);
+ hpet_msi_write(data->handler_data, &msg);
return 0;
}
@@ -3681,34 +3475,33 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
#endif /* CONFIG_SMP */
static struct irq_chip ir_hpet_msi_type = {
- .name = "IR-HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
+ .name = "IR-HPET_MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
+ .irq_ack = ir_ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
+ .irq_set_affinity = ir_msi_set_affinity,
#endif
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
- .ack = ack_apic_edge,
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = hpet_msi_set_affinity,
+ .irq_set_affinity = hpet_msi_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
{
- int ret;
struct msi_msg msg;
- struct irq_desc *desc = irq_to_desc(irq);
+ int ret;
if (intr_remapping_enabled) {
struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3726,9 +3519,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
if (ret < 0)
return ret;
- hpet_msi_write(irq, &msg);
- desc->status |= IRQ_MOVE_PCNTXT;
- if (irq_remapped(irq))
+ hpet_msi_write(get_irq_data(irq), &msg);
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ if (irq_remapped(get_irq_chip_data(irq)))
set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
handle_edge_irq, "edge");
else
@@ -3761,33 +3554,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
- cfg = desc->chip_data;
-
- target_ht_irq(irq, dest, cfg->vector);
-
+ target_ht_irq(data->irq, dest, cfg->vector);
return 0;
}
#endif
static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_apic_edge,
+ .name = "PCI-HT",
+ .irq_mask = mask_ht_irq,
+ .irq_unmask = unmask_ht_irq,
+ .irq_ack = ack_apic_edge,
#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
+ .irq_set_affinity = ht_set_affinity,
#endif
- .retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3860,6 +3650,11 @@ void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
+int get_nr_irqs_gsi(void)
+{
+ return nr_irqs_gsi;
+}
+
#ifdef CONFIG_SPARSE_IRQ
int __init arch_probe_nr_irqs(void)
{
@@ -3878,14 +3673,13 @@ int __init arch_probe_nr_irqs(void)
if (nr < nr_irqs)
nr_irqs = nr;
- return 0;
+ return NR_IRQS_LEGACY;
}
#endif
static int __io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr)
{
- struct irq_desc *desc;
struct irq_cfg *cfg;
int node;
int ioapic, pin;
@@ -3901,13 +3695,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
if (dev)
node = dev_to_node(dev);
else
- node = cpu_to_node(boot_cpu_id);
+ node = cpu_to_node(0);
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ cfg = alloc_irq_and_cfg_at(irq, node);
+ if (!cfg)
return 0;
- }
pin = irq_attr->ioapic_pin;
trigger = irq_attr->trigger;
@@ -3917,15 +3709,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
* IRQs < 16 are already in the irq_2_pin[] map
*/
if (irq >= legacy_pic->nr_legacy_irqs) {
- cfg = desc->chip_data;
- if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+ if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
printk(KERN_INFO "can not add pin %d for irq %d\n",
pin, irq);
return 0;
}
}
- setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+ setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
return 0;
}
@@ -4118,14 +3909,14 @@ void __init setup_ioapic_dest(void)
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
+ mask = desc->irq_data.affinity;
else
mask = apic->target_cpus();
if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ ir_ioapic_set_affinity(&desc->irq_data, mask, false);
else
- set_ioapic_affinity_irq_desc(desc, mask);
+ ioapic_set_affinity(&desc->irq_data, mask, false);
}
}
@@ -4309,19 +4100,18 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
void __init pre_init_apic_IRQ0(void)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
printk(KERN_INFO "Early APIC setup for system timer0\n");
#ifndef CONFIG_SMP
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
#endif
- desc = irq_to_desc_alloc_node(0, 0);
+ /* Make sure the irq descriptor is set up */
+ cfg = alloc_irq_and_cfg_at(0, 0);
setup_local_APIC();
- cfg = irq_cfg(0);
add_pin_to_irq_node(cfg, 0, 0, 0);
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
- setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+ setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..c90041ccb742 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void)
error:
if (nmi_watchdog == NMI_IO_APIC) {
if (!timer_through_8259)
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
int cpu = smp_processor_id();
int rc = 0;
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
sum = get_timer_irqs(cpu);
if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161c..960f26ab5c9f 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
#include <linux/nodemask.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
@@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
node_end_pfn[node] =
MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
- e820_register_active_regions(node, node_start_pfn[node],
+ memblock_x86_register_active_regions(node, node_start_pfn[node],
node_end_pfn[node]);
memory_present(node, node_start_pfn[node], node_end_pfn[node]);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..f9e4e6a54073 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
*/
void __init default_setup_apic_routing(void)
{
+
+ enable_IR_x2apic();
+
#ifdef CONFIG_X86_X2APIC
if (x2apic_mode
#ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..ed4118de249e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -41,6 +41,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
static u64 gru_start_paddr, gru_end_paddr;
+static union uvh_apicid uvh_apicid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
static DEFINE_SPINLOCK(uv_nmi_lock);
@@ -70,12 +71,27 @@ static int early_get_nodeid(void)
return node_id.s.node_id;
}
+static void __init early_get_apic_pnode_shift(void)
+{
+ unsigned long *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
+ uvh_apicid.v = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ if (!uvh_apicid.v)
+ /*
+ * Old bios, use default value
+ */
+ uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
+}
+
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
int nodeid;
if (!strcmp(oem_id, "SGI")) {
nodeid = early_get_nodeid();
+ early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
if (!strcmp(oem_table_id, "UVL"))
@@ -84,7 +100,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
__get_cpu_var(x2apic_extra_bits) =
- nodeid << (UV_APIC_PNODE_SHIFT - 1);
+ nodeid << (uvh_apicid.s.pnode_shift - 1);
uv_system_type = UV_NON_UNIQUE_APIC;
return 1;
}
@@ -604,6 +620,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
if (reason != DIE_NMI_IPI)
return NOTIFY_OK;
+
+ if (in_crash_kexec)
+ /* do nothing if entering the crash kernel */
+ return NOTIFY_OK;
/*
* Use a lock so only one cpu prints at a time
* to prevent intermixed output.
@@ -694,9 +714,11 @@ void __init uv_system_init(void)
for (j = 0; j < 64; j++) {
if (!test_bit(j, &present))
continue;
- uv_blade_info[blade].pnode = (i * 64 + j);
+ pnode = (i * 64 + j);
+ uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ max_pnode = max(pnode, max_pnode);
blade++;
}
}
@@ -710,6 +732,10 @@ void __init uv_system_init(void)
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
nid = cpu_to_node(cpu);
+ /*
+ * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
+ */
+ uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -734,7 +760,6 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
- max_pnode = max(pnode, max_pnode);
}
/* Add blade/pnode info for nodes without cpus */
@@ -746,7 +771,6 @@ void __init uv_system_init(void)
pnode = (paddr >> m_val) & pnode_mask;
blade = boot_pnode_to_blade(pnode);
uv_node_to_blade[nid] = blade;
- max_pnode = max(pnode, max_pnode);
}
map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3ac..0e4f24c2a746 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
* is now the way life works).
* Fix thinko in suspend() (wrong return).
* Notify drivers on critical suspend.
- * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
* modified by sfr).
* Disable interrupts while we are suspended (Andy Henroid
* <andy_henroid@yahoo.com> fixed by sfr).
@@ -189,8 +189,8 @@
* Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
*
* [This document is available free from Intel by calling 800.628.8686 (fax
- * 916.356.6100) or 800.548.4725; or via anonymous ftp from
- * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
* available from Microsoft by calling 206.882.8080.]
*
* APM 1.2 Reference:
@@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = {
.unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
+ .llseek = noop_llseek,
};
static struct miscdevice apm_device = {
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..1a4088dda37a 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -99,9 +99,7 @@ void foo(void)
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
- DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
- DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
- DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
+ DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..13a389179514 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
-#include <asm/e820.h>
+#include <linux/memblock.h>
+
#include <asm/proto.h>
/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
static unsigned __read_mostly corruption_check_size = 64*1024;
static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static struct scan_area {
+ u64 addr;
+ u64 size;
+} scan_areas[MAX_SCAN_AREAS];
static int num_scan_areas;
-
static __init int set_corruption_check(char *arg)
{
char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+ addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
- if (!(addr + 1))
+ if (addr == MEMBLOCK_ERROR)
break;
if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
if ((addr + size) > corruption_check_size)
size = corruption_check_size - addr;
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+ memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
scan_areas[num_scan_areas].addr = addr;
scan_areas[num_scan_areas].size = size;
num_scan_areas++;
@@ -105,7 +108,6 @@ void __init setup_bios_corruption_check(void)
printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
num_scan_areas);
- update_e820();
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
+obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..9e093f8fe78c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
#endif
/*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
{
- unsigned long long value;
- u32 nodes, cores_per_node;
+ u32 nodes;
+ u8 node_id;
int cpu = smp_processor_id();
- if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
- return;
+ /* get information required for multi-node processors */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ u32 eax, ebx, ecx, edx;
- /* fixup topology information only once for a core */
- if (cpu_has(c, X86_FEATURE_AMD_DCM))
- return;
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+ nodes = ((ecx >> 8) & 7) + 1;
+ node_id = ecx & 7;
- rdmsrl(MSR_FAM10H_NODE_ID, value);
+ /* get compute unit information */
+ smp_num_siblings = ((ebx >> 8) & 3) + 1;
+ c->compute_unit_id = ebx & 0xff;
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
- nodes = ((value >> 3) & 7) + 1;
- if (nodes == 1)
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes = ((value >> 3) & 7) + 1;
+ node_id = value & 7;
+ } else
return;
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
+ /* fixup multi-node processor information */
+ if (nodes > 1) {
+ u32 cores_per_node;
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = value & 7;
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ cores_per_node = c->x86_max_cores / nodes;
- /* fixup core id to be in range from 0 to (cores_per_node - 1) */
- c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ /* store NodeID, use llc_shared_map to store sibling info */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+
+ /* core id to be in range from 0 to (cores_per_node - 1) */
+ c->cpu_core_id = c->cpu_core_id % cores_per_node;
+ }
}
#endif
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- /* fixup topology information on multi-node processors */
- if ((c->x86 == 0x10) && (c->x86_model == 9))
- amd_fixup_dcm(c);
+ amd_get_topology(c);
#endif
}
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* We need to do the following only once */
+ if (c != &boot_cpu_data)
+ return;
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ printk(KERN_WARNING FW_BUG "TSC doesn't count "
+ "with P0 frequency!\n");
+ }
+ }
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -466,7 +495,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
- if (c->x86 == 0x10 || c->x86 == 0x11)
+ if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
@@ -523,13 +552,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
#endif
if (c->extended_cpuid_level >= 0x80000006) {
- if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+ if (cpuid_edx(0x80000006) & 0xf000)
num_cache_leaves = 4;
else
num_cache_leaves = 3;
}
- if (c->x86 >= 0xf && c->x86 <= 0x11)
+ if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
@@ -546,7 +575,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
fam10h_check_enable_mmcfg();
}
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ if (c == &boot_cpu_data && c->x86 >= 0xf) {
unsigned long long tseg;
/*
@@ -609,3 +638,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
};
cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+ struct cpuinfo_x86 *cpu = &current_cpu_data;
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ /*
+ * If called early enough that current_cpu_data hasn't been initialized
+ * yet, fall back to boot_cpu_data.
+ */
+ if (cpu->x86 == 0)
+ cpu = &boot_cpu_data;
+
+ if (cpu->x86_vendor != X86_VENDOR_AMD)
+ return false;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 4) | cpu->x86_mask;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..4b68bda30938 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
return 1;
}
__setup("noxsave", x86_xsave_setup);
+static int __init x86_xsaveopt_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[4] = excap;
}
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+ if (eax > 0)
+ c->x86_capability[9] = ebx;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ init_scattered_cpuid_features(c);
}
static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -646,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_early_init(c);
#ifdef CONFIG_SMP
- c->cpu_index = boot_cpu_id;
+ c->cpu_index = 0;
#endif
filter_cpuid_features(c, false);
}
@@ -685,16 +704,21 @@ void __init early_cpu_init(void)
}
/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work. Disable it completely
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
* unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
*/
static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
}
static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -731,7 +755,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- init_scattered_cpuid_features(c);
detect_nopl(c);
}
@@ -1192,6 +1215,7 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
+ xsave_init();
raw_local_save_flags(kernel_eflags);
@@ -1245,19 +1269,7 @@ void __cpuinit cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- /*
- * Force FPU initialization:
- */
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
-
- /*
- * Boot processor to setup the FP and extended state context info.
- */
- if (smp_processor_id() == boot_cpu_id)
- init_thread_xstate();
-
+ fpu_init();
xsave_init();
}
#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,8 @@ struct cpu_dev {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1d3cddaa40ee..a2baafb2fe6d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
#include <linux/compiler.h>
#include <linux/dmi.h>
#include <linux/slab.h>
-#include <trace/events/power.h>
#include <linux/acpi.h>
#include <linux/io.h>
@@ -73,7 +72,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
static struct cpufreq_driver acpi_cpufreq_driver;
@@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -351,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
freqs.old = perf->states[perf->state].core_frequency * 1000;
freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -367,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -704,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
per_cpu(acfreq_data, policy->cpu) = NULL;
acpi_processor_unregister_performance(data->acpi_data,
policy->cpu);
+ kfree(data->freq_table);
kfree(data);
}
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 733093d60436..141abebc4516 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = {
* Detects nForce2 A2 and C1 stepping
*
*/
-static unsigned int nforce2_detect_chipset(void)
+static int nforce2_detect_chipset(void)
{
nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
PCI_DEVICE_ID_NVIDIA_NFORCE2,
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
* Low Level chipset interface *
****************************************************************/
static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
- PCI_ANY_ID, PCI_ANY_ID },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
{ 0, },
};
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
}
/* detect which companion chip is used */
- while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
+ for_each_pci_dev(gx_pci) {
if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
return gx_pci;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
}
-static int __init longhaul_get_ranges(void)
+static int __cpuinit longhaul_get_ranges(void)
{
unsigned int i, j, k = 0;
unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
}
-static void __init longhaul_setup_voltagescaling(void)
+static void __cpuinit longhaul_setup_voltagescaling(void)
{
union msr_longhaul longhaul;
struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
return 0;
}
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
{
struct cpuinfo_x86 *c = &cpu_data(0);
char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
/*
* VIA C3 Samuel 1 & Samuel 2 (stepping 0)
*/
-static const int __initdata samuel1_mults[16] = {
+static const int __cpuinitdata samuel1_mults[16] = {
-1, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
-1, /* 1111 -> RESERVED */
};
-static const int __initdata samuel1_eblcr[16] = {
+static const int __cpuinitdata samuel1_eblcr[16] = {
50, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
/*
* VIA C3 Samuel2 Stepping 1->15
*/
-static const int __initdata samuel2_eblcr[16] = {
+static const int __cpuinitdata samuel2_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
/*
* VIA C3 Ezra
*/
-static const int __initdata ezra_mults[16] = {
+static const int __cpuinitdata ezra_mults[16] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
120, /* 1111 -> 12.0x */
};
-static const int __initdata ezra_eblcr[16] = {
+static const int __cpuinitdata ezra_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
/*
* VIA C3 (Ezra-T) [C5M].
*/
-static const int __initdata ezrat_mults[32] = {
+static const int __cpuinitdata ezrat_mults[32] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
-1, /* 1111 -> RESERVED (12.0x) */
};
-static const int __initdata ezrat_eblcr[32] = {
+static const int __cpuinitdata ezrat_eblcr[32] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
/*
* VIA C3 Nehemiah */
-static const int __initdata nehemiah_mults[32] = {
+static const int __cpuinitdata nehemiah_mults[32] = {
100, /* 0000 -> 10.0x */
-1, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
-1, /* 1111 -> 12.0x */
};
-static const int __initdata nehemiah_eblcr[32] = {
+static const int __cpuinitdata nehemiah_eblcr[32] = {
50, /* 0000 -> 5.0x */
160, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
unsigned short pos;
};
-static const struct mV_pos __initdata vrm85_mV[32] = {
+static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
{1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
{1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
{1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
{1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
};
-static const unsigned char __initdata mV_vrm85[32] = {
+static const unsigned char __cpuinitdata mV_vrm85[32] = {
0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
};
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
+static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
{1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
{1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
{1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
{675, 3}, {650, 2}, {625, 1}, {600, 0}
};
-static const unsigned char __initdata mV_mobilevrm[32] = {
+static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..d9f51367666b 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq;
* Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
* and MSR_TMTA_LONGRUN_CTRL
*/
-static void __init longrun_get_policy(struct cpufreq_policy *policy)
+static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
{
u32 msr_lo, msr_hi;
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
* TMTA rules:
* performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
*/
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
+static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
+ unsigned int *high_freq)
{
u32 msr_lo, msr_hi;
u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
}
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
{
int result = 0;
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
}
- if (c->x86 != 0xF) {
- if (!cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Unknown CPU. "
- "Please send an e-mail to "
- "<cpufreq@vger.kernel.org>\n");
+ if (c->x86 != 0xF)
return 0;
- }
/* on P-4s, the TSC runs with constant frequency independent whether
* throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
u32 output_offset;
};
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
{
@@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
return -ENODEV;
out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER)
- return -ENODEV;
+ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ ret = -ENODEV;
+ goto out_free;
+ }
errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors)
- return -ENODEV;
+ if (errors) {
+ ret = -ENODEV;
+ goto out_free;
+ }
supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1))
- return -ENODEV;
+ if (!(supported & 0x1)) {
+ ret = -ENODEV;
+ goto out_free;
+ }
out_free:
kfree(output.pointer);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
* We will then get the same kind of behaviour already tested under
* the "well-known" other OS.
*/
-static int __init fixup_sgtc(void)
+static int __cpuinit fixup_sgtc(void)
{
unsigned int sgtc;
unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
}
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
+static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
{
printk(KERN_WARNING PFX
"%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
* A BIOS update is all that can save them.
* Mention this, and disable cpufreq.
*/
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
+static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
{
.callback = acer_cpufreq_pst,
.ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
{ }
};
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
{
union msr_fidvidstatus fidvidstatus;
int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3e90cce3dc8b..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -9,7 +9,7 @@
* Based on the powernow-k7.c module written by Dave Jones.
* (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@suse.cz>
+ * (C) 2004 Pavel Machek <pavel@ucw.cz>
* Licensed under the terms of the GNU GPL License version 2.
* Based upon datasheets & sample CPUs kindly provided by AMD.
*
@@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
* www.amd.com
*/
printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
+ printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
+ " and Cool'N'Quiet support is enabled in BIOS setup\n");
return -ENODEV;
}
@@ -910,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
{
int i;
u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
- data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+ rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
+ data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
for (i = 0; i < data->acpi_data.state_count; i++) {
u32 index;
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
};
const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 85f69cdeae10..d16c2c53d6bf 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
c->cpuid_level = cpuid_eax(0);
+ get_cpu_cap(c);
}
}
@@ -169,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -283,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE)
- node = first_node(node_online_map);
- else if (!node_online(node)) {
+ if (node == NUMA_NO_NODE || !node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..17ad03366211 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
#include <asm/processor.h>
#include <linux/smp.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/smp.h>
#define LVL_1_INST 1
@@ -306,7 +306,7 @@ struct _cache_attr {
ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
};
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
/*
* L3 cache descriptors
@@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
@@ -347,8 +348,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
return l3;
}
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+ int index)
{
int node;
@@ -369,7 +370,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
return;
/* not in virtualized environments */
- if (num_k8_northbridges == 0)
+ if (k8_northbridges.num == 0)
return;
/*
@@ -377,7 +378,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
* never freed but this is done only on shutdown so it doesn't matter.
*/
if (!l3_caches) {
- int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+ int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
l3_caches = kzalloc(size, GFP_ATOMIC);
if (!l3_caches)
@@ -396,20 +397,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
this_leaf->l3 = l3_caches[node];
}
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
unsigned int slot)
{
- struct pci_dev *dev = this_leaf->l3->dev;
- unsigned int reg = 0;
+ int index;
if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- if (!dev)
- return -EINVAL;
+ index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+ if (index >= 0)
+ return sprintf(buf, "%d\n", index);
- pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
- return sprintf(buf, "0x%08x\n", reg);
+ return sprintf(buf, "FREE\n");
}
#define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +471,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
}
}
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+ unsigned long index)
{
- struct pci_dev *dev = this_leaf->l3->dev;
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- unsigned long val = 0;
+ int ret = 0;
#define SUBCACHE_MASK (3UL << 20)
#define SUBCACHE_INDEX 0xfff
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ /*
+ * check whether this slot is already used or
+ * the index is already disabled
+ */
+ ret = amd_get_l3_disable_slot(l3, slot);
+ if (ret >= 0)
return -EINVAL;
+ /*
+ * check whether the other slot has disabled the
+ * same index already
+ */
+ if (index == amd_get_l3_disable_slot(l3, !slot))
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((index & SUBCACHE_INDEX) > l3->indices))
+ return -EINVAL;
+
+ amd_l3_disable_index(l3, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
+{
+ unsigned long val = 0;
+ int cpu, err = 0;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!dev)
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
+ cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- /* do not allow writes outside of allowed bits */
- if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
- amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+ err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ printk(KERN_WARNING "L3 disable slot %d in use!\n",
+ slot);
+ return err;
+ }
return count;
}
@@ -500,12 +557,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
show_cache_disable_1, store_cache_disable_1);
-#else /* CONFIG_CPU_SUP_AMD */
+#else /* CONFIG_AMD_NB */
static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
{
};
-#endif /* CONFIG_CPU_SUP_AMD */
+#endif /* CONFIG_AMD_NB */
static int
__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -518,7 +575,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_check_l3_disable(index, this_leaf);
+ amd_check_l3_disable(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
@@ -944,7 +1001,7 @@ static struct attribute *default_attrs[] = {
static struct attribute *default_l3_attrs[] = {
DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
&cache_disable_0.attr,
&cache_disable_1.attr,
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be89..8209472b27a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
rcd.hdr.revision = CPER_RECORD_REV;
rcd.hdr.signature_end = CPER_SIG_END;
rcd.hdr.section_count = 1;
- rcd.hdr.error_severity = CPER_SER_FATAL;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
rcd.hdr.validation_bits = 0;
rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
rcd.sec_hdr.validation_bits = 0;
rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
- rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
memcpy(&rcd.mce, m, sizeof(*m));
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
.release = seq_release,
.read = seq_read,
.write = severities_coverage_write,
+ .llseek = seq_lseek,
};
static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 18cc42562250..7a35b72d7c03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
static DEFINE_MUTEX(mce_read_mutex);
#define rcu_dereference_check_mce(p) \
- rcu_dereference_check((p), \
+ rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
lockdep_is_held(&mce_read_mutex))
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
static int default_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
- pr_emerg("No human readable MCE decoding support on this CPU type.\n");
- pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+ pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
return NOTIFY_STOP;
}
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
- pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
- pr_emerg("RIP%s %02x:<%016Lx> ",
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
pr_cont("\n");
}
- pr_emerg("TSC %llx ", m->tsc);
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
if (m->addr)
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
pr_cont("\n");
- pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
/*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
-static void print_mce_head(void)
-{
- pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- pr_emerg("This is not a software problem!\n");
-}
-
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
if (atomic_inc_return(&mce_fake_paniced) > 1)
return;
}
- print_mce_head();
/* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
apei_err = apei_write_mce(final);
}
if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mce_panic_timeout;
panic(msg);
} else
- printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */
@@ -600,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
add_taint(TAINT_MACHINE_CHECK);
}
@@ -1220,7 +1209,7 @@ int mce_notify_irq(void)
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
+ pr_info(HW_ERR "Machine check events logged\n");
return 1;
}
@@ -1676,6 +1665,7 @@ struct file_operations mce_chrdev_ops = {
.read = mce_read,
.poll = mce_poll,
.unlocked_ioctl = mce_ioctl,
+ .llseek = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..80c482382d5c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
struct thresh_restart tr;
- u8 lvt_off;
+ int lvt_off = -1;
+ u8 offset;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -141,6 +142,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
+
address += MCG_XBLK_ADDR;
} else
++address;
@@ -148,12 +150,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (rdmsr_safe(address, &low, &high))
break;
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
+ if (!(high & MASK_VALID_HI))
+ continue;
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
@@ -165,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
- lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0);
+ offset = (high & MASK_LVTOFF_HI) >> 20;
+ if (lvt_off < 0) {
+ if (setup_APIC_eilvt(offset,
+ THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0)) {
+ pr_err(FW_BUG "cpu %d, failed to "
+ "setup threshold interrupt "
+ "for bank %d, block %d "
+ "(MSR%08X=0x%x%08x)",
+ smp_processor_id(), bank, block,
+ address, high, low);
+ continue;
+ }
+ lvt_off = offset;
+ } else if (lvt_off != offset) {
+ pr_err(FW_BUG "cpu %d, invalid threshold "
+ "interrupt offset %d for bank %d,"
+ "block %d (MSR%08X=0x%x%08x)",
+ smp_processor_id(), lvt_off, bank,
+ block, address, high, low);
+ continue;
+ }
high &= ~MASK_LVTOFF_HI;
high |= lvt_off << 20;
@@ -530,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
- if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
kfree(b);
err = -ENOMEM;
goto out;
@@ -543,7 +561,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, c->llc_shared_map);
+ cpumask_set_cpu(cpu, b->cpus);
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val |= CMCI_EN | CMCI_THRESHOLD;
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..4b683267eca5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
/*
- * Current thermal throttling state:
+ * Current thermal event state:
*/
-struct thermal_state {
- bool is_throttled;
-
+struct _thermal_state {
+ bool new_event;
+ int event;
u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
+ unsigned long count;
+ unsigned long last_count;
+};
+
+struct thermal_state {
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
};
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+ static SYSDEV_ATTR(_name, 0444, \
+ therm_throt_sysdev_show_##_name, \
+ NULL) \
-#define define_therm_throt_sysdev_show_func(name) \
+#define define_therm_throt_sysdev_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##name( \
+static ssize_t therm_throt_sysdev_show_##event##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
+ if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);
+
+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
+ &attr_core_throttle_count.attr,
NULL
};
-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(bool is_throttled)
+static int therm_throt_process(bool new_event, int event, int level)
{
- struct thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- this_cpu = smp_processor_id();
now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return 0;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return 0;
+ } else
+ return 0;
- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
+ old_event = state->new_event;
+ state->new_event = new_event;
- if (is_throttled)
- state->throttle_count++;
+ if (new_event)
+ state->count++;
if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
+ state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
+ state->last_count = state->count;
/* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ else
+ printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
add_taint(TAINT_MACHINE_CHECK);
return 1;
}
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ else
+ printk(KERN_INFO "CPU%d: %s power limit normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
@@ -147,15 +202,36 @@ static int therm_throt_process(bool is_throttled)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
+static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+ unsigned int cpu)
{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+ }
+
+ return err;
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -177,7 +253,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev);
+ err = thermal_throttle_add_dev(sys_dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -213,7 +289,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+ err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -226,19 +302,55 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED (0)
+#define CORE_POWER_LIMIT ((__u64)1 << 62)
+#define PACKAGE_THROTTLED ((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
- mce_log_therm_throt_event(msr_val);
+
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+ | msr_val);
+ }
}
static void unexpected_thermal_interrupt(void)
{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+ printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
}
@@ -335,8 +447,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
smp_thermal_vector = intel_thermal_interrupt;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <asm/mshyperv.h>
struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
static bool __init ms_hyperv_platform(void)
{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..9f27228ceffd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
}
}
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask >>= PAGE_SHIFT;
+ mask |= size_or_mask;
+ size = -mask;
+ size <<= PAGE_SHIFT;
+ return size;
+}
+
/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Check and return the effective type for MTRR-MTRR type overlap.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+ if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+ (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+ *prev = MTRR_TYPE_WRTHROUGH;
+ *curr = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (*prev != *curr) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Error/Semi-error returns:
+ * 0xFF - when MTRR is not enabled
+ * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
+ * corresponds only to [start:*partial_end].
+ * Caller has to lookup again for [*partial_end:end].
+ */
+static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
+ *repeat = 0;
if (!mtrr_state_set)
return 0xFF;
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
+
+ if (start_state != end_state) {
+ /*
+ * We have start:end spanning across an MTRR.
+ * We split the region into
+ * either
+ * (start:mtrr_end) (mtrr_end:end)
+ * or
+ * (start:mtrr_start) (mtrr_start:end)
+ * depending on kind of overlap.
+ * Return the type for first region and a pointer to
+ * the start of second region so that caller will
+ * lookup again on the second region.
+ * Note: This way we handle multiple overlaps as well.
+ */
+ if (start_state)
+ *partial_end = base + get_mtrr_size(mask);
+ else
+ *partial_end = base;
+
+ if (unlikely(*partial_end <= start)) {
+ WARN_ON(1);
+ *partial_end = start + PAGE_SIZE;
+ }
+
+ end = *partial_end - 1; /* end is inclusive */
+ *repeat = 1;
+ }
if ((start & mask) != (base & mask))
continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
continue;
}
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE) {
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
+ if (check_type_overlap(&prev_match, &curr_match))
+ return curr_match;
}
if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
return mtrr_state.def_type;
}
+/*
+ * Returns the effective MTRR type for the region
+ * Error return:
+ * 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ u8 type, prev_type;
+ int repeat;
+ u64 partial_end;
+
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ /*
+ * Common path is with repeat = 0.
+ * However, we can have cases where [start:end] spans across some
+ * MTRR range. Do repeated lookups for that case here.
+ */
+ while (repeat) {
+ prev_type = type;
+ start = partial_end;
+ type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+ if (check_type_overlap(&prev_type, &type))
+ return type;
+ }
+
+ return type;
+}
+
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -433,13 +517,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
unsigned int tmp, hi;
- int cpu;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
- cpu = get_cpu();
+ get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
mtrr_type smp_type;
};
+static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
+
/**
- * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static void ipi_handler(void *info)
+static int mtrr_work_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
unsigned long flags;
+ atomic_dec(&data->count);
+ while (!atomic_read(&data->gate))
+ cpu_relax();
+
local_irq_save(flags);
atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
+ while (atomic_read(&data->gate))
cpu_relax();
/* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
}
atomic_dec(&data->count);
- while (atomic_read(&data->gate))
+ while (!atomic_read(&data->gate))
cpu_relax();
atomic_dec(&data->count);
local_irq_restore(flags);
#endif
+ return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
- * 1. Send IPI to do the following:
+ * 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 15. Enable interrupts.
*
* What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
+ * of CPUs. As each CPU announces that it started the rendezvous handler by
+ * decrementing the count, We reset data.count and set the data.gate flag
+ * allowing all the cpu's to proceed with the work. As each cpu disables
+ * interrupts, it'll decrement data.count once. We wait until it hits 0 and
+ * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
+ * are waiting for that flag to be cleared. Once it's cleared, each
* CPU goes through the transition of updating MTRRs.
* The CPU vendors may each do it differently,
* so we call mtrr_if->set() callback and let them take care of it.
* When they're done, they again decrement data->count and wait for data.gate
- * to be reset.
+ * to be set.
* When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
* Everyone then enables interrupts and we all continue on.
*
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
{
struct set_mtrr_data data;
unsigned long flags;
+ int cpu;
+
+ preempt_disable();
data.smp_reg = reg;
data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.gate, 0);
/* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
+ for_each_online_cpu(cpu) {
+ struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
+
+ if (cpu == smp_processor_id())
+ continue;
+
+ stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
+ }
- local_irq_save(flags);
while (atomic_read(&data.count))
cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
smp_wmb();
atomic_set(&data.gate, 1);
+ local_irq_save(flags);
+
+ while (atomic_read(&data.count))
+ cpu_relax();
+
+ /* Ok, reset count and toggle gate */
+ atomic_set(&data.count, num_booting_cpus() - 1);
+ smp_wmb();
+ atomic_set(&data.gate, 0);
+
/* Do our MTRR business */
/*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.count, num_booting_cpus() - 1);
smp_wmb();
- atomic_set(&data.gate, 0);
+ atomic_set(&data.gate, 1);
/*
* Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
cpu_relax();
local_irq_restore(flags);
+ preempt_enable();
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..ed6310183efb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -49,7 +49,6 @@ static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
unsigned long size, len = 0;
struct page *page;
void *map;
@@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
offset = addr & (PAGE_SIZE - 1);
size = min(PAGE_SIZE - offset, n - len);
- map = kmap_atomic(page, type);
+ map = kmap_atomic(page);
memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
+ kunmap_atomic(map);
put_page(page);
len += size;
@@ -102,6 +101,7 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events;
@@ -220,6 +220,7 @@ struct x86_pmu {
struct perf_event *event);
struct event_constraint *event_constraints;
void (*quirks)(void);
+ int perfctr_second_write;
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
@@ -236,6 +237,7 @@ struct x86_pmu {
* Intel DebugStore bits
*/
int bts, pebs;
+ int bts_active, pebs_active;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
@@ -295,10 +297,10 @@ x86_perf_event_update(struct perf_event *event)
* count to the generic event atomically:
*/
again:
- prev_raw_count = atomic64_read(&hwc->prev_count);
+ prev_raw_count = local64_read(&hwc->prev_count);
rdmsrl(hwc->event_base + idx, new_raw_count);
- if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;
@@ -313,8 +315,8 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- atomic64_add(delta, &event->count);
- atomic64_sub(delta, &hwc->period_left);
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
return new_raw_count;
}
@@ -379,7 +381,7 @@ static void release_pmc_hardware(void) {}
#endif
-static int reserve_ds_buffers(void);
+static void reserve_ds_buffers(void);
static void release_ds_buffers(void);
static void hw_perf_event_destroy(struct perf_event *event)
@@ -438,7 +440,7 @@ static int x86_setup_perfctr(struct perf_event *event)
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
+ local64_set(&hwc->period_left, hwc->sample_period);
} else {
/*
* If we have a PMU initialized but no APIC
@@ -476,7 +478,7 @@ static int x86_setup_perfctr(struct perf_event *event)
if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
(hwc->sample_period == 1)) {
/* BTS is not supported by this architecture. */
- if (!x86_pmu.bts)
+ if (!x86_pmu.bts_active)
return -EOPNOTSUPP;
/* BTS is currently only allowed for user-mode. */
@@ -495,12 +497,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
int precise = 0;
/* Support for constant skid */
- if (x86_pmu.pebs)
+ if (x86_pmu.pebs_active) {
precise++;
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr)
- precise++;
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr)
+ precise++;
+ }
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@@ -529,7 +532,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
/*
* Setup the hardware configuration for a given attr_type
*/
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
{
int err;
@@ -542,11 +545,8 @@ static int __hw_perf_event_init(struct perf_event *event)
if (atomic_read(&active_events) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
- else {
- err = reserve_ds_buffers();
- if (err)
- release_pmc_hardware();
- }
+ else
+ reserve_ds_buffers();
}
if (!err)
atomic_inc(&active_events);
@@ -582,7 +582,7 @@ static void x86_pmu_disable_all(void)
}
}
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -617,7 +617,7 @@ static void x86_pmu_enable_all(int added)
}
}
-static const struct pmu pmu;
+static struct pmu pmu;
static inline int is_x86_event(struct perf_event *event)
{
@@ -799,10 +799,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
hwc->last_tag == cpuc->tags[i];
}
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct perf_event *event;
@@ -838,7 +838,14 @@ void hw_perf_enable(void)
match_prev_assignment(hwc, cpuc, i))
continue;
- x86_pmu_stop(event);
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
}
for (i = 0; i < cpuc->n_events; i++) {
@@ -850,7 +857,10 @@ void hw_perf_enable(void)
else if (i < n_running)
continue;
- x86_pmu_start(event);
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
perf_events_lapic_init();
@@ -885,7 +895,7 @@ static int
x86_perf_event_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- s64 left = atomic64_read(&hwc->period_left);
+ s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
@@ -897,14 +907,14 @@ x86_perf_event_set_period(struct perf_event *event)
*/
if (unlikely(left <= -period)) {
left = period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
if (unlikely(left <= 0)) {
left += period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
@@ -923,10 +933,19 @@ x86_perf_event_set_period(struct perf_event *event)
* The hw event starts counting from this event offset,
* mark it to be able to extra future deltas:
*/
- atomic64_set(&hwc->prev_count, (u64)-left);
+ local64_set(&hwc->prev_count, (u64)-left);
- wrmsrl(hwc->event_base + idx,
+ wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
+
+ /*
+ * Due to erratum on certan cpu we need
+ * a second write to be sure the register
+ * is updated properly
+ */
+ if (x86_pmu.perfctr_second_write) {
+ wrmsrl(hwc->event_base + idx,
(u64)(-left) & x86_pmu.cntval_mask);
+ }
perf_event_update_userpage(event);
@@ -942,15 +961,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
}
/*
- * activate a single event
+ * Add a single event to the PMU.
*
* The event is added to the group of enabled events
* but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
*/
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc;
@@ -959,57 +975,67 @@ static int x86_pmu_enable(struct perf_event *event)
hwc = &event->hw;
+ perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
- n = collect_events(cpuc, event, false);
- if (n < 0)
- return n;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be peformed
- * at commit time(->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole
*/
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
- goto out;
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
- return ret;
+ goto out;
/*
* copy new assignment, now we know it is possible
* will be used by hw_perf_enable()
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
-out:
+done_collect:
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
- return 0;
+ ret = 0;
+out:
+ perf_pmu_enable(event->pmu);
+ return ret;
}
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx = event->hw.idx;
- if (idx == -1)
- return -EAGAIN;
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ x86_perf_event_set_period(event);
+ }
+
+ event->hw.state = 0;
- x86_perf_event_set_period(event);
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
+ __set_bit(idx, cpuc->running);
x86_pmu.enable(event);
perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
- int ret = x86_pmu_start(event);
- WARN_ON_ONCE(ret);
}
void perf_event_print_debug(void)
@@ -1066,27 +1092,29 @@ void perf_event_print_debug(void)
local_irq_restore(flags);
}
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
-
- if (!__test_and_clear_bit(idx, cpuc->active_mask))
- return;
- x86_pmu.disable(event);
-
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- x86_perf_event_update(event);
+ if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+ x86_pmu.disable(event);
+ cpuc->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
- cpuc->events[idx] = NULL;
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ x86_perf_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
}
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int i;
@@ -1096,10 +1124,10 @@ static void x86_pmu_disable(struct perf_event *event)
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+ if (cpuc->group_flag & PERF_EVENT_TXN)
return;
- x86_pmu_stop(event);
+ x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i]) {
@@ -1122,7 +1150,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
struct perf_event *event;
- struct hw_perf_event *hwc;
int idx, handled = 0;
u64 val;
@@ -1131,11 +1158,18 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /*
+ * Though we deactivated the counter some cpus
+ * might still deliver spurious interrupts still
+ * in flight. Catch them:
+ */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
continue;
+ }
event = cpuc->events[idx];
- hwc = &event->hw;
val = x86_perf_event_update(event);
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1144,14 +1178,14 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
/*
* event overflow
*/
- handled = 1;
+ handled++;
data.period = event->hw.last_period;
if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
if (handled)
@@ -1160,25 +1194,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
return handled;
}
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
void perf_events_lapic_init(void)
{
if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1190,12 +1205,20 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
+struct pmu_nmi_state {
+ unsigned int marked;
+ int handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
unsigned long cmd, void *__args)
{
struct die_args *args = __args;
- struct pt_regs *regs;
+ unsigned int this_nmi;
+ int handled;
if (!atomic_read(&active_events))
return NOTIFY_DONE;
@@ -1204,22 +1227,47 @@ perf_event_nmi_handler(struct notifier_block *self,
case DIE_NMI:
case DIE_NMI_IPI:
break;
-
+ case DIE_NMIUNKNOWN:
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+ /* let the kernel handle the unknown nmi */
+ return NOTIFY_DONE;
+ /*
+ * This one is a PMU back-to-back nmi. Two events
+ * trigger 'simultaneously' raising two back-to-back
+ * NMIs. If the first NMI handles both, the latter
+ * will be empty and daze the CPU. So, we drop it to
+ * avoid false-positive 'unknown nmi' messages.
+ */
+ return NOTIFY_STOP;
default:
return NOTIFY_DONE;
}
- regs = args->regs;
-
apic_write(APIC_LVTPC, APIC_DM_NMI);
- /*
- * Can't rely on the handled return value to say it was our NMI, two
- * events could trigger 'simultaneously' raising two back-to-back NMIs.
- *
- * If the first NMI handles both, the latter will be empty and daze
- * the CPU.
- */
- x86_pmu.handle_irq(regs);
+
+ handled = x86_pmu.handle_irq(args->regs);
+ if (!handled)
+ return NOTIFY_DONE;
+
+ this_nmi = percpu_read(irq_stat.__nmi_count);
+ if ((handled > 1) ||
+ /* the next nmi could be a back-to-back nmi */
+ ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+ (__get_cpu_var(pmu_nmi).handled > 1))) {
+ /*
+ * We could have two subsequent back-to-back nmis: The
+ * first handles more than one counter, the 2nd
+ * handles only one counter and the 3rd handles no
+ * counter.
+ *
+ * This is the 2nd nmi because the previous was
+ * handling more than one counter. We will mark the
+ * next (3rd) and then drop it if unhandled.
+ */
+ __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
+ __get_cpu_var(pmu_nmi).handled = handled;
+ }
return NOTIFY_STOP;
}
@@ -1335,7 +1383,6 @@ void __init init_hw_perf_events(void)
x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
}
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
- perf_max_events = x86_pmu.num_counters;
if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1371,6 +1418,7 @@ void __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+ perf_pmu_register(&pmu);
perf_cpu_notifier(x86_pmu_notifier);
}
@@ -1384,11 +1432,12 @@ static inline void x86_pmu_read(struct perf_event *event)
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
*/
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+ perf_pmu_disable(pmu);
+ cpuc->group_flag |= PERF_EVENT_TXN;
cpuc->n_txn = 0;
}
@@ -1397,16 +1446,17 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
* Clear the flag and pmu::enable() will perform the
* schedulability test.
*/
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
/*
* Truncate the collected events.
*/
cpuc->n_added -= cpuc->n_txn;
cpuc->n_events -= cpuc->n_txn;
+ perf_pmu_enable(pmu);
}
/*
@@ -1414,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
* Perform the group schedulability test as a whole
* Return 0 if success
*/
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int assign[X86_PMC_IDX_MAX];
@@ -1435,27 +1485,11 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
- /*
- * Clear out the txn count so that ->cancel_txn() which gets
- * run after ->commit_txn() doesn't undo things.
- */
- cpuc->n_txn = 0;
-
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
return 0;
}
-static const struct pmu pmu = {
- .enable = x86_pmu_enable,
- .disable = x86_pmu_disable,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
- .unthrottle = x86_pmu_unthrottle,
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
-};
-
/*
* validate that we can schedule this event
*/
@@ -1530,12 +1564,22 @@ out:
return ret;
}
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
{
- const struct pmu *tmp;
+ struct pmu *tmp;
int err;
- err = __hw_perf_event_init(event);
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ err = __x86_pmu_event_init(event);
if (!err) {
/*
* we temporarily connect event to its pmu
@@ -1555,26 +1599,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (err) {
if (event->destroy)
event->destroy(event);
- return ERR_PTR(err);
}
- return &pmu;
+ return err;
}
-/*
- * callchain support
- */
+static struct pmu pmu = {
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
+ .event_init = x86_pmu_event_init,
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
+};
+
+/*
+ * callchain support
+ */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1596,7 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- callchain_store(entry, addr);
+ perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
@@ -1607,13 +1656,15 @@ static const struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack_bp,
};
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ perf_callchain_store(entry, regs->ip);
dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
@@ -1642,7 +1693,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (fp < compat_ptr(regs->sp))
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = compat_ptr(frame.next_frame);
}
return 1;
@@ -1655,19 +1706,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
}
#endif
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
struct stack_frame frame;
const void __user *fp;
- if (!user_mode(regs))
- regs = task_pt_regs(current);
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
fp = (void __user *)regs->bp;
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->ip);
+ perf_callchain_store(entry, regs->ip);
if (perf_callchain_user32(regs, entry))
return;
@@ -1684,68 +1736,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
if ((unsigned long)fp < regs->sp)
break;
- callchain_store(entry, frame.return_address);
+ perf_callchain_store(entry, frame.return_address);
fp = frame.next_frame;
}
}
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return NULL;
- }
-
- if (in_nmi())
- entry = &__get_cpu_var(pmc_nmi_entry);
- else
- entry = &__get_cpu_var(pmc_irq_entry);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
-
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
- regs->ip = ip;
- /*
- * perf_arch_fetch_caller_regs adds another call, we need to increment
- * the skip level
- */
- regs->bp = rewind_frame_pointer(skip + 1);
- regs->cs = __KERNEL_CS;
- /*
- * We abuse bit 3 to pass exact information, see perf_misc_flags
- * and the comment with PERF_EFLAGS_EXACT.
- */
- regs->flags = 0;
-}
-
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..46d58448c3af 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 214ac860ebe0..c8f5c088cad1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added)
* Intel Errata AAP53 (model 30)
* Intel Errata BD53 (model 44)
*
- * These chips need to be 'reset' when adding counters by programming
- * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
- * either in sequence on the same PMC or on different PMCs.
+ * The official story:
+ * These chips need to be 'reset' when adding counters by programming the
+ * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ * in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
*/
-static void intel_pmu_nhm_enable_all(int added)
+static void intel_pmu_nhm_workaround(void)
{
- if (added) {
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ static const unsigned long nhm_magic[4] = {
+ 0x4300B5,
+ 0x4300D2,
+ 0x4300B1,
+ 0x4300B1
+ };
+ struct perf_event *event;
+ int i;
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+ /*
+ * The Errata requires below steps:
+ * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+ * 2) Configure 4 PERFEVTSELx with the magic events and clear
+ * the corresponding PMCx;
+ * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+ * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+ * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+ */
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+ /*
+ * The real steps we choose are a little different from above.
+ * A) To reduce MSR operations, we don't run step 1) as they
+ * are already cleared before this function is called;
+ * B) Call x86_perf_event_update to save PMCx before configuring
+ * PERFEVTSELx with magic number;
+ * C) With step 5), we do clear only when the PERFEVTSELx is
+ * not used currently.
+ * D) Call x86_perf_event_set_period to restore PMCx;
+ */
+
+ /* We always operate 4 pairs of PERF Counters */
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event)
+ x86_perf_event_update(event);
+ }
+
+ for (i = 0; i < 4; i++) {
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+ wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+ }
- for (i = 0; i < 3; i++) {
- struct perf_event *event = cpuc->events[i];
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
- if (!event)
- continue;
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event) {
+ x86_perf_event_set_period(event);
__x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
- }
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ } else
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
}
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added)
+ intel_pmu_nhm_workaround();
intel_pmu_enable_all(added);
}
@@ -667,22 +712,24 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
int bit, loops;
- u64 ack, status;
+ u64 status;
+ int handled;
perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
intel_pmu_disable_all();
- intel_pmu_drain_bts_buffer();
+ handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
if (!status) {
intel_pmu_enable_all(0);
- return 0;
+ return handled;
}
loops = 0;
again:
+ intel_pmu_ack_status(status);
if (++loops > 100) {
WARN_ONCE(1, "perfevents: irq loop stuck!\n");
perf_event_print_debug();
@@ -691,19 +738,22 @@ again:
}
inc_irq_stat(apic_perf_irqs);
- ack = status;
intel_pmu_lbr_read();
/*
* PEBS overflow sets bit 62 in the global status register
*/
- if (__test_and_clear_bit(62, (unsigned long *)&status))
+ if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+ handled++;
x86_pmu.drain_pebs(regs);
+ }
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
+ handled++;
+
if (!test_bit(bit, cpuc->active_mask))
continue;
@@ -713,11 +763,9 @@ again:
data.period = event->hw.last_period;
if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
- intel_pmu_ack_status(ack);
-
/*
* Repeat if there is more work to be done:
*/
@@ -727,7 +775,7 @@ again:
done:
intel_pmu_enable_all(0);
- return 1;
+ return handled;
}
static struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..b7dcd9f2b8a0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static int alloc_pebs_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ int node = cpu_to_node(cpu);
+ int max, thresh = 1; /* always use a single PEBS record */
+ void *buffer;
+
+ if (!x86_pmu.pebs)
+ return 0;
+
+ buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+ ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ ds->pebs_index = ds->pebs_buffer_base;
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+ max * x86_pmu.pebs_record_size;
+
+ ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+ thresh * x86_pmu.pebs_record_size;
+
+ return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ ds->pebs_buffer_base = 0;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ int node = cpu_to_node(cpu);
+ int max, thresh;
+ void *buffer;
+
+ if (!x86_pmu.bts)
+ return 0;
+
+ buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ thresh = max / 16;
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ thresh * BTS_RECORD_SIZE;
+
+ return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds || !x86_pmu.bts)
+ return;
+
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ ds->bts_buffer_base = 0;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+ int node = cpu_to_node(cpu);
+ struct debug_store *ds;
+
+ ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
+ if (unlikely(!ds))
+ return -ENOMEM;
+
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+ kfree(ds);
+}
+
static void release_ds_buffers(void)
{
int cpu;
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
return;
get_online_cpus();
-
for_each_online_cpu(cpu)
fini_debug_store_on_cpu(cpu);
for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- continue;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
-
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- kfree(ds);
+ release_pebs_buffer(cpu);
+ release_bts_buffer(cpu);
+ release_ds_buffer(cpu);
}
-
put_online_cpus();
}
-static int reserve_ds_buffers(void)
+static void reserve_ds_buffers(void)
{
- int cpu, err = 0;
+ int bts_err = 0, pebs_err = 0;
+ int cpu;
+
+ x86_pmu.bts_active = 0;
+ x86_pmu.pebs_active = 0;
if (!x86_pmu.bts && !x86_pmu.pebs)
- return 0;
+ return;
+
+ if (!x86_pmu.bts)
+ bts_err = 1;
+
+ if (!x86_pmu.pebs)
+ pebs_err = 1;
get_online_cpus();
for_each_possible_cpu(cpu) {
- struct debug_store *ds;
- void *buffer;
- int max, thresh;
+ if (alloc_ds_buffer(cpu)) {
+ bts_err = 1;
+ pebs_err = 1;
+ }
+
+ if (!bts_err && alloc_bts_buffer(cpu))
+ bts_err = 1;
+
+ if (!pebs_err && alloc_pebs_buffer(cpu))
+ pebs_err = 1;
- err = -ENOMEM;
- ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds))
+ if (bts_err && pebs_err)
break;
- per_cpu(cpu_hw_events, cpu).ds = ds;
-
- if (x86_pmu.bts) {
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
- }
+ }
- if (x86_pmu.pebs) {
- buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
- ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
- /*
- * Always use single record PEBS
- */
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- x86_pmu.pebs_record_size;
- }
+ if (bts_err) {
+ for_each_possible_cpu(cpu)
+ release_bts_buffer(cpu);
+ }
- err = 0;
+ if (pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_pebs_buffer(cpu);
}
- if (err)
- release_ds_buffers();
- else {
+ if (bts_err && pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+ } else {
+ if (x86_pmu.bts && !bts_err)
+ x86_pmu.bts_active = 1;
+
+ if (x86_pmu.pebs && !pebs_err)
+ x86_pmu.pebs_active = 1;
+
for_each_online_cpu(cpu)
init_debug_store_on_cpu(cpu);
}
put_online_cpus();
-
- return err;
}
/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
update_debugctlmsr(debugctlmsr);
}
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
struct pt_regs regs;
if (!event)
- return;
+ return 0;
- if (!ds)
- return;
+ if (!x86_pmu.bts_active)
+ return 0;
at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
top = (struct bts_record *)(unsigned long)ds->bts_index;
if (top <= at)
- return;
+ return 0;
ds->bts_index = ds->bts_buffer_base;
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
perf_prepare_sample(&header, &data, event, &regs);
if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
- return;
+ return 1;
for (; at < top; at++) {
data.ip = at->from;
@@ -270,6 +355,7 @@ static void intel_pmu_drain_bts_buffer(void)
/* There's new data available. */
event->hw.interrupts++;
event->pending_kill = POLL_IN;
+ return 1;
}
/*
@@ -491,7 +577,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.flags &= ~PERF_EFLAGS_EXACT;
if (perf_event_overflow(event, 1, &data, &regs))
- x86_pmu_stop(event);
+ x86_pmu_stop(event, 0);
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
struct pebs_record_core *at, *top;
int n;
- if (!ds || !x86_pmu.pebs)
+ if (!x86_pmu.pebs_active)
return;
at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
u64 status = 0;
int bit, n;
- if (!ds || !x86_pmu.pebs)
+ if (!x86_pmu.pebs_active)
return;
at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -629,9 +715,8 @@ static void intel_ds_init(void)
#else /* CONFIG_CPU_SUP_INTEL */
-static int reserve_ds_buffers(void)
+static void reserve_ds_buffers(void)
{
- return 0;
}
static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..81400b93e694 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,25 +18,41 @@
struct p4_event_bind {
unsigned int opcode; /* Event code and ESCR selector */
unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ unsigned int escr_emask; /* valid ESCR EventMask bits */
+ unsigned int shared; /* event is shared across threads */
char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
};
-struct p4_cache_event_bind {
+struct p4_pebs_bind {
unsigned int metric_pebs;
unsigned int metric_vert;
};
-#define P4_GEN_CACHE_EVENT_BIND(name) \
- [P4_CACHE__##name] = { \
- .metric_pebs = P4_PEBS__##name, \
- .metric_vert = P4_VERT__##name, \
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+ [P4_PEBS_METRIC__##name] = { \
+ .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+ .metric_vert = vert, \
}
-static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
- P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+ P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+ P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+ P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
+ P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
+ P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
+ P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
};
/*
@@ -52,239 +68,443 @@ static struct p4_event_bind p4_event_bind_map[] = {
[P4_EVENT_TC_DELIVER_MODE] = {
.opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+ .shared = 1,
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_BPU_FETCH_REQUEST] = {
.opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
.escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_ITLB_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
.escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_MEMORY_CANCEL] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MEMORY_COMPLETE] = {
.opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_LOAD_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_STORE_PORT_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
.escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_MOB_LOAD_REPLAY] = {
.opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
.escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_PAGE_WALK_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
.escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_CACHE_REFERENCE] = {
.opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ALLOCATION] = {
.opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_FSB_DATA_ACTIVITY] = {
.opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+ .shared = 1,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
.escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
.cntr = { {0, -1, -1}, {1, -1, -1} },
},
[P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
.opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
.escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
.cntr = { {2, -1, -1}, {3, -1, -1} },
},
[P4_EVENT_SSE_INPUT_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_PACKED_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_SP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_SCALAR_DP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_64BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_128BIT_MMX_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_X87_FP_UOP] = {
.opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
.escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_TC_MISC] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MISC),
.escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_GLOBAL_POWER_EVENTS] = {
.opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_TC_MS_XFER] = {
.opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_UOP_QUEUE_WRITES] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
.escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RETIRED_BRANCH_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
.escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
.cntr = { {4, 5, -1}, {6, 7, -1} },
},
[P4_EVENT_RESOURCE_STALL] = {
.opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
.escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_WC_BUFFER] = {
.opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
.escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+ .shared = 1,
.cntr = { {8, 9, -1}, {10, 11, -1} },
},
[P4_EVENT_B2B_CYCLES] = {
.opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_BNR] = {
.opcode = P4_OPCODE(P4_EVENT_BNR),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_SNOOP] = {
.opcode = P4_OPCODE(P4_EVENT_SNOOP),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_RESPONSE] = {
.opcode = P4_OPCODE(P4_EVENT_RESPONSE),
.escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
.cntr = { {0, -1, -1}, {2, -1, -1} },
},
[P4_EVENT_FRONT_END_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_EXECUTION_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_REPLAY_EVENT] = {
.opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOPS_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_UOP_TYPE] = {
.opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
.escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
.opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_MACHINE_CLEAR] = {
.opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
.escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_INSTR_COMPLETED] = {
.opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
};
-#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
+#define P4_GEN_CACHE_EVENT(event, bit, metric) \
p4_config_pack_escr(P4_ESCR_EVENT(event) | \
P4_ESCR_EMASK_BIT(event, bit)) | \
- p4_config_pack_cccr(cache_event | \
+ p4_config_pack_cccr(metric | \
P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +516,34 @@ static __initconst const u64 p4_hw_cache_event_ids
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__1stl_cache_load_miss_retired),
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired),
},
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__2ndl_cache_load_miss_retired),
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
},
},
[ C(DTLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_load_miss_retired),
+ P4_PEBS_METRIC__dtlb_load_miss_retired),
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0,
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_store_miss_retired),
+ P4_PEBS_METRIC__dtlb_store_miss_retired),
},
},
[ C(ITLB) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
- P4_CACHE__itlb_reference_hit),
+ P4_PEBS_METRIC__none),
[ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
- P4_CACHE__itlb_reference_miss),
+ P4_PEBS_METRIC__none),
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +634,81 @@ static u64 p4_pmu_event_map(int hw_event)
return config;
}
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+ /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+ if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+ if (boot_cpu_data.x86_model != 3 &&
+ boot_cpu_data.x86_model != 4 &&
+ boot_cpu_data.x86_model != 6)
+ return false;
+ }
+
+ /*
+ * For info
+ * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+ */
+
+ return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+ unsigned int v, emask;
+
+ /* User data may have out-of-bound event index */
+ v = p4_config_unpack_event(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_event_bind_map))
+ return -EINVAL;
+
+ /* It may be unsupported: */
+ if (!p4_event_match_cpu_model(v))
+ return -EINVAL;
+
+ /*
+ * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+ * in Architectural Performance Monitoring, it means not
+ * on _which_ logical cpu to count but rather _when_, ie it
+ * depends on logical cpu state -- count event if one cpu active,
+ * none, both or any, so we just allow user to pass any value
+ * desired.
+ *
+ * In turn we always set Tx_OS/Tx_USR bits bound to logical
+ * cpu without their propagation to another cpu
+ */
+
+ /*
+ * if an event is shared accross the logical threads
+ * the user needs special permissions to be able to use it
+ */
+ if (p4_event_bind_map[v].shared) {
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ /* ESCR EventMask bits may be invalid */
+ emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+ if (emask & ~p4_event_bind_map[v].escr_emask)
+ return -EINVAL;
+
+ /*
+ * it may have some invalid PEBS bits
+ */
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+ return -EINVAL;
+
+ v = p4_config_unpack_metric(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+ return -EINVAL;
+
+ return 0;
+}
+
static int p4_hw_config(struct perf_event *event)
{
int cpu = get_cpu();
int rc = 0;
- unsigned int evnt;
u32 escr, cccr;
/*
@@ -438,25 +728,21 @@ static int p4_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW) {
- /* user data may have out-of-bound event index */
- evnt = p4_config_unpack_event(event->attr.config);
- if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
- rc = -EINVAL;
+ /*
+ * Clear bits we reserve to be managed by kernel itself
+ * and never allowed from a user space
+ */
+ event->attr.config &= P4_CONFIG_MASK;
+
+ rc = p4_validate_raw_event(event);
+ if (rc)
goto out;
- }
/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- *
- * XXX: HT wide things should check perf_paranoid_cpu() &&
- * CAP_SYS_ADMIN
+ * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+ * bits since we keep additional info here (for cache events and etc)
*/
- event->hw.config |= event->attr.config &
- (p4_config_pack_escr(P4_ESCR_MASK_HT) |
- p4_config_pack_cccr(P4_CCCR_MASK_HT));
+ event->hw.config |= event->attr.config;
}
rc = x86_setup_perfctr(event);
@@ -482,6 +768,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
return overflow;
}
+static void p4_pmu_disable_pebs(void)
+{
+ /*
+ * FIXME
+ *
+ * It's still allowed that two threads setup same cache
+ * events so we can't simply clear metrics until we knew
+ * noone is depending on us, so we need kind of counter
+ * for "ReplayEvent" users.
+ *
+ * What is more complex -- RAW events, if user (for some
+ * reason) will pass some cache event metric with improper
+ * event opcode -- it's fine from hardware point of view
+ * but completely nonsence from "meaning" of such action.
+ *
+ * So at moment let leave metrics turned on forever -- it's
+ * ok for now but need to be revisited!
+ *
+ * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
+ * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+ */
+}
+
static inline void p4_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +816,26 @@ static void p4_pmu_disable_all(void)
continue;
p4_pmu_disable_event(event);
}
+
+ p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+ struct p4_pebs_bind *bind;
+ unsigned int idx;
+
+ BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+ idx = p4_config_unpack_metric(config);
+ if (idx == P4_PEBS_METRIC__none)
+ return;
+
+ bind = &p4_pebs_bind_map[idx];
+
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
}
static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +844,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
int thread = p4_ht_config_thread(hwc->config);
u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
unsigned int idx = p4_config_unpack_event(hwc->config);
- unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
struct p4_event_bind *bind;
- struct p4_cache_event_bind *bind_cache;
u64 escr_addr, cccr;
bind = &p4_event_bind_map[idx];
@@ -537,16 +864,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
cccr = p4_config_unpack_cccr(hwc->config);
/*
- * it could be Cache event so that we need to
- * set metrics into additional MSRs
+ * it could be Cache event so we need to write metrics
+ * into additional MSRs
*/
- BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
- if (idx_cache > P4_CACHE__NONE &&
- idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
- bind_cache = &p4_cache_event_bind_map[idx_cache];
- (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
- (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
- }
+ p4_pmu_enable_pebs(hwc->config);
(void)checking_wrmsrl(escr_addr, escr_conf);
(void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -581,9 +902,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ int overflow;
- if (!test_bit(idx, cpuc->active_mask))
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /* catch in-flight IRQs */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
continue;
+ }
event = cpuc->events[idx];
hwc = &event->hw;
@@ -591,12 +917,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
WARN_ON_ONCE(hwc->idx != idx);
/* it might be unflagged overflow */
- handled = p4_pmu_clear_cccr_ovf(hwc);
+ overflow = p4_pmu_clear_cccr_ovf(hwc);
val = x86_perf_event_update(event);
- if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
continue;
+ handled += overflow;
+
/* event overflow for sure */
data.period = event->hw.last_period;
@@ -829,6 +1157,15 @@ static __initconst const struct x86_pmu p4_pmu = {
.max_period = (1ULL << 39) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
+ /*
+ * This handles erratum N15 in intel doc 249199-029,
+ * the counter may not be updated correctly on write
+ * so we need a second write operation to do the trick
+ * (the official workaround didn't work)
+ *
+ * the former idea is taken from OProfile code
+ */
+ .perfctr_second_write = 1,
};
static __init int p4_pmu_init(void)
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..d9f4ff8fcd69 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
- return;
- wd_ops = &k7_wd_ops;
- break;
+ if (boot_cpu_data.x86 == 6 ||
+ (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
+ wd_ops = &k7_wd_ops;
+ return;
case X86_VENDOR_INTEL:
/* Work around where perfctr1 doesn't have a working enable
* bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..c7f64e6f537a
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,70 @@
+/*
+ * Routines to indentify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
+ { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
+ { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
+ { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
+ { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
+ { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
+ { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
+ { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
+ { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
+ { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
+ { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
+ { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
+ { 0, 0, 0, 0, 0 }
+ };
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
+ &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
/*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
*/
-#include <linux/cpu.h>
+#include <linux/cpu.h>
+#include <asm/apic.h>
#include <asm/pat.h>
#include <asm/processor.h>
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
/* leaf 0xb SMT level */
#define SMT_LEVEL 0
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
+
+ if (!preset_lpj) {
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
return tsc_hz;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+int in_crash_kexec;
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
static void kdump_nmi_shootdown_cpus(void)
{
+ in_crash_kexec = 1;
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..d5cd13945d5a 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
- vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+ vaddr = kmap_atomic_pfn(pfn);
if (!userbuf) {
memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..994828899e09 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!csize)
return 0;
- vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
} else
memcpy(buf, vaddr + offset, csize);
+ set_iounmap_nonlazy();
iounmap(vaddr);
return csize;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-#include <linux/uaccess.h>
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-struct stack_frame_ia32 {
- u32 next_frame;
- u32 return_address;
-};
-
-static inline unsigned long rewind_frame_pointer(int n)
-{
- struct stack_frame *frame;
-
- get_bp(frame);
-
-#ifdef CONFIG_FRAME_POINTER
- while (n--) {
- if (probe_kernel_address(&frame->next_frame, frame))
- break;
- }
-#endif
-
- return (unsigned long)frame;
-}
-
-#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..1bc7f75a5bda 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
@@ -84,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (kstack_end(stack))
break;
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %08lx", *stack++);
+ printk(KERN_CONT "\n");
+ printk(KERN_CONT " %08lx", *stack++);
touch_nmi_watchdog();
}
- printk("\n");
+ printk(KERN_CONT "\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..6a340485249a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
#define N_EXCEPTION_STACKS_END \
(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
@@ -266,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack >= irq_stack && stack <= irq_stack_end) {
if (stack == irq_stack_end) {
stack = (unsigned long *) (irq_stack_end[-1]);
- printk(" <EOI> ");
+ printk(KERN_CONT " <EOI> ");
}
} else {
if (((long) stack & (THREAD_SIZE-1)) == 0)
break;
}
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %016lx", *stack++);
+ printk(KERN_CONT "\n");
+ printk(KERN_CONT " %016lx", *stack++);
touch_nmi_watchdog();
}
preempt_enable();
- printk("\n");
+ printk(KERN_CONT "\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..0c2b7ef7a34d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -15,6 +15,7 @@
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
+#include <linux/memblock.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -738,73 +739,7 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area(ei_start, ei_last, start, end,
- size, align);
-
- if (addr != -1ULL)
- return addr;
- }
- return -1ULL;
-}
-
-u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
-{
- return find_e820_area(start, end, size, align);
-}
-
-u64 __init get_max_mapped(void)
-{
- u64 end = max_pfn_mapped;
-
- end <<= PAGE_SHIFT;
-
- return end;
-}
-/*
- * Find next free range after *start
- */
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area_size(ei_start, ei_last, start,
- sizep, align);
-
- if (addr != -1ULL)
- return addr;
- }
-
- return -1ULL;
-}
-
-/*
- * pre allocated 4k and reserved it in e820
+ * pre allocated 4k and reserved it in memblock and e820_saved
*/
u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
{
@@ -813,8 +748,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
u64 start;
for (start = startt; ; start += size) {
- start = find_e820_area_size(start, &size, align);
- if (!(start + 1))
+ start = memblock_x86_find_in_range_size(start, &size, align);
+ if (start == MEMBLOCK_ERROR)
return 0;
if (size >= sizet)
break;
@@ -830,10 +765,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
addr = round_down(start + size - sizet, align);
if (addr < start)
return 0;
- e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ memblock_x86_reserve_range(addr, addr + sizet, "new next");
e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820 for early_reserve_e820\n");
- update_e820();
+ printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
update_e820_saved();
return addr;
@@ -895,74 +829,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
{
return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
}
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init e820_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - ((u64)ram << PAGE_SHIFT);
-}
static void early_panic(char *msg)
{
@@ -1210,3 +1076,48 @@ void __init setup_memory_map(void)
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
+
+void __init memblock_x86_fill(void)
+{
+ int i;
+ u64 end;
+
+ /*
+ * EFI may have more than 128 entries
+ * We are safe to enable resizing, beause memblock_x86_fill()
+ * is rather later for x86
+ */
+ memblock_can_resize = 1;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ end = ei->addr + ei->size;
+ if (end != (resource_size_t)end)
+ continue;
+
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+ continue;
+
+ memblock_add(ei->addr, ei->size);
+ }
+
+ memblock_analyze();
+ memblock_dump_all();
+}
+
+void __init memblock_find_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+ u64 free_size_pfn;
+ u64 mem_size_pfn;
+ /*
+ * need to find out used area below MAX_DMA_PFN
+ * need to use memblock to get free size in [0, MAX_DMA_PFN]
+ * at first, and assume boot_mem will not take below MAX_DMA_PFN
+ */
+ mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+ free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
+ set_dma_reserve(mem_size_pfn - free_size_pfn);
+#endif
+}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e5cc7e82e60d..76b8cd953dee 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,6 @@
#include <asm/apic.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/hpet.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -98,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -116,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
-#endif
static void __init ati_bugs(int num, int slot, int func)
{
@@ -192,21 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
-/*
- * Force the read back of the CMP register in hpet_next_event()
- * to work around the problem that the CMP register write seems to be
- * delayed. See hpet_next_event() for details.
- *
- * We do this on all SMBUS incarnations for now until we have more
- * information about the affected chipsets.
- */
-static void __init ati_hpet_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_HPET_TIMER
- hpet_readback_cmp = 1;
-#endif
-}
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -236,8 +218,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..4572f25f9325 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
+#include <asm/mrst.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
+#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+ if (!strncmp(buf, "mrst", 4)) {
+ mrst_early_console_init();
+ early_console_register(&early_mrst_console, keep);
+ }
+
+ if (!strncmp(buf, "hsu", 3)) {
+ hsu_early_console_init();
+ early_console_register(&early_hsu_console, keep);
+ }
+
+#endif
buf++;
}
return 0;
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 000000000000..65df603622b2
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT 0x200000
+#define MRST_REGBASE_SPI0 0xff128000
+#define MRST_REGBASE_SPI1 0xff128400
+#define MRST_CLK_SPI0_REG 0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET 0
+
+#define SPI_FRF_OFFSET 4
+#define SPI_FRF_SPI 0x0
+#define SPI_FRF_SSP 0x1
+#define SPI_FRF_MICROWIRE 0x2
+#define SPI_FRF_RESV 0x3
+
+#define SPI_MODE_OFFSET 6
+#define SPI_SCPH_OFFSET 6
+#define SPI_SCOL_OFFSET 7
+#define SPI_TMOD_OFFSET 8
+#define SPI_TMOD_TR 0x0 /* xmit & recv */
+#define SPI_TMOD_TO 0x1 /* xmit only */
+#define SPI_TMOD_RO 0x2 /* recv only */
+#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET 10
+#define SPI_SRL_OFFSET 11
+#define SPI_CFS_OFFSET 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK 0x7f /* cover 7 bits */
+#define SR_BUSY (1 << 0)
+#define SR_TF_NOT_FULL (1 << 1)
+#define SR_TF_EMPT (1 << 2)
+#define SR_RF_NOT_EMPT (1 << 3)
+#define SR_RF_FULL (1 << 4)
+#define SR_TX_ERR (1 << 5)
+#define SR_DCOL (1 << 6)
+
+struct dw_spi_reg {
+ u32 ctrl0;
+ u32 ctrl1;
+ u32 ssienr;
+ u32 mwcr;
+ u32 ser;
+ u32 baudr;
+ u32 txfltr;
+ u32 rxfltr;
+ u32 txflr;
+ u32 rxflr;
+ u32 sr;
+ u32 imr;
+ u32 isr;
+ u32 risr;
+ u32 txoicr;
+ u32 rxoicr;
+ u32 rxuicr;
+ u32 msticr;
+ u32 icr;
+ u32 dmacr;
+ u32 dmatdlr;
+ u32 dmardlr;
+ u32 idr;
+ u32 version;
+
+ /* Currently operates as 32 bits, though only the low 16 bits matter */
+ u32 dr;
+} __packed;
+
+#define dw_readl(dw, name) __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessable address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ int i;
+
+ /* When run to this, we'd better re-init the HW */
+ mrst_early_console_init();
+
+ for (i = 0; i < l1; i++)
+ early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+ for (i = 0; i < l2; i++)
+ early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+ u16 config;
+
+ config = 0xc001;
+ dw_writel(pspi, dr, config);
+}
+
+/* Translate char to a eligible word and send to max3110 */
+static void max3110_write_data(char c)
+{
+ u16 data;
+
+ data = 0x8000 | c;
+ dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+ u32 ctrlr0 = 0;
+ u32 spi0_cdiv;
+ u32 freq; /* Freqency info only need be searched once */
+
+ /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+ pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ MRST_CLK_SPI0_REG);
+ spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+ freq = 100000000 / (spi0_cdiv + 1);
+
+ if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+ mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+ pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ mrst_spi_paddr);
+
+ /* Disable SPI controller */
+ dw_writel(pspi, ssienr, 0);
+
+ /* Set control param, 8 bits, transmit only mode */
+ ctrlr0 = dw_readl(pspi, ctrl0);
+
+ ctrlr0 &= 0xfcc0;
+ ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+ | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+ dw_writel(pspi, ctrl0, ctrlr0);
+
+ /*
+ * Change the spi0 clk to comply with 115200 bps, use 100000 to
+ * calculate the clk dividor to make the clock a little slower
+ * than real baud rate.
+ */
+ dw_writel(pspi, baudr, freq/100000);
+
+ /* Disable all INT for early phase */
+ dw_writel(pspi, imr, 0x0);
+
+ /* Set the cs to spi-uart */
+ dw_writel(pspi, ser, 0x2);
+
+ /* Enable the HW, the last step for HW init */
+ dw_writel(pspi, ssienr, 0x1);
+
+ /* Set the default configuration */
+ max3110_write_config();
+
+ /* Register the kmsg dumper */
+ if (!dumper_registered) {
+ dw_dumper.dump = dw_kmsg_dump;
+ kmsg_dump_register(&dw_dumper);
+ dumper_registered = 1;
+ }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+ unsigned int timeout;
+ u32 sr;
+
+ timeout = MRST_SPI_TIMEOUT;
+ /* Early putc needs to make sure the TX FIFO is not full */
+ while (--timeout) {
+ sr = dw_readl(pspi, sr);
+ if (!(sr & SR_TF_NOT_FULL))
+ cpu_relax();
+ else
+ break;
+ }
+
+ if (!timeout)
+ pr_warning("MRST earlycon: timed out\n");
+ else
+ max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_mrst_spi_putc('\r');
+ early_mrst_spi_putc(*str);
+ str++;
+ }
+}
+
+struct console early_mrst_console = {
+ .name = "earlymrst",
+ .write = early_mrst_spi_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR 0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+ u8 lcr;
+
+ phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ HSU_PORT2_PADDR);
+
+ /* Disable FIFO */
+ writeb(0x0, phsu + UART_FCR);
+
+ /* Set to default 115200 bps, 8n1 */
+ lcr = readb(phsu + UART_LCR);
+ writeb((0x80 | lcr), phsu + UART_LCR);
+ writeb(0x18, phsu + UART_DLL);
+ writeb(lcr, phsu + UART_LCR);
+ writel(0x3600, phsu + UART_MUL*4);
+
+ writeb(0x8, phsu + UART_MCR);
+ writeb(0x7, phsu + UART_FCR);
+ writeb(0x3, phsu + UART_LCR);
+
+ /* Clear IRQ status */
+ readb(phsu + UART_LSR);
+ readb(phsu + UART_RX);
+ readb(phsu + UART_IIR);
+ readb(phsu + UART_MSR);
+
+ /* Enable FIFO */
+ writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+ unsigned int timeout = 10000; /* 10ms */
+ u8 status;
+
+ while (--timeout) {
+ status = readb(phsu + UART_LSR);
+ if (status & BOTH_EMPTY)
+ break;
+ udelay(1);
+ }
+
+ /* Only write the char when there was no timeout */
+ if (timeout)
+ writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_hsu_putc('\r');
+ early_hsu_putc(*str);
+ str++;
+ }
+}
+
+struct console early_hsu_console = {
+ .name = "earlyhsu",
+ .write = early_hsu_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..59e175e89599 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
/* unfortunately push/pop can't be no-op */
.macro PUSH_GS
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
.endm
.macro POP_GS pop=0
addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
#else /* CONFIG_X86_32_LAZY_GS */
.macro PUSH_GS
- pushl %gs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %gs
/*CFI_REL_OFFSET gs, 0*/
.endm
.macro POP_GS pop=0
-98: popl %gs
- CFI_ADJUST_CFA_OFFSET -4
+98: popl_cfi %gs
/*CFI_RESTORE gs*/
.if \pop <> 0
add $\pop, %esp
@@ -195,35 +192,25 @@
.macro SAVE_ALL
cld
PUSH_GS
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0;*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0;*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0;*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl $(__USER_DS), %edx
movl %edx, %ds
@@ -234,39 +221,29 @@
.endm
.macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebx
CFI_RESTORE ebx
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edx
CFI_RESTORE edx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %esi
CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %edi
CFI_RESTORE edi
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ebp
CFI_RESTORE ebp
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
CFI_RESTORE eax
.endm
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
-1: popl %ds
- CFI_ADJUST_CFA_OFFSET -4
+1: popl_cfi %ds
/*CFI_RESTORE ds;*/
-2: popl %es
- CFI_ADJUST_CFA_OFFSET -4
+2: popl_cfi %es
/*CFI_RESTORE es;*/
-3: popl %fs
- CFI_ADJUST_CFA_OFFSET -4
+3: popl_cfi %fs
/*CFI_RESTORE fs;*/
POP_GS \pop
.pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
ENTRY(ret_from_fork)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- pushl $0x0202 # Reset kernel eflags
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
+ pushl_cfi $0x0202 # Reset kernel eflags
+ popfl_cfi
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
* enough kernel state to call TRACE_IRQS_OFF can be called - but
* we immediately enable interrupts at that point anyway.
*/
- pushl $(__USER_DS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__USER_DS
/*CFI_REL_OFFSET ss, 0*/
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET esp, 0
- pushfl
+ pushfl_cfi
orl $X86_EFLAGS_IF, (%esp)
- CFI_ADJUST_CFA_OFFSET 4
- pushl $(__USER_CS)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__USER_CS
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -486,8 +453,7 @@ sysenter_audit:
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
call audit_syscall_entry
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
- CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
@@ -611,18 +575,16 @@ ldt_ss:
* compensating for the offset by changing to the ESPFIX segment with
* a base address that matches for the difference.
*/
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
mov %esp, %edx /* load kernel esp */
mov PT_OLDESP(%esp), %eax /* load userspace esp */
mov %dx, %ax /* eax: new kernel esp */
sub %eax, %edx /* offset (low word is 0) */
- PER_CPU(gdt_page, %ebx)
shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
- pushl $__ESPFIX_SS
- CFI_ADJUST_CFA_OFFSET 4
- push %eax /* new kernel esp */
- CFI_ADJUST_CFA_OFFSET 4
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl_cfi $__ESPFIX_SS
+ pushl_cfi %eax /* new kernel esp */
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and
ALIGN
work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %ecx
movl %eax, %esp
#else
movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
#define PTREGSCALL3(name) \
ALIGN; \
ptregs_##name: \
+ CFI_STARTPROC; \
leal 4(%esp),%eax; \
- pushl %eax; \
+ pushl_cfi %eax; \
movl PT_EDX(%eax),%ecx; \
movl PT_ECX(%eax),%edx; \
movl PT_EBX(%eax),%eax; \
call sys_##name; \
addl $4,%esp; \
- ret
+ CFI_ADJUST_CFA_OFFSET -4; \
+ ret; \
+ CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
ALIGN;
ptregs_clone:
+ CFI_STARTPROC
leal 4(%esp),%eax
- pushl %eax
- pushl PT_EDI(%eax)
+ pushl_cfi %eax
+ pushl_cfi PT_EDI(%eax)
movl PT_EDX(%eax),%ecx
movl PT_ECX(%eax),%edx
movl PT_EBX(%eax),%eax
call sys_clone
addl $8,%esp
+ CFI_ADJUST_CFA_OFFSET -8
ret
+ CFI_ENDPROC
+ENDPROC(ptregs_clone)
.macro FIXUP_ESPFIX_STACK
/*
@@ -791,15 +759,12 @@ ptregs_clone:
* normal stack and adjusts ESP with the matching offset.
*/
/* fixup the stack */
- PER_CPU(gdt_page, %ebx)
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
- pushl $__KERNEL_DS
- CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $__KERNEL_DS
+ pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
@@ -836,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
.endif
-1: pushl $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 4
+1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -877,8 +841,7 @@ ENDPROC(common_interrupt)
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
- pushl $~(nr); \
- CFI_ADJUST_CFA_OFFSET 4; \
+ pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
@@ -894,27 +857,24 @@ ENDPROC(name)
ENTRY(coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_error
jmp error_code
CFI_ENDPROC
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl $do_general_protection
+661: pushl_cfi $do_general_protection
662:
.section .altinstructions,"a"
.balign 4
.long 661b
.long 663f
- .byte X86_FEATURE_XMM
+ .word X86_FEATURE_XMM
.byte 662b-661b
.byte 664f-663f
.previous
@@ -923,19 +883,16 @@ ENTRY(simd_coprocessor_error)
664:
.previous
#else
- pushl $do_simd_coprocessor_error
+ pushl_cfi $do_simd_coprocessor_error
#endif
- CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_device_not_available
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
+ pushl_cfi $do_device_not_available
jmp error_code
CFI_ENDPROC
END(device_not_available)
@@ -957,82 +914,68 @@ END(native_irq_enable_sysexit)
ENTRY(overflow)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_overflow
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_overflow
jmp error_code
CFI_ENDPROC
END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_bounds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_bounds
jmp error_code
CFI_ENDPROC
END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_invalid_op
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_invalid_op
jmp error_code
CFI_ENDPROC
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_segment_overrun
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
CFI_ENDPROC
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
- pushl $do_invalid_TSS
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
- pushl $do_segment_not_present
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
- pushl $do_stack_segment
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
- pushl $do_alignment_check
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0 # no error code
+ pushl_cfi $do_divide_error
jmp error_code
CFI_ENDPROC
END(divide_error)
@@ -1040,10 +983,8 @@ END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl machine_check_vector
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi machine_check_vector
jmp error_code
CFI_ENDPROC
END(machine_check)
@@ -1051,10 +992,8 @@ END(machine_check)
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_spurious_interrupt_bug
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
+ pushl_cfi $do_spurious_interrupt_bug
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
@@ -1085,8 +1024,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $0
SAVE_ALL
TRACE_IRQS_OFF
@@ -1122,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
+5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
@@ -1166,6 +1101,9 @@ ENTRY(xen_failsafe_callback)
.previous
ENDPROC(xen_failsafe_callback)
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+ xen_evtchn_do_upcall)
+
#endif /* CONFIG_XEN */
#ifdef CONFIG_FUNCTION_TRACER
@@ -1285,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
ENTRY(page_fault)
RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_page_fault
ALIGN
error_code:
/* the function address is in %gs's slot on the stack */
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %es
/*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edi
CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %esi
CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %edx
CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
cld
movl $(__KERNEL_PERCPU), %ecx
@@ -1360,12 +1287,9 @@ END(page_fault)
movl TSS_sysenter_sp0 + \offset(%esp), %esp
CFI_DEF_CFA esp, 0
CFI_UNDEFINED eip
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- pushl $__KERNEL_CS
- CFI_ADJUST_CFA_OFFSET 4
- pushl $sysenter_past_esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushfl_cfi
+ pushl_cfi $__KERNEL_CS
+ pushl_cfi $sysenter_past_esp
CFI_REL_OFFSET eip, 0
.endm
@@ -1375,8 +1299,7 @@ ENTRY(debug)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # error code 0
@@ -1396,32 +1319,27 @@ END(debug)
*/
ENTRY(nmi)
RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
je nmi_espfix_stack
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
*/
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
+ popl_cfi %eax
jae nmi_stack_correct
cmpl $ia32_sysenter_target,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
/* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
@@ -1450,18 +1368,14 @@ nmi_espfix_stack:
*
* create the pointer to lss back
*/
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %ss
+ pushl_cfi %esp
addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi 16(%esp)
.endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi %eax
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
@@ -1475,8 +1389,7 @@ END(nmi)
ENTRY(int3)
RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
@@ -1488,8 +1401,7 @@ END(int3)
ENTRY(general_protection)
RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
+ pushl_cfi $do_general_protection
jmp error_code
CFI_ENDPROC
END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ffa..fe2690d71c0c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
.macro FAKE_STACK_FRAME child_rip
/* push in order ss, rsp, eflags, cs, rip */
xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_DS /* ss */
/*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi \child_rip /* rip */
CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rax /* orig rax */
.endm
.macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
+ pushq_cfi kernel_eflags(%rip)
+ popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -521,11 +513,9 @@ sysret_careful:
jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
jmp sysret_check
/* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
@@ -714,9 +700,8 @@ END(ptregscall_common)
ENTRY(stub_execve)
CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
+ addq $8, %rsp
+ PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
+ PARTIAL_FRAME 0
SAVE_REST
movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -8
.endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
+1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
@@ -796,8 +780,8 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
+ subq $ORIG_RAX-ARGOFFSET+8, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
call save_args
PARTIAL_FRAME 0
call \func
@@ -822,6 +806,7 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rdi
call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
+ popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $~(\num)
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
@@ -981,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
-apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
- invalidate_interrupt0 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
- invalidate_interrupt1 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
- invalidate_interrupt2 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
- invalidate_interrupt3 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
- invalidate_interrupt4 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
- invalidate_interrupt5 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
- invalidate_interrupt6 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
- invalidate_interrupt7 smp_invalidate_interrupt
+.irpc idx, "01234567"
+apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
+ invalidate_interrupt\idx smp_invalidate_interrupt
+.endr
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1023,9 +993,9 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+ irq_work_interrupt smp_irq_work_interrupt
#endif
/*
@@ -1036,8 +1006,8 @@ ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1052,9 +1022,9 @@ END(\sym)
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
@@ -1065,21 +1035,21 @@ ENTRY(\sym)
END(\sym)
.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %r12)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
@@ -1089,8 +1059,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call error_entry
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
@@ -1107,8 +1077,8 @@ END(\sym)
ENTRY(\sym)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
TRACE_IRQS_OFF
@@ -1139,16 +1109,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
/* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
+ pushfq_cfi
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
+ popfq_cfi
ret
CFI_ENDPROC
END(native_load_gs_index)
@@ -1185,13 +1153,13 @@ END(kernel_thread_helper)
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
*
* C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
+ * extern long execve(const char *name, char **argv, char **envp)
*
* asm input arguments:
* rdi: name, rsi: argv, rdx: envp
*
* We want to fallback into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
+ * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
*
* do_sys_execve asm fallback arguments:
* rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
@@ -1215,8 +1183,7 @@ END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1192,7 @@ ENTRY(call_softirq)
push %rbp # backlink for old unwinder
call __do_softirq
leaveq
+ CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
decl PER_CPU_VAR(irq_count)
@@ -1329,6 +1297,9 @@ ENTRY(xen_failsafe_callback)
CFI_ENDPROC
END(xen_failsafe_callback)
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
#endif /* CONFIG_XEN */
/*
@@ -1365,7 +1336,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
/* ebx: no swapgs flag */
ENTRY(paranoid_exit)
- INTR_FRAME
+ DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
@@ -1442,7 +1413,6 @@ error_swapgs:
error_sti:
TRACE_IRQS_OFF
ret
- CFI_ENDPROC
/*
* There are two places in the kernel that can potentially fault with
@@ -1467,6 +1437,7 @@ bstep_iret:
/* Fix truncated RIP */
movq %rcx,RIP+8(%rsp)
jmp error_swapgs
+ CFI_ENDPROC
END(error_entry)
@@ -1495,8 +1466,8 @@ ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
+ subq $ORIG_RAX-R15, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..3afb33f14d2d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
return mod_code_status;
}
-
-
-
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
-
static unsigned char *ftrace_nop_replace(void)
{
- return ftrace_nop;
+ return ideal_nop5;
}
static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
int __init ftrace_dyn_arch_init(void *data)
{
- extern const unsigned char ftrace_test_p6nop[];
- extern const unsigned char ftrace_test_nop5[];
- extern const unsigned char ftrace_test_jmp[];
- int faulted = 0;
-
- /*
- * There is no good nop for all x86 archs.
- * We will default to using the P6_NOP5, but first we
- * will test to make sure that the nop will actually
- * work on this CPU. If it faults, we will then
- * go to a lesser efficient 5 byte nop. If that fails
- * we then just use a jmp as our nop. This isn't the most
- * efficient nop, but we can not use a multi part nop
- * since we would then risk being preempted in the middle
- * of that nop, and if we enabled tracing then, it might
- * cause a system crash.
- *
- * TODO: check the cpuid to determine the best nop.
- */
- asm volatile (
- "ftrace_test_jmp:"
- "jmp ftrace_test_p6nop\n"
- "nop\n"
- "nop\n"
- "nop\n" /* 2 byte jmp + 3 bytes */
- "ftrace_test_p6nop:"
- P6_NOP5
- "jmp 1f\n"
- "ftrace_test_nop5:"
- ".byte 0x66,0x66,0x66,0x66,0x90\n"
- "1:"
- ".section .fixup, \"ax\"\n"
- "2: movl $1, %0\n"
- " jmp ftrace_test_nop5\n"
- "3: movl $2, %0\n"
- " jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(ftrace_test_p6nop, 2b)
- _ASM_EXTABLE(ftrace_test_nop5, 3b)
- : "=r"(faulted) : "0" (faulted));
-
- switch (faulted) {
- case 0:
- pr_info("converting mcount calls to 0f 1f 44 00 00\n");
- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
- break;
- case 1:
- pr_info("converting mcount calls to 66 66 66 66 90\n");
- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
- break;
- case 2:
- pr_info("converting mcount calls to jmp . + 5\n");
- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
- break;
- }
-
/* The return code is retured via data */
*(unsigned long *)data = 0;
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..af0699ba48cf 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/memblock.h>
#include <asm/setup.h>
#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
lowmem = 0x9f000;
/* reserve all memory between lowmem and the 1MB mark */
- reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
+ memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..763310165fa0 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
#include <linux/mm.h>
+#include <linux/memblock.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -17,10 +18,11 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/tlbflush.h>
static void __init i386_default_early_setup(void)
{
- /* Initilize 32bit specific setup functions */
+ /* Initialize 32bit specific setup functions */
x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
@@ -30,17 +32,18 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
+ memblock_init();
+
#ifdef CONFIG_X86_TRAMPOLINE
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
- reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
- "EX TRAMPOLINE");
+ memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
#endif
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -49,7 +52,7 @@ void __init i386_start_kernel(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..2d2673c28aff 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
+#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -79,6 +80,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
/* Cleanup the over mapped high alias */
cleanup_highmap();
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -98,7 +101,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_init();
+
+ memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -107,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..bcece91dd311 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
movsl
1:
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
+
#ifdef CONFIG_PARAVIRT
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
@@ -177,13 +183,12 @@ default_entry:
#ifdef CONFIG_X86_PAE
/*
- * In PAE mode swapper_pg_dir is statically defined to contain enough
- * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
- * entries). The identity mapping is handled by pointing two PGD
- * entries to the first kernel PMD.
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD.
*
- * Note the upper half of each PMD or PTE are always zero at
- * this stage.
+ * Note the upper half of each PMD or PTE are always zero at this stage.
*/
#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -191,7 +196,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_pmd), %edx
+ movl $pa(initial_pg_pmd), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -220,14 +225,14 @@ default_entry:
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_dir), %edx
+ movl $pa(initial_page_table), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -251,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_dir+0xffc)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_page_table+0xffc)
#endif
jmp 3f
/*
@@ -328,7 +333,7 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl $pa(swapper_pg_dir),%eax
+ movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $X86_CR0_PG,%eax
@@ -615,16 +620,18 @@ ENTRY(initial_code)
__PAGE_ALIGNED_BSS
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
-swapper_pg_pmd:
+initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(swapper_pg_dir)
+ENTRY(initial_page_table)
.fill 1024,4,0
#endif
-swapper_pg_fixmap:
+initial_pg_fixmap:
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
+ENTRY(swapper_pg_dir)
+ .fill 1024,4,0
/*
* This starts the data section.
@@ -633,20 +640,20 @@ ENTRY(empty_zero_page)
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+ENTRY(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
* init data section till per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq initial_gs(%rip),%rax
- movq %rax,%rdx
- shrq $32,%rdx
+ movl initial_gs(%rip),%eax
+ movl initial_gs+4(%rip),%edx
wrmsr
/* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..ae03cab4352e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
#include <asm/hpet.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
@@ -36,7 +35,6 @@
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
u8 hpet_msi_disable;
-u8 hpet_readback_cmp;
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
@@ -382,40 +380,35 @@ static int hpet_next_event(unsigned long delta,
struct clock_event_device *evt, int timer)
{
u32 cnt;
+ s32 res;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
- * We need to read back the CMP register on certain HPET
- * implementations (ATI chipsets) which seem to delay the
- * transfer of the compare register into the internal compare
- * logic. With small deltas this might actually be too late as
- * the counter could already be higher than the compare value
- * at that point and we would wait for the next hpet interrupt
- * forever. We found out that reading the CMP register back
- * forces the transfer so we can rely on the comparison with
- * the counter register below.
- *
- * That works fine on those ATI chipsets, but on newer Intel
- * chipsets (ICH9...) this triggers due to an erratum: Reading
- * the comparator immediately following a write is returning
- * the old value.
- *
- * We restrict the read back to the affected ATI chipsets (set
- * by quirks) and also run it with hpet=verbose for debugging
- * purposes.
+ * HPETs are a complete disaster. The compare register is
+ * based on a equal comparison and neither provides a less
+ * than or equal functionality (which would require to take
+ * the wraparound into account) nor a simple count down event
+ * mode. Further the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). We worked around that by reading
+ * back the compare register, but that required another
+ * workaround for ICH9,10 chips where the first readout after
+ * write can return the old stale value. We already have a
+ * minimum delta of 5us enforced, but a NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than 8 HPET cycles
+ * away from the event or if the counter is already ahead of
+ * the event.
*/
- if (hpet_readback_cmp || hpet_verbose) {
- u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- if (cmp != cnt)
- printk_once(KERN_WARNING
- "hpet: compare register read back failed.\n");
- }
-
- return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+ return res < 8 ? -ETIME : 0;
}
static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -438,9 +431,9 @@ static int hpet_legacy_next_event(unsigned long delta,
static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
static struct hpet_dev *hpet_devs;
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
/* unmask it */
@@ -449,10 +442,10 @@ void hpet_msi_unmask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask(struct irq_data *data)
{
+ struct hpet_dev *hdev = data->handler_data;
unsigned int cfg;
- struct hpet_dev *hdev = get_irq_data(irq);
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -460,18 +453,14 @@ void hpet_msi_mask(unsigned int irq)
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
}
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
msg->address_hi = 0;
@@ -504,7 +493,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
{
unsigned int irq;
- irq = create_irq();
+ irq = create_irq_nr(0, -1);
if (!irq)
return -EINVAL;
@@ -583,7 +572,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
* scaled math multiplication factor for nanosecond to hpet tick
* conversion.
*/
- hpet_freq = 1000000000000000ULL;
+ hpet_freq = FSEC_PER_SEC;
do_div(hpet_freq, hpet_period);
evt->mult = div_sc((unsigned long) hpet_freq,
NSEC_PER_SEC, evt->shift);
@@ -724,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_ONLINE:
- INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
+ INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
init_completion(&work.complete);
/* FIXME: add schedule_work_on() */
schedule_delayed_work_on(cpu, &work.work, 0);
@@ -787,7 +776,6 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
@@ -798,6 +786,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
u64 start, now;
+ u64 hpet_freq;
cycle_t t1;
/* Start the counter */
@@ -832,9 +821,15 @@ static int hpet_clocksource_register(void)
* mult = (hpet_period * 2^shift)/10^6
* mult = (hpet_period << shift)/FSEC_PER_NSEC
*/
- clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
- clocksource_register(&clocksource_hpet);
+ /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
+ *
+ * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
+ * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
+ */
+ hpet_freq = FSEC_PER_SEC;
+ do_div(hpet_freq, hpet_period);
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
return 0;
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..ff15c9dcc25d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -206,6 +206,25 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
+ *gen_type = HW_BREAKPOINT_X;
+ *gen_len = sizeof(long);
+ return 0;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
/* Len */
switch (x86_len) {
case X86_BREAKPOINT_LEN_1:
@@ -226,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
return -EINVAL;
}
- /* Type */
- switch (x86_type) {
- case X86_BREAKPOINT_EXECUTE:
- *gen_type = HW_BREAKPOINT_X;
- break;
- case X86_BREAKPOINT_WRITE:
- *gen_type = HW_BREAKPOINT_W;
- break;
- case X86_BREAKPOINT_RW:
- *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
@@ -251,6 +255,29 @@ static int arch_build_bp_info(struct perf_event *bp)
info->address = bp->attr.bp_addr;
+ /* Type */
+ switch (bp->attr.bp_type) {
+ case HW_BREAKPOINT_W:
+ info->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ info->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ info->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check userspace is not trying to setup
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (bp->attr.bp_len == sizeof(long)) {
+ info->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+
/* Len */
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
@@ -271,21 +298,6 @@ static int arch_build_bp_info(struct perf_event *bp)
return -EINVAL;
}
- /* Type */
- switch (bp->attr.bp_type) {
- case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
- break;
- case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
- break;
- case HW_BREAKPOINT_X:
- info->type = X86_BREAKPOINT_EXECUTE;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
/*
@@ -466,6 +478,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
perf_bp_event(bp, args->regs);
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ args->regs->flags |= X86_EFLAGS_RF;
+
rcu_read_unlock();
}
/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..58bb239a2fd7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -40,6 +40,7 @@
static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
static struct i387_fxsave_struct fx_scratch __cpuinitdata;
@@ -59,62 +60,68 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __cpuinit init_thread_xstate(void)
+static void __cpuinit init_thread_xstate(void)
{
+ /*
+ * Note that xstate_size might be overwriten later during
+ * xsave_init().
+ */
+
if (!HAVE_HWFP) {
+ /*
+ * Disable xsave as we do not support it if i387
+ * emulation is enabled.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
xstate_size = sizeof(struct i387_soft_struct);
return;
}
- if (cpu_has_xsave) {
- xsave_cntxt_init();
- return;
- }
-
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
else
xstate_size = sizeof(struct i387_fsave_struct);
-#endif
}
-#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
+
void __cpuinit fpu_init(void)
{
- unsigned long oldcr0 = read_cr0();
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
-
- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+ if (cpu_has_fxsr)
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (cpu_has_xmm)
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ set_in_cr4(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!HAVE_HWFP)
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
- /*
- * Boot processor to setup the FP and extended state context info.
- */
if (!smp_processor_id())
init_thread_xstate();
- xsave_init();
mxcsr_feature_mask_init();
/* clean state in init */
current_thread_info()->status = 0;
clear_used_math();
}
-#endif /* CONFIG_X86_64 */
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
{
-#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
finit_soft_fpu(&fpu->state->soft);
return;
}
-#endif
if (cpu_has_fxsr) {
struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -132,6 +139,7 @@ static void fpu_finit(struct fpu *fpu)
fp->fos = 0xffff0000u;
}
}
+EXPORT_SYMBOL_GPL(fpu_finit);
/*
* The _current_ task is using the FPU for the first time
@@ -190,6 +198,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->thread.fpu.state->fxsave, 0, -1);
}
@@ -207,6 +217,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.fpu.state->fxsave, 0, -1);
@@ -374,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
#ifdef CONFIG_X86_64
env->fip = fxsave->rip;
env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+ savesegment(ds, env->fos);
} else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
+ env->fos = tsk->thread.ds;
}
+ env->fos |= 0xffff0000;
#else
env->fip = fxsave->fip;
env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
@@ -446,6 +456,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
-1);
}
+ sanitize_i387_state(target);
+
if (kbuf && pos == 0 && count == sizeof(env)) {
convert_from_fxsr(kbuf, target);
return 0;
@@ -467,6 +479,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
@@ -533,6 +547,9 @@ static int save_i387_xsave(void __user *buf)
struct _fpstate_ia32 __user *fx = buf;
int err = 0;
+
+ sanitize_i387_state(tsk);
+
/*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context.
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..20757cb2efa3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -29,24 +29,10 @@
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
*/
+static void init_8259A(int auto_eoi);
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-static void mask_8259A(void);
-static void unmask_8259A(void);
-static void disable_8259A_irq(unsigned int irq);
-static void enable_8259A_irq(unsigned int irq);
-static void init_8259A(int auto_eoi);
-static int i8259A_irq_pending(unsigned int irq);
-
-struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
/*
* 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-static void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
+}
+
static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
@@ -117,7 +113,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
+ i8259A_chip.name);
enable_irq(irq);
}
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!
*/
-static void mask_and_ack_8259A(unsigned int irq)
+static void mask_and_ack_8259A(struct irq_data *data)
{
+ unsigned int irq = data->irq;
unsigned int irqmask = 1 << irq;
unsigned long flags;
@@ -223,6 +220,14 @@ spurious_8259A_irq:
}
}
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
static char irq_trigger[2];
/**
* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi)
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
- i8259A_chip.mask_ack = disable_8259A_irq;
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
@@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi)
static void legacy_pic_noop(void) { };
static void legacy_pic_uint_noop(unsigned int unused) { };
static void legacy_pic_int_noop(int unused) { };
-
-static struct irq_chip dummy_pic_chip = {
- .name = "dummy pic",
- .mask = legacy_pic_uint_noop,
- .unmask = legacy_pic_uint_noop,
- .disable = legacy_pic_uint_noop,
- .mask_ack = legacy_pic_uint_noop,
-};
static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
@@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
- .chip = &dummy_pic_chip,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
@@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = {
struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
- .mask_all = mask_8259A,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
.irq_pending = i8259A_irq_pending,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18a..83ec0175f986 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
+ seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_printf(p, " IRQ work interrupts\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "%*d: ", prec, i);
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
seq_printf(p, "-%-8s", desc->name);
if (action) {
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
@@ -282,6 +282,7 @@ void fixup_irqs(void)
unsigned int irq, vector;
static int warned;
struct irq_desc *desc;
+ struct irq_data *data;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -296,7 +297,8 @@ void fixup_irqs(void)
/* interrupt's are disabled at this point */
raw_spin_lock(&desc->lock);
- affinity = desc->affinity;
+ data = &desc->irq_data;
+ affinity = data->affinity;
if (!irq_has_action(irq) ||
cpumask_equal(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
@@ -315,16 +317,16 @@ void fixup_irqs(void)
affinity = cpu_all_mask;
}
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
- desc->chip->mask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
+ data->chip->irq_mask(data);
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
+ if (data->chip->irq_set_affinity)
+ data->chip->irq_set_affinity(data, affinity, true);
else if (!(warned++))
set_affinity = 0;
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
- desc->chip->unmask(irq);
+ if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
+ data->chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
@@ -355,10 +357,10 @@ void fixup_irqs(void)
if (irr & (1 << (vector % 32))) {
irq = __get_cpu_var(vector_irq)[vector];
- desc = irq_to_desc(irq);
+ data = irq_get_irq_data(irq);
raw_spin_lock(&desc->lock);
- if (desc->chip->retrigger)
- desc->chip->retrigger(irq);
+ if (data->chip->irq_retrigger)
+ data->chip->irq_retrigger(data);
raw_spin_unlock(&desc->lock);
}
}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..96656f207751 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -17,6 +17,7 @@
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/percpu.h>
+#include <linux/mm.h>
#include <asm/apic.h>
@@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-#ifdef CONFIG_4KSTACKS
/*
* per-CPU IRQ handling contexts (thread information and stack)
*/
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(PAGE_SIZE)));
+} __attribute__((aligned(THREAD_SIZE)));
static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
-
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
@@ -129,7 +126,9 @@ void __cpuinit irq_ctx_init(int cpu)
if (per_cpu(hardirq_ctx, cpu))
return;
- irqctx = &per_cpu(hardirq_stack, cpu);
+ irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREAD_FLAGS,
+ THREAD_ORDER));
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
@@ -138,7 +137,9 @@ void __cpuinit irq_ctx_init(int cpu)
per_cpu(hardirq_ctx, cpu) = irqctx;
- irqctx = &per_cpu(softirq_stack, cpu);
+ irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREAD_FLAGS,
+ THREAD_ORDER));
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
@@ -151,11 +152,6 @@ void __cpuinit irq_ctx_init(int cpu)
cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
-void irq_ctx_exit(int cpu)
-{
- per_cpu(hardirq_ctx, cpu) = NULL;
-}
-
asmlinkage void do_softirq(void)
{
unsigned long flags;
@@ -187,11 +183,6 @@ asmlinkage void do_softirq(void)
local_irq_restore(flags);
}
-#else
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
-#endif
-
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
struct irq_desc *desc;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (!cpu_has_apic)
+ return;
+
+ apic->send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+#endif
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 990ae7cfc578..c752e973958d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
void __init init_ISA_irqs(void)
{
+ struct irq_chip *chip = legacy_pic->chip;
+ const char *name = chip->name;
int i;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +109,8 @@ void __init init_ISA_irqs(void)
#endif
legacy_pic->init(0);
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
+ for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
}
void __init init_IRQ(void)
@@ -224,9 +215,9 @@ static void __init apic_intr_init(void)
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+ /* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+ alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
# endif
#endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..961b6b30ba90
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+ char code[JUMP_LABEL_NOP_SIZE];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ union jump_code_union code;
+
+ if (type == JUMP_LABEL_ENABLE) {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ } else
+ memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+ text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+}
+
+#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
.open = setup_data_open,
+ .llseek = default_llseek,
};
static int __init
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..ec592caac4b4 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
#include <asm/system.h>
#include <asm/apic.h>
-/**
- * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- * @gdb_regs: A pointer to hold the registers in the order GDB wants.
- * @regs: The &struct pt_regs of the current process.
- *
- * Convert the pt_regs in @regs into the format for registers that
- * GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, dx) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
#endif
- gdb_regs[GDB_AX] = regs->ax;
- gdb_regs[GDB_BX] = regs->bx;
- gdb_regs[GDB_CX] = regs->cx;
- gdb_regs[GDB_DX] = regs->dx;
- gdb_regs[GDB_SI] = regs->si;
- gdb_regs[GDB_DI] = regs->di;
- gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PC] = regs->ip;
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (
#ifdef CONFIG_X86_32
- gdb_regs[GDB_PS] = regs->flags;
- gdb_regs[GDB_DS] = regs->ds;
- gdb_regs[GDB_ES] = regs->es;
- gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_FS] = 0xFFFF;
- gdb_regs[GDB_GS] = 0xFFFF;
- if (user_mode_vm(regs)) {
- gdb_regs[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
- } else {
- gdb_regs[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
}
-#else
- gdb_regs[GDB_R8] = regs->r8;
- gdb_regs[GDB_R9] = regs->r9;
- gdb_regs[GDB_R10] = regs->r10;
- gdb_regs[GDB_R11] = regs->r11;
- gdb_regs[GDB_R12] = regs->r12;
- gdb_regs[GDB_R13] = regs->r13;
- gdb_regs[GDB_R14] = regs->r14;
- gdb_regs[GDB_R15] = regs->r15;
- gdb_regs32[GDB_PS] = regs->flags;
- gdb_regs32[GDB_CS] = regs->cs;
- gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
+ switch (regno) {
+#ifdef CONFIG_X86_32
+ case GDB_SS:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = __KERNEL_DS;
+ break;
+ case GDB_SP:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = kernel_stack_pointer(regs);
+ break;
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
#endif
+ }
+ return dbg_reg_def[regno].name;
}
/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_SP] = p->thread.sp;
}
-/**
- * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- * @gdb_regs: A pointer to hold the registers we've received from GDB.
- * @regs: A pointer to a &struct pt_regs to hold these values in.
- *
- * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- * in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
- regs->ax = gdb_regs[GDB_AX];
- regs->bx = gdb_regs[GDB_BX];
- regs->cx = gdb_regs[GDB_CX];
- regs->dx = gdb_regs[GDB_DX];
- regs->si = gdb_regs[GDB_SI];
- regs->di = gdb_regs[GDB_DI];
- regs->bp = gdb_regs[GDB_BP];
- regs->ip = gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
- regs->flags = gdb_regs[GDB_PS];
- regs->ds = gdb_regs[GDB_DS];
- regs->es = gdb_regs[GDB_ES];
- regs->cs = gdb_regs[GDB_CS];
-#else
- regs->r8 = gdb_regs[GDB_R8];
- regs->r9 = gdb_regs[GDB_R9];
- regs->r10 = gdb_regs[GDB_R10];
- regs->r11 = gdb_regs[GDB_R11];
- regs->r12 = gdb_regs[GDB_R12];
- regs->r13 = gdb_regs[GDB_R13];
- regs->r14 = gdb_regs[GDB_R14];
- regs->r15 = gdb_regs[GDB_R15];
- regs->flags = gdb_regs32[GDB_PS];
- regs->cs = gdb_regs32[GDB_CS];
- regs->ss = gdb_regs32[GDB_SS];
-#endif
-}
-
static struct hw_breakpoint {
unsigned enabled;
unsigned long addr;
int len;
int type;
- struct perf_event **pev;
-} breakinfo[4];
+ struct perf_event * __percpu *pev;
+} breakinfo[HBP_NUM];
static unsigned long early_dr7;
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
{
int breakno;
- for (breakno = 0; breakno < 4; breakno++) {
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
struct perf_event *bp;
struct arch_hw_breakpoint *info;
int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (breakinfo[i].addr == addr && breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
int cpu = raw_smp_processor_id();
struct perf_event *bp;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (!breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
switch (bptype) {
@@ -389,7 +387,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
* disable hardware debugging while it is processing gdb packets or
* handling exception.
*/
-void kgdb_disable_hw_debug(struct pt_regs *regs)
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
{
int i;
int cpu = raw_smp_processor_id();
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
/* Disable hardware debugging while we are in kgdb: */
set_debugreg(0UL, 7);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
{
unsigned long addr;
char *ptr;
- int newPC;
switch (remcomInBuffer[0]) {
case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
linux_regs->ip = addr;
case 'D':
case 'k':
- newPC = linux_regs->ip;
-
/* clear the trace bit */
linux_regs->flags &= ~X86_EFLAGS_TF;
atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -482,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
raw_smp_processor_id());
}
- kgdb_correct_hw_break();
-
return 0;
}
@@ -626,7 +619,12 @@ int kgdb_arch_init(void)
static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
struct perf_sample_data *data, struct pt_regs *regs)
{
- kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
+ struct task_struct *tsk = current;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (breakinfo[i].enabled)
+ tsk->thread.debugreg6 |= (DR_TRAP0 << i);
}
void kgdb_arch_late(void)
@@ -645,11 +643,11 @@ void kgdb_arch_late(void)
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
- if (IS_ERR(breakinfo[i].pev)) {
+ if (IS_ERR((void * __force)breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
@@ -726,6 +724,7 @@ struct kgdb_arch arch_kgdb_ops = {
.flags = KGDB_HW_BREAKPOINT,
.set_hw_breakpoint = kgdb_set_hw_break,
.remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
.remove_all_hw_break = kgdb_remove_all_hw_break,
.correct_hw_break = kgdb_correct_hw_break,
};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 675879b65ce6..1cbd54c0df99 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
}
/*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
+ * Skip the prefixes of the instruction.
*/
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
+static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
{
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ while (inat_is_legacy_prefix(attr)) {
+ insn++;
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ }
#ifdef CONFIG_X86_64
- if ((*insn & 0xf0) == 0x40)
- return 1;
+ if (inat_is_rex_prefix(attr))
+ insn++;
#endif
- return 0;
+ return insn;
}
/*
@@ -224,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
return 0;
}
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
/* Check if paddr is at an instruction boundary */
static int __kprobes can_probe(unsigned long paddr)
{
@@ -235,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
return 0;
/* Decode instructions */
@@ -272,6 +275,9 @@ static int __kprobes can_probe(unsigned long paddr)
*/
static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
{
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
switch (*insn) {
case 0xfa: /* cli */
case 0xfb: /* sti */
@@ -280,13 +286,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
return 1;
}
- /*
- * on X86_64, 0x40-0x4f are REX prefixes so we need to look
- * at the next byte instead.. but of course not recurse infinitely
- */
- if (is_REX_prefix(insn))
- return is_IF_modifier(++insn);
-
return 0;
}
@@ -707,6 +706,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
struct hlist_node *node, *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -738,14 +738,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
/* another task is sharing our hash bucket */
continue;
+ orig_ret_address = (unsigned long)ri->ret_addr;
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
__get_cpu_var(current_kprobe) = &ri->rp->kp;
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
__get_cpu_var(current_kprobe) = NULL;
}
- orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
@@ -757,8 +777,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
break;
}
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
kretprobe_hash_unlock(current, &flags);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
@@ -803,9 +821,8 @@ static void __kprobes resume_execution(struct kprobe *p,
unsigned long orig_ip = (unsigned long)p->addr;
kprobe_opcode_t *insn = p->ainsn.insn;
- /*skip the REX prefix*/
- if (is_REX_prefix(insn))
- insn++;
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
switch (*insn) {
@@ -1109,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
*(unsigned long *)addr = val;
}
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
{
asm volatile (
".global optprobe_template_entry\n"
@@ -1201,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether the address range is reserved */
if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1))
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
return -EBUSY;
return len;
@@ -1249,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
unsigned long addr, size = 0, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- /* Dummy buffers for lookup_symbol_attrs */
- static char __dummy_buf[KSYM_NAME_LEN];
/* Lookup symbol including addr */
- if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
return 0;
/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..ca43ce31a19c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
- int low, high;
+ int low, high, ret;
+
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+ ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
- return native_write_msr_safe(msr_kvm_system_time, low, high);
+ return ret;
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pud = (pud_t *)page_address(page);
- memset(pud, 0, PAGE_SIZE);
+ clear_page(pud);
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
if (!page)
goto out;
pmd = (pmd_t *)page_address(page);
- memset(pmd, 0, PAGE_SIZE);
+ clear_page(pmd);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..1cca374a2bac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
* Software Developer's Manual
* Order Number 253668 or free download from:
*
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
+ .llseek = no_llseek,
};
static struct miscdevice microcode_dev = {
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a93..dcb65cc0a053 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
* Software Developer's Manual
* Order Number 253668 or free download from:
*
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
*
* For more information, go to http://www.urbanmyth.org/microcode
*
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e0bc186d7501..8f2956091735 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,11 +239,13 @@ int module_finalize(const Elf_Ehdr *hdr,
apply_paravirt(pseg, pseg + para->sh_size);
}
- return module_bug_finalize(hdr, sechdrs, me);
+ /* make jump label nops */
+ jump_label_apply_nops(me);
+
+ return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..9af64d9c4b67 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
@@ -274,6 +275,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+static void __init smp_register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -295,6 +308,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (early)
return 1;
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ smp_register_lapic_address(mpc->lapic);
+
if (mpc->oemptr)
x86_init.mpparse.smp_read_mpc_oem(mpc);
@@ -641,7 +658,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
- reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
+ memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -670,7 +687,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
+ memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
if (mpf->physptr)
smp_reserve_memory(mpf);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..c5b250011fd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
.alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d9..f56a117cef68 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
#include <asm/rio.h>
#include <asm/bios_ebda.h>
#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
return 0;
}
-void __init detect_calgary(void)
+int __init detect_calgary(void)
{
int bus;
void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
* another HW IOMMU already, bail out.
*/
if (no_iommu || iommu_detected)
- return;
+ return -ENODEV;
if (!use_calgary)
- return;
+ return -ENODEV;
if (!early_pci_allowed())
- return;
+ return -ENODEV;
printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
if (!rio_table_hdr) {
printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
"in EBDA - bailing!\n");
- return;
+ return -ENODEV;
}
ret = build_detail_arrays();
if (ret) {
printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return;
+ return -ENOMEM;
}
specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
x86_init.iommu.iommu_init = calgary_iommu_init;
}
- return;
+ return calgary_found;
cleanup:
for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
if (info->tce_space)
free_tce_table(info->tce_space);
}
+ return -ENOMEM;
}
static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
* and before device_initcall.
*/
rootfs_initcall(calgary_fixup_tce_spaces);
+
+IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..9ea999a4dcc1 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,8 +11,8 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/calgary.h>
-#include <asm/amd_iommu.h>
#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
static int forbid_dac __read_mostly;
@@ -44,6 +44,8 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
+extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+
/* Dummy device used for NULL arguments (normally ISA). */
struct device x86_dma_fallback_dev = {
.init_name = "fallback device",
@@ -129,24 +131,24 @@ static void __init dma32_free_bootmem(void)
void __init pci_iommu_alloc(void)
{
+ struct iommu_table_entry *p;
+
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
- if (pci_swiotlb_detect())
- goto out;
-
- gart_iommu_hole_init();
-
- detect_calgary();
-
- detect_intel_iommu();
+ sort_iommu_table(__iommu_table, __iommu_table_end);
+ check_iommu_entries(__iommu_table, __iommu_table_end);
- /* needs to be called after gart_iommu_hole_init */
- amd_iommu_detect();
-out:
- pci_swiotlb_init();
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && p->detect && p->detect() > 0) {
+ p->flags |= IOMMU_DETECTED;
+ if (p->early_init)
+ p->early_init();
+ if (p->flags & IOMMU_FINISH_IF_DETECTED)
+ break;
+ }
+ }
}
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@@ -289,6 +291,7 @@ EXPORT_SYMBOL(dma_supported);
static int __init pci_iommu_init(void)
{
+ struct iommu_table_entry *p;
dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
#ifdef CONFIG_PCI
@@ -296,12 +299,10 @@ static int __init pci_iommu_init(void)
#endif
x86_init.iommu.iommu_init();
- if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: "
- "Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_print_info();
- } else
- swiotlb_free();
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
+ p->late_init();
+ }
return 0;
}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa67..ba0f0ca9f280 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,8 +39,9 @@
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -560,8 +561,11 @@ static void enable_gart_translations(void)
{
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
enable_gart_translation(dev, __pa(agp_gatt_table));
}
@@ -592,16 +596,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
if (!fix_up_north_bridges)
return;
+ if (!k8_northbridges.gart_supported)
+ return;
+
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ struct pci_dev *dev = k8_northbridges.nb_misc[i];
/*
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
@@ -649,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < num_k8_northbridges; i++) {
- dev = k8_northbridges[i];
+ for (i = 0; i < k8_northbridges.num; i++) {
+ dev = k8_northbridges.nb_misc[i];
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -718,10 +725,13 @@ static void gart_iommu_shutdown(void)
if (!no_agp)
return;
- for (i = 0; i < num_k8_northbridges; i++) {
+ if (!k8_northbridges.gart_supported)
+ return;
+
+ for (i = 0; i < k8_northbridges.num; i++) {
u32 ctl;
- dev = k8_northbridges[i];
+ dev = k8_northbridges.nb_misc[i];
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -739,7 +749,7 @@ int __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (num_k8_northbridges == 0)
+ if (!k8_northbridges.gart_supported)
return 0;
#ifndef CONFIG_AGP_AMD64
@@ -896,3 +906,4 @@ void __init gart_parse_options(char *p)
}
}
}
+IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000000..55d745ec1181
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,89 @@
+#include <linux/dma-mapping.h>
+#include <asm/iommu_table.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
+
+
+#define DEBUG 1
+
+static struct iommu_table_entry * __init
+find_dependents_of(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish,
+ struct iommu_table_entry *q)
+{
+ struct iommu_table_entry *p;
+
+ if (!q)
+ return NULL;
+
+ for (p = start; p < finish; p++)
+ if (p->detect == q->depend)
+ return p;
+
+ return NULL;
+}
+
+
+void __init sort_iommu_table(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish) {
+
+ struct iommu_table_entry *p, *q, tmp;
+
+ for (p = start; p < finish; p++) {
+again:
+ q = find_dependents_of(start, finish, p);
+ /* We are bit sneaky here. We use the memory address to figure
+ * out if the node we depend on is past our point, if so, swap.
+ */
+ if (q > p) {
+ tmp = *p;
+ memmove(p, q, sizeof(*p));
+ *q = tmp;
+ goto again;
+ }
+ }
+
+}
+
+#ifdef DEBUG
+void __init check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+ struct iommu_table_entry *p, *q, *x;
+ char sym_p[KSYM_SYMBOL_LEN];
+ char sym_q[KSYM_SYMBOL_LEN];
+
+ /* Simple cyclic dependency checker. */
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(start, finish, p);
+ x = find_dependents_of(start, finish, q);
+ if (p == x) {
+ sprint_symbol(sym_p, (unsigned long)p->detect);
+ sprint_symbol(sym_q, (unsigned long)q->detect);
+
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
+ " on %s and vice-versa. BREAKING IT.\n",
+ sym_p, sym_q);
+ /* Heavy handed way..*/
+ x->depend = 0;
+ }
+ }
+
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(p, finish, p);
+ if (q && q > p) {
+ sprint_symbol(sym_p, (unsigned long)p->detect);
+ sprint_symbol(sym_q, (unsigned long)q->detect);
+
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
+ "should be called before %s!\n",
+ sym_p, sym_q);
+ }
+ }
+}
+#else
+inline void check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+}
+#endif
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d4328..8f972cbddef0 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
-
+#include <asm/xen/swiotlb-xen.h>
+#include <asm/iommu_table.h>
int swiotlb __read_mostly;
static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
};
/*
- * pci_swiotlb_detect - set swiotlb to 1 if necessary
+ * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
*
* This returns non-zero if we are forced to use swiotlb (by the boot
* option).
*/
-int __init pci_swiotlb_detect(void)
+int __init pci_swiotlb_detect_override(void)
{
int use_swiotlb = swiotlb | swiotlb_force;
+ if (swiotlb_force)
+ swiotlb = 1;
+
+ return use_swiotlb;
+}
+IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
+ pci_xen_swiotlb_detect,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+/*
+ * if 4GB or more detected (and iommu=off not set) return 1
+ * and set swiotlb to 1.
+ */
+int __init pci_swiotlb_detect_4gb(void)
+{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
- if (swiotlb_force)
- swiotlb = 1;
-
- return use_swiotlb;
+ return swiotlb;
}
+IOMMU_INIT(pci_swiotlb_detect_4gb,
+ pci_swiotlb_detect_override,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
void __init pci_swiotlb_init(void)
{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
dma_ops = &swiotlb_dma_ops;
}
}
+
+void __init pci_swiotlb_late_init(void)
+{
+ /* An IOMMU turned us off. */
+ if (!swiotlb)
+ swiotlb_free();
+ else {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ }
+}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
- u32 a, b;
- for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
- a == b;
- b = inl(pmtmr_ioport) & ACPI_PM_MASK)
- cpu_relax();
- return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
- u32 a, b;
- a = pmtimer_wait_tick();
- do {
- b = inl(pmtmr_ioport);
- cpu_relax();
- } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
- pmtmr_ioport = 0;
- return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..57d1868a86aa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
@@ -300,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread);
/*
* sys_execve() executes a new program.
*/
-long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs *regs)
+long sys_execve(const char __user *name,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp, struct pt_regs *regs)
{
long error;
char *filename;
@@ -371,7 +373,7 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -441,7 +443,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1);
+ trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -457,7 +459,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
static void mwait_idle(void)
{
if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -478,7 +480,7 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- trace_power_start(POWER_CSTATE, 0);
+ trace_power_start(POWER_CSTATE, 0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
@@ -525,44 +527,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
return (edx & MWAIT_EDX_C1);
}
-/*
- * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
- * For more information see
- * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
- * - Erratum #365 for family 0x11 (not affected because C1e not in use)
- */
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
-{
- u64 val;
- if (c->x86_vendor != X86_VENDOR_AMD)
- goto no_c1e_idle;
-
- /* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0F && c->x86_model >= 0x40)
- return 1;
-
- if (c->x86 == 0x10) {
- /*
- * check OSVW bit for CPUs that are not affected
- * by erratum #400
- */
- if (cpu_has(c, X86_FEATURE_OSVW)) {
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
- if (val >= 2) {
- rdmsrl(MSR_AMD64_OSVW_STATUS, val);
- if (!(val & BIT(1)))
- goto no_c1e_idle;
- }
- }
- return 1;
- }
-
-no_c1e_idle:
- return 0;
-}
+bool c1e_detected;
+EXPORT_SYMBOL(c1e_detected);
static cpumask_var_t c1e_mask;
-static int c1e_detected;
void c1e_remove_cpu(int cpu)
{
@@ -584,12 +552,12 @@ static void c1e_idle(void)
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
+ c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
@@ -638,7 +606,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
*/
printk(KERN_INFO "using mwait in idle threads.\n");
pm_idle = mwait_idle;
- } else if (check_c1e_idle(c)) {
+ } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+ /* E400: APIC timer interrupt does not wake up CPU from C1e */
printk(KERN_INFO "using C1E aware idle routine\n");
pm_idle = c1e_idle;
} else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..b3d7a3a04f38 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
+
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
@@ -419,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);
/* Must be after DS reload */
- unlazy_fpu(prev_p);
+ __unlazy_fpu(prev_p);
/* Make sure cpu is ready for new context */
if (preload_fpu)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..45892dc4b72a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child)
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
{
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
unsigned long tmp;
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
tmp = 0; /* Default return condition */
@@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case PTRACE_GET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_get_thread_area(child, addr,
- (struct user_desc __user *) data);
+ (struct user_desc __user *)data);
break;
case PTRACE_SET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_set_thread_area(child, addr,
- (struct user_desc __user *) data, 0);
+ (struct user_desc __user *)data, 0);
break;
#endif
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..bab3b9e6f66d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+ return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
+ shadow->tsc_shift);
}
/*
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..8bbe8c56916d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
vt8237_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
static void ati_force_hpet_resume(void)
{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..c495aa8d4815 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
}
/* we will leave sorting out the final value
when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
+ have detected BSP APIC ID or smp_num_cpu */
break;
#endif /* CONFIG_SMP */
@@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length)
CMOS_WRITE(0x00, 0x8f);
spin_unlock(&rtc_lock);
- /* Remap the kernel at virtual address zero, as well as offset zero
- from the kernel segment. This assumes the kernel segment starts at
- virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
-
/*
- * Use `swapper_pg_dir' as our page directory.
+ * Switch back to the initial page table.
*/
- load_cr3(swapper_pg_dir);
+ load_cr3(initial_page_table);
/* Write 0x1234 to absolute memory location 0x472. The BIOS reads
this on booting to tell it to "Bypass memory test (also warm
@@ -641,7 +635,7 @@ void native_machine_shutdown(void)
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
*/
- smp_send_stop();
+ stop_other_cpus();
#endif
lapic_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..21c6746338af 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
#include <linux/apm_bios.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/console.h>
#include <linux/mca.h>
@@ -83,7 +84,6 @@
#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
-#include <asm/vmi.h>
#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
@@ -102,15 +102,17 @@
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
#include <asm/percpu.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
#ifdef CONFIG_X86_64
#include <asm/numa_64.h>
#endif
#include <asm/mce.h>
+#include <asm/alternative.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -124,7 +126,6 @@ unsigned long max_pfn_mapped;
RESERVE_BRK(dmi_alloc, 65536);
#endif
-unsigned int boot_cpu_id __read_mostly;
static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
@@ -301,7 +302,7 @@ static inline void init_gbpages(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -323,17 +324,16 @@ static void __init relocate_initrd(void)
char *p, *q;
/* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
+ ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
PAGE_SIZE);
- if (ramdisk_here == -1ULL)
+ if (ramdisk_here == MEMBLOCK_ERROR)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + area_size,
- "NEW RAMDISK");
+ memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -389,7 +389,7 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- free_early(ramdisk_image, ramdisk_end);
+ memblock_x86_free_range(ramdisk_image, ramdisk_end);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
@@ -412,7 +412,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- free_early(ramdisk_image, ramdisk_end);
+ memblock_x86_free_range(ramdisk_image, ramdisk_end);
}
#else
static void __init reserve_initrd(void)
@@ -468,7 +468,7 @@ static void __init e820_reserve_setup_data(void)
e820_print_map("reserve setup_data");
}
-static void __init reserve_early_setup_data(void)
+static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_data *data;
u64 pa_data;
@@ -480,7 +480,7 @@ static void __init reserve_early_setup_data(void)
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
pa_data = data->next;
early_iounmap(data, sizeof(*data));
}
@@ -501,6 +501,7 @@ static inline unsigned long long get_total_mem(void)
return total << PAGE_SHIFT;
}
+#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
static void __init reserve_crashkernel(void)
{
unsigned long long total_mem;
@@ -518,23 +519,27 @@ static void __init reserve_crashkernel(void)
if (crash_base <= 0) {
const unsigned long long alignment = 16<<20; /* 16M */
- crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
- alignment);
- if (crash_base == -1ULL) {
+ /*
+ * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
+ */
+ crash_base = memblock_find_in_range(alignment,
+ DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
+
+ if (crash_base == MEMBLOCK_ERROR) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
} else {
unsigned long long start;
- start = find_e820_area(crash_base, ULONG_MAX, crash_size,
- 1<<20);
+ start = memblock_find_in_range(crash_base,
+ crash_base + crash_size, crash_size, 1<<20);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
- reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
+ memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -614,82 +619,10 @@ static __init void reserve_ibft_region(void)
addr = find_ibft_region(&size);
if (size)
- reserve_early_overlap_ok(addr, addr + size, "ibft");
+ memblock_x86_reserve_range(addr, addr + size, "* ibft");
}
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working around it.\n",
- d->ident);
-
- e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
- return 0;
-}
-#endif
-
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix/MSC BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
- },
- },
- /*
- * AMI BIOS with low memory corruption was found on Intel DG45ID and
- * DG45FC boards.
- * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
- * match only DMI_BOARD_NAME and see if there is more bad products
- * with this vendor.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
- },
- },
- /*
- * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
- * match on the product name.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
- },
- },
-#endif
- {}
-};
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
static void __init trim_bios_range(void)
{
@@ -697,8 +630,14 @@ static void __init trim_bios_range(void)
* A special case is the first 4Kb of memory;
* This is a BIOS owned area, not kernel ram, but generally
* not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+ E820_RAM, E820_RESERVED);
+
/*
* special case: Some BIOSen report the PC BIOS
* area (640->1Mb) as ram even though it is not.
@@ -708,6 +647,37 @@ static void __init trim_bios_range(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+static int __init parse_reservelow(char *p)
+{
+ unsigned long long size;
+
+ if (!p)
+ return -EINVAL;
+
+ size = memparse(p, &p);
+
+ if (size < 4096)
+ size = 4096;
+
+ if (size > 640*1024)
+ size = 640*1024;
+
+ reserve_low = size;
+
+ return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
+static u64 __init get_max_mapped(void)
+{
+ u64 end = max_pfn_mapped;
+
+ end <<= PAGE_SHIFT;
+
+ return end;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -725,21 +695,38 @@ void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
int k8 = 0;
+ unsigned long flags;
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
+
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
+ olpc_ofw_detect();
early_trap_init();
early_cpu_init();
early_ioremap_init();
+ setup_olpc_ofw_pgd();
+
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
edid_info = boot_params.edid_info;
@@ -776,12 +763,14 @@ void __init setup_arch(char **cmdline_p)
#endif
4)) {
efi_enabled = 1;
- efi_reserve_early();
+ efi_memblock_x86_reserve_range();
}
#endif
x86_init.oem.arch_setup();
+ resource_alloc_from_bottom = 0;
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
/* update the e820_saved too */
@@ -832,11 +821,8 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
-
/* after early param, so could get panic from serial */
- reserve_early_setup_data();
+ memblock_x86_reserve_range_setup_data();
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
@@ -857,8 +843,6 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
- dmi_check_system(bad_bios_dmi_table);
-
/*
* VMware detection requires dmi to be available, so this
* needs to be done after dmi_scan_machine, for the BP.
@@ -891,8 +875,6 @@ void __init setup_arch(char **cmdline_p)
*/
max_pfn = e820_end_of_ram_pfn();
- /* preallocate 4k for mptable mpc */
- early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
mtrr_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
@@ -914,18 +896,8 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
- setup_bios_corruption_check();
-#endif
-
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
-
- reserve_brk();
-
/*
* Find and reserve possible boot-time SMP configuration:
*/
@@ -933,6 +905,26 @@ void __init setup_arch(char **cmdline_p)
reserve_ibft_region();
+ /*
+ * Need to conclude brk, before memblock_x86_fill()
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
+ */
+ reserve_brk();
+
+ memblock.current_limit = get_max_mapped();
+ memblock_x86_fill();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
+ printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+
reserve_trampoline_memory();
#ifdef CONFIG_ACPI_SLEEP
@@ -956,6 +948,7 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
}
#endif
+ memblock.current_limit = get_max_mapped();
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -994,10 +987,7 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
-#ifndef CONFIG_NO_BOOTMEM
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-#endif
-
+ memblock_find_dma_reserve();
dma32_reserve_bootmem();
#ifdef CONFIG_KVM_CLOCK
@@ -1008,6 +998,13 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+#ifdef CONFIG_X86_32
+ /* sync back kernel address range */
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+#endif
+
tboot_probe();
#ifdef CONFIG_X86_64
@@ -1063,6 +1060,10 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.banner();
mcheck_init();
+
+ local_irq_save(flags);
+ arch_init_ideal_nop5();
+ local_irq_restore(flags);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..002b79685f73 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
static void __init pcpu_fc_free(void *ptr, size_t size)
{
-#ifdef CONFIG_NO_BOOTMEM
- u64 start = __pa(ptr);
- u64 end = start + size;
- free_early_partial(start, end);
-#else
free_bootmem(__pa(ptr), size);
-#endif
}
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -253,7 +247,7 @@ void __init setup_per_cpu_areas(void)
* Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
- if (cpu == boot_cpu_id)
+ if (!cpu)
switch_to_new_gdt(cpu);
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d801210945d6..513deac7228d 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void)
irq_exit();
}
-static void native_smp_send_stop(void)
+static void native_stop_other_cpus(int wait)
{
unsigned long flags;
- unsigned long wait;
+ unsigned long timeout;
if (reboot_force)
return;
@@ -179,9 +179,12 @@ static void native_smp_send_stop(void)
if (num_online_cpus() > 1) {
apic->send_IPI_allbutself(REBOOT_VECTOR);
- /* Don't wait longer than a second */
- wait = USEC_PER_SEC;
- while (num_online_cpus() > 1 && wait--)
+ /*
+ * Don't wait longer than a second if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
@@ -227,7 +230,7 @@ struct smp_ops smp_ops = {
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
+ .stop_other_cpus = native_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..083e99d1b7df 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
@@ -73,7 +73,6 @@
#ifdef CONFIG_X86_32
u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
#endif
/* State of each CPU */
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
+
+/*
+ * We need this for trampoline_base protection from concurrent accesses when
+ * off- and onlining cores wildly.
+ */
+static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
+
+void cpu_hotplug_driver_lock()
+{
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
+}
+
+void cpu_hotplug_driver_unlock()
+{
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+}
+
+ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
+ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
#else
static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
@@ -281,11 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused)
* fragile that we want to limit the things done here to the
* most necessary things.
*/
- vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();
+#ifdef CONFIG_X86_32
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+#endif
+
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
@@ -294,17 +317,11 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
enable_NMI_through_LVT0();
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
}
-#ifdef CONFIG_X86_32
- while (low_mappings)
- cpu_relax();
- __flush_tlb_all();
-#endif
-
/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
@@ -373,6 +390,19 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+ struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+ struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+ cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+ cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+ cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+ cpumask_set_cpu(cpu1, c2->llc_shared_map);
+ cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
void __cpuinit set_cpu_sibling_map(int cpu)
{
@@ -385,14 +415,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
struct cpuinfo_x86 *o = &cpu_data(i);
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- cpumask_set_cpu(i, cpu_sibling_mask(cpu));
- cpumask_set_cpu(cpu, cpu_sibling_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, o->llc_shared_map);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->compute_unit_id == o->compute_unit_id)
+ link_thread_siblings(cpu, i);
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ link_thread_siblings(cpu, i);
}
}
} else {
@@ -718,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
- INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
+ INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
@@ -735,12 +764,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
goto do_rest;
}
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
+ schedule_work(&c_idle.work);
+ wait_for_completion(&c_idle.done);
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +841,13 @@ do_rest:
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
+ /*
+ * Allow other tasks to run while we wait for the
+ * AP to come online. This also gives a chance
+ * for the MTRR work(triggered by the AP coming online)
+ * to be completed in the stop machine context.
+ */
+ schedule();
}
if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -894,20 +926,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-#ifdef CONFIG_X86_32
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- flush_tlb_all();
- low_mappings = 1;
-
- err = do_boot_cpu(apicid, cpu);
-
- zap_low_mappings(false);
- low_mappings = 0;
-#else
err = do_boot_cpu(apicid, cpu);
-#endif
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
@@ -1093,8 +1112,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
- enable_IR_x2apic();
- default_setup_apic_routing();
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1102,6 +1119,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
goto out;
}
+ default_setup_apic_routing();
+
preempt_disable();
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1354,7 +1373,6 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- irq_ctx_exit(raw_smp_processor_id());
c1e_remove_cpu(raw_smp_processor_id());
mb();
@@ -1367,11 +1385,88 @@ void play_dead_common(void)
local_irq_disable();
}
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ int i;
+ void *mwait_ptr;
+
+ if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+ return;
+ if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+ return;
+ if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return;
+
+ eax = CPUID_MWAIT_LEAF;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ /*
+ * This should be a memory location in a cache line which is
+ * unlikely to be touched by other processors. The actual
+ * content is immaterial as it is not actually modified in any way.
+ */
+ mwait_ptr = &current_thread_info()->flags;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ clflush(mwait_ptr);
+ __monitor(mwait_ptr, 0, 0);
+ mb();
+ __mwait(eax, 0);
+ }
+}
+
+static inline void hlt_play_dead(void)
+{
+ if (current_cpu_data.x86 >= 4)
+ wbinvd();
+
+ while (1) {
+ native_halt();
+ }
+}
+
void native_play_dead(void)
{
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- wbinvd_halt();
+
+ mwait_play_dead(); /* Only returns on failure */
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
return 0;
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void
+__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
struct stack_trace *trace = data;
+#ifdef CONFIG_FRAME_POINTER
if (!reliable)
return;
+#endif
+ if (nosched && in_sched_functions(addr))
+ return;
if (trace->skip > 0) {
trace->skip--;
return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
trace->entries[trace->nr_entries++] = addr;
}
+static void save_stack_address(void *data, unsigned long addr, int reliable)
+{
+ return __save_stack_address(data, addr, reliable, false);
+}
+
static void
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
- struct stack_trace *trace = (struct stack_trace *)data;
- if (!reliable)
- return;
- if (in_sched_functions(addr))
- return;
- if (trace->skip > 0) {
- trace->skip--;
- return;
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
+ return __save_stack_address(data, addr, reliable, true);
}
static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
-struct stack_frame {
+struct stack_frame_user {
const void __user *next_fp;
unsigned long ret_addr;
};
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
{
int ret;
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
trace->entries[trace->nr_entries++] = regs->ip;
while (trace->nr_entries < trace->max_entries) {
- struct stack_frame frame;
+ struct stack_frame_user frame;
frame.next_fp = NULL;
frame.ret_addr = 0;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 196552bb412c..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -28,11 +28,13 @@
* Do a system call from kernel instead of calling sys_execve so we
* end up with proper pt_regs.
*/
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+ const char *const argv[],
+ const char *const envp[])
{
long __res;
- asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+ asm volatile ("int $0x80"
: "=a" (__res)
- : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
+ : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
return __res;
}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long sys_fanotify_init
+ .long sys_fanotify_mark
+ .long sys_prlimit64 /* 340 */
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..a375616d77f7 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,7 +1,8 @@
#include <linux/io.h>
+#include <linux/memblock.h>
#include <asm/trampoline.h>
-#include <asm/e820.h>
+#include <asm/pgtable.h>
#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
#define __trampinit
@@ -16,15 +17,15 @@ unsigned char *__trampinitdata trampoline_base;
void __init reserve_trampoline_memory(void)
{
- unsigned long mem;
+ phys_addr_t mem;
/* Has to be in very low memory so we can execute real-mode AP code. */
- mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
- if (mem == -1L)
+ mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
panic("Cannot allocate trampoline\n");
trampoline_base = __va(mem);
- reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
+ memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
}
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..cb838ca42c96 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
+
#ifdef CONFIG_X86_LOCAL_APIC
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+
+#ifndef CONFIG_LOCKUP_DETECTOR
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, cpu))
+#endif /* !CONFIG_LOCKUP_DETECTOR */
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
@@ -568,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (regs->flags & X86_VM_MASK) {
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
+ preempt_conditional_cli(regs);
return;
}
@@ -769,21 +777,10 @@ asmlinkage void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
@@ -791,12 +788,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+ return;
}
-#else
- math_state_restore();
+#endif
+ math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+ conditional_sti(regs);
#endif
}
@@ -874,18 +871,6 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
- }
-
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..0c40d8b72416 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int no_sched_irq_time;
+
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
return 1;
}
@@ -626,6 +630,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
local_irq_restore(flags);
}
+static unsigned long long cyc2ns_suspend;
+
+void save_sched_clock_state(void)
+{
+ if (!sched_clock_stable)
+ return;
+
+ cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, TSC gets reset in some the
+ * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to
+ * arbitrary value (still sync'd across cpu's) during resume from such sleep
+ * states. To cope up with this, recompute the cyc2ns_offset for each cpu so
+ * that sched_clock() continues from the point where it was left off during
+ * suspend.
+ */
+void restore_sched_clock_state(void)
+{
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!sched_clock_stable)
+ return;
+
+ local_irq_save(flags);
+
+ __get_cpu_var(cyc2ns_offset) = 0;
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cyc2ns_offset, cpu) = offset;
+
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -751,7 +793,6 @@ static struct clocksource clocksource_tsc = {
.read = read_tsc,
.resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
@@ -764,6 +805,7 @@ void mark_tsc_unstable(char *reason)
if (!tsc_unstable) {
tsc_unstable = 1;
sched_clock_stable = 0;
+ disable_sched_clock_irqtime();
printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -845,8 +887,6 @@ __cpuinit int unsynchronized_tsc(void)
static void __init init_tsc_clocksource(void)
{
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
@@ -854,63 +894,9 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register(&clocksource_tsc);
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-
void __init tsc_init(void)
{
u64 lpj;
@@ -929,10 +915,6 @@ void __init tsc_init(void)
return;
}
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calibrate_cpu();
-
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
@@ -952,6 +934,9 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
lpj = ((u64)tsc_khz * 1000);
do_div(lpj, HZ);
lpj_fine = lpj;
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
*/
#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
verify_cpu:
pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
je verify_cpu_sse_ok
test %di,%di
jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $0xc0010015,%ecx # HWCR
+ movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..61fb98519622 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -551,8 +551,14 @@ cannot_handle:
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
- if ((trapno == 3) || (trapno == 1))
- return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+ if ((trapno == 3) || (trapno == 1)) {
+ KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
+ /* setting this flag forces the code in entry_32.S to
+ call save_v86_state() and change the stack pointer
+ to KVM86->regs32 */
+ set_thread_flag(TIF_IRET);
+ return 0;
+ }
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
return 0;
}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend, doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run... anyways, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub. Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- /*
- * Prevent page tables from being allocated in highmem, even if
- * CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi [deprecated]";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
- x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
- x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- x86_platform.calibrate_tsc = vmi_tsc_khz;
- x86_platform.get_wallclock = vmi_get_wallclock;
- x86_platform.set_wallclock = vmi_set_wallclock;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, basil */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
- return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, set_next_event interface only passes relative
- * expiry, but we want absolute expiry. It'd be better if were
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32-bits. Choosing
- * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * as clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..e03530aebfd0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -242,6 +242,12 @@ SECTIONS
__x86_cpu_dev_end = .;
}
+ /*
+ * start address and size of operations which during runtime
+ * can be patched with virtualization friendly instructions or
+ * baremetal native ones. Think page table operations.
+ * Details in paravirt_types.h
+ */
. = ALIGN(8);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
__parainstructions = .;
@@ -249,6 +255,11 @@ SECTIONS
__parainstructions_end = .;
}
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
. = ALIGN(8);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
__alt_instructions = .;
@@ -256,11 +267,28 @@ SECTIONS
__alt_instructions_end = .;
}
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ */
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
/*
+ * struct iommu_table_entry entries are injected in this section.
+ * It is an array of IOMMUs which during run time gets sorted depending
+ * on its dependency order. After rootfs_initcall is complete
+ * this section can be safely removed.
+ */
+ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
+ __iommu_table = .;
+ *(.iommu_table)
+ __iommu_table_end = .;
+ }
+ . = ALIGN(8);
+ /*
* .exit.text is discard at runtime, not link time, to deal with
* references from .altinstructions and .eh_frame
*/
@@ -273,7 +301,7 @@ SECTIONS
}
#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU(PAGE_SIZE)
+ PERCPU(THREAD_SIZE)
#endif
. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
- u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+ struct clocksource *clock, u32 mult)
{
unsigned long flags;
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
vsyscall_gtod_data.clock.shift = clock->shift;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
- struct timeval tv;
+ unsigned seq;
time_t result;
if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);
- vgettimeofday(&tv, NULL);
- result = tv.tv_sec;
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ result = __vsyscall_gtod_data.wall_time_sec;
+
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
if (t)
*t = result;
return result;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3eca..ceb2911aa439 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -6,10 +6,12 @@
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/module.h>
+#include <linux/pci.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
#include <asm/pci_x86.h>
+#include <asm/pci.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
#include <asm/apic.h>
@@ -99,3 +101,8 @@ struct x86_platform_ops x86_platform = {
};
EXPORT_SYMBOL_GPL(x86_platform);
+struct x86_msi_ops x86_msi = {
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24a..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
*/
u64 pcntxt_mask;
+/*
+ * Represents init state for the supported extended state.
+ */
+static struct xsave_struct *init_xstate_buf;
+
struct _fpx_sw_bytes fx_sw_reserved;
#ifdef CONFIG_IA32_EMULATION
struct _fpx_sw_bytes fx_sw_reserved_ia32;
#endif
+static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+
+/*
+ * If a processor implementation discern that a processor state component is
+ * in its initialized state it may modify the corresponding bit in the
+ * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
+ * layout in the case of xsaveopt. While presenting the xstate information to
+ * the user, we always ensure that the memory layout of a feature will be in
+ * the init state if the corresponding header bit is zero. This is to ensure
+ * that the user doesn't see some stale state in the memory layout during
+ * signal handling, debugging etc.
+ */
+void __sanitize_i387_state(struct task_struct *tsk)
+{
+ u64 xstate_bv;
+ int feature_bit = 0x2;
+ struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+
+ if (!fx)
+ return;
+
+ BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+
+ xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
+
+ /*
+ * None of the feature bits are in init state. So nothing else
+ * to do for us, as the memory layout is upto date.
+ */
+ if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
+ return;
+
+ /*
+ * FP is in init state
+ */
+ if (!(xstate_bv & XSTATE_FP)) {
+ fx->cwd = 0x37f;
+ fx->swd = 0;
+ fx->twd = 0;
+ fx->fop = 0;
+ fx->rip = 0;
+ fx->rdp = 0;
+ memset(&fx->st_space[0], 0, 128);
+ }
+
+ /*
+ * SSE is in init state
+ */
+ if (!(xstate_bv & XSTATE_SSE))
+ memset(&fx->xmm_space[0], 0, 256);
+
+ xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
+
+ /*
+ * Update all the other memory layouts for which the corresponding
+ * header bit is in the init state.
+ */
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ int offset = xstate_offsets[feature_bit];
+ int size = xstate_sizes[feature_bit];
+
+ memcpy(((void *) fx) + offset,
+ ((void *) init_xstate_buf) + offset,
+ size);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+}
+
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
sizeof(struct _fpx_sw_bytes));
-
if (err)
- return err;
+ return -EFAULT;
/*
* First Magic check failed.
*/
if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -1;
+ return -EINVAL;
/*
* Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
if (fx_sw_user->xstate_size < min_xstate_size ||
fx_sw_user->xstate_size > xstate_size ||
fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -1;
+ return -EINVAL;
err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
fx_sw_user->extended_size -
FP_XSTATE_MAGIC2_SIZE));
+ if (err)
+ return err;
/*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (err || magic2 != FP_XSTATE_MAGIC2)
- return -1;
+ if (magic2 != FP_XSTATE_MAGIC2)
+ return -EFAULT;
return 0;
}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
return 0;
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- /*
- * Start with clearing the user buffer. This will present a
- * clean context for the bytes not touched by the fxsave/xsave.
- */
- err = __clear_user(buf, sig_xstate_size);
- if (err)
- return err;
-
if (use_xsave())
err = xsave_user(buf);
else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
} else {
+ sanitize_i387_state(tsk);
if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
xstate_size))
return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
* init the state skipped by the user.
*/
mask = pcntxt_mask & ~mask;
-
- xrstor_state(init_xstate_buf, mask);
+ if (unlikely(mask))
+ xrstor_state(init_xstate_buf, mask);
return 0;
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
#endif
}
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
#ifdef CONFIG_X86_64
unsigned int sig_xstate_size = sizeof(struct _fpstate);
#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
/*
* Enable the extended processor state save/restore feature
*/
-void __cpuinit xsave_init(void)
+static inline void xstate_enable(void)
{
- if (!cpu_has_xsave)
- return;
-
set_in_cr4(X86_CR4_OSXSAVE);
-
- /*
- * Enable all the features that the HW is capable of
- * and the Linux kernel is aware of.
- */
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}
/*
+ * Record the offsets and sizes of different state managed by the xsave
+ * memory layout.
+ */
+static void __init setup_xstate_features(void)
+{
+ int eax, ebx, ecx, edx, leaf = 0x2;
+
+ xstate_features = fls64(pcntxt_mask);
+ xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
+ xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
+
+ do {
+ cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+
+ if (eax == 0)
+ break;
+
+ xstate_offsets[leaf] = ebx;
+ xstate_sizes[leaf] = eax;
+
+ leaf++;
+ } while (1);
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_xstate_init(void)
{
+ setup_xstate_features();
+
+ /*
+ * Setup init_xstate_buf to represent the init state of
+ * all the features managed by the xsave
+ */
init_xstate_buf = alloc_bootmem(xstate_size);
init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+
+ clts();
+ /*
+ * Init all the features state with header_bv being 0x0
+ */
+ xrstor_state(init_xstate_buf, -1);
+ /*
+ * Dump the init state again. This is to identify the init state
+ * of any feature which is not represented by all zero's.
+ */
+ xsave_state(init_xstate_buf, -1);
+ stts();
}
/*
* Enable and initialize the xsave feature.
*/
-void __ref xsave_cntxt_init(void)
+static void __init xstate_enable_boot_cpu(void)
{
unsigned int eax, ebx, ecx, edx;
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+ WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
+ return;
+ }
+
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
pcntxt_mask = eax + ((u64)edx << 32);
if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
* Support only the state known to OS.
*/
pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
- xsave_init();
+
+ xstate_enable();
/*
* Recompute the context size for enabled features
*/
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xstate_size = ebx;
update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
"cntxt size 0x%x\n",
pcntxt_mask, xstate_size);
}
+
+/*
+ * For the very first instance, this calls xstate_enable_boot_cpu();
+ * for all subsequent instances, this calls xstate_enable().
+ *
+ * This is somewhat obfuscated due to the lack of powerful enough
+ * overrides for the section checks.
+ */
+void __cpuinit xsave_init(void)
+{
+ static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
+ void (*this_func)(void);
+
+ if (!cpu_has_xsave)
+ return;
+
+ this_func = next_func;
+ next_func = xstate_enable;
+ this_func();
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds a R/W kVM module parameter 'mmu_audit', which allows
+ audit KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
@@ -50,13 +51,13 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstAcc (4<<1) /* Destination Accumulator */
#define DstDI (5<<1) /* Destination is in ES:(E)DI */
#define DstMem64 (6<<1) /* 64bit memory operand */
+#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
#define DstMask (7<<1)
/* Source operand type. */
#define SrcNone (0<<4) /* No source operand. */
-#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
#define SrcReg (1<<4) /* Register operand. */
#define SrcMem (2<<4) /* Memory operand. */
#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -67,6 +68,10 @@
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
+#define SrcAcc (0xd<<4) /* Source Accumulator */
+#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -78,8 +83,10 @@
#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
-#define GroupMask 0xff /* Group number stored in bits 0:7 */
/* Misc flags */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
@@ -88,289 +95,30 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
-#define Src2Imm16 (4<<29)
-#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
- in memory and second argument is located
- immediately after the first one in memory. */
+#define Src2Imm (4<<29)
#define Src2Mask (7<<29)
-enum {
- Group1_80, Group1_81, Group1_82, Group1_83,
- Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
- Group8, Group9,
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+ u32 flags;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ struct opcode *group;
+ struct group_dual *gdual;
+ } u;
};
-static u32 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- 0, 0,
- /* 0x40 - 0x47 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x48 - 0x4F */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x50 - 0x57 */
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- /* 0x58 - 0x5F */
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- /* 0x60 - 0x67 */
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
- SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x78 - 0x7F */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x80 - 0x87 */
- Group | Group1_80, Group | Group1_81,
- Group | Group1_82, Group | Group1_83,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
- DstReg | SrcMem | ModRM | Mov, Group | Group1A,
- /* 0x90 - 0x97 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16 | No64, 0,
- ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
- ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
- ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
- ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
- ByteOp | DstDI | String, DstDI | String,
- /* 0xB0 - 0xB7 */
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- /* 0xB8 - 0xBF */
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps | Stack, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack,
- ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- /* 0xE8 - 0xEF */
- SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
- /* 0xF8 - 0xFF */
- ImplicitOps, 0, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
-};
-
-static u32 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0,
- 0, ImplicitOps, ImplicitOps | Priv, 0,
- ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
- 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
- ImplicitOps, ImplicitOps | Priv, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM, 0, 0,
- /* 0xA8 - 0xAF */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM,
- ModRM, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0,
- Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
- 0, 0, 0, Group | GroupDual | Group9,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-static u32 group_table[] = {
- [Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM,
- [Group1_81*8] =
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM,
- [Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64,
- [Group1_83*8] =
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM,
- [Group1A*8] =
- DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
- [Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, 0,
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group3*8] =
- DstMem | SrcImm | ModRM, 0,
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0, 0, 0,
- [Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
- SrcMem | ModRM | Stack, 0,
- [Group7*8] =
- 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
- [Group8*8] =
- 0, 0, 0, 0,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
- [Group9*8] =
- 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
-};
-
-static u32 group2_table[] = {
- [Group7*8] =
- SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, 0,
- [Group9*8] =
- 0, 0, 0, 0, 0, 0, 0, 0,
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
};
/* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
#define EFLG_PF (1<<2)
#define EFLG_CF (1<<0)
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+#define EFLG_RESERVED_ONE_MASK 2
+
/*
* Instruction emulation:
* Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
#define ON64(x)
#endif
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "4", "2") \
_op _suffix " %"_x"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
+ : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
"=&r" (_tmp) \
: _y ((_src).val), "i" (EFLAGS_MASK)); \
} while (0)
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
\
switch ((_dst).bytes) { \
case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
break; \
case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
break; \
case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
break; \
} \
} while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
unsigned long _tmp; \
switch ((_dst).bytes) { \
case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
} \
} while (0)
+#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "1") \
+ _op _suffix " %5; " \
+ _POST_EFLAGS("0", "4", "1") \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "1") \
+ "1: \n\t" \
+ _op _suffix " %6; " \
+ "2: \n\t" \
+ _POST_EFLAGS("0", "5", "1") \
+ ".pushsection .fixup,\"ax\" \n\t" \
+ "3: movb $1, %4 \n\t" \
+ "jmp 2b \n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=m" (_eflags), "=&r" (_tmp), \
+ "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
+ : "i" (EFLAGS_MASK), "m" ((_src).val), \
+ "a" (_rax), "d" (_rdx)); \
+ } while (0)
+
+/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
+ case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
+ case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+ } \
+ } while (0)
+
+#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "b", _ex); \
+ break; \
+ case 2: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "w", _ex); \
+ break; \
+ case 4: \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "l", _ex); \
+ break; \
+ case 8: ON64( \
+ __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
+ _eflags, "q", _ex)); \
+ break; \
+ } \
+ } while (0)
+
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
@@ -576,6 +395,13 @@ static u32 group2_table[] = {
(_type)_x; \
})
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+})
+
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +443,69 @@ static void set_seg_override(struct decode_cache *c, int seg)
c->seg_override = seg;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+ return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, c->seg_override);
+ return seg_base(ctxt, ops, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_SS);
+}
+
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ ctxt->exception = vec;
+ ctxt->error_code = error;
+ ctxt->error_code_valid = valid;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_exception(ctxt, PF_VECTOR, 0, true);
+}
+
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_exception(ctxt, UD_VECTOR, 0, false);
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
- return seg_base(ctxt, VCPU_SREG_ES);
+ emulate_exception(ctxt, TS_VECTOR, err, true);
}
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
{
- return seg_base(ctxt, VCPU_SREG_SS);
+ emulate_exception(ctxt, DE_VECTOR, 0, false);
+ return X86EMUL_PROPAGATE_FAULT;
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -700,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- void *ptr,
+ ulong addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -708,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu, NULL);
+ rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu, NULL);
+ rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
return rc;
}
@@ -752,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
return (!!rc ^ (condition & 1));
}
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+}
+
static void decode_register_operand(struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
@@ -763,34 +643,25 @@ static void decode_register_operand(struct operand *op,
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->ptr = decode_register(reg, c->regs, highbyte_regs);
- op->val = *(u8 *)op->ptr;
+ op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
op->bytes = 1;
} else {
- op->ptr = decode_register(reg, c->regs, 0);
+ op->addr.reg = decode_register(reg, c->regs, 0);
op->bytes = c->op_bytes;
- switch (op->bytes) {
- case 2:
- op->val = *(u16 *)op->ptr;
- break;
- case 4:
- op->val = *(u32 *)op->ptr;
- break;
- case 8:
- op->val = *(u64 *) op->ptr;
- break;
- }
}
+ fetch_register_operand(op);
op->orig_val = op->val;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -802,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
c->modrm_mod |= (c->modrm & 0xc0) >> 6;
c->modrm_reg |= (c->modrm & 0x38) >> 3;
c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_ea = 0;
- c->use_modrm_ea = 1;
+ c->modrm_seg = VCPU_SREG_DS;
if (c->modrm_mod == 3) {
- c->modrm_ptr = decode_register(c->modrm_rm,
+ op->type = OP_REG;
+ op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ op->addr.reg = decode_register(c->modrm_rm,
c->regs, c->d & ByteOp);
- c->modrm_val = *(unsigned long *)c->modrm_ptr;
+ fetch_register_operand(op);
return rc;
}
+ op->type = OP_MEM;
+
if (c->ad_bytes == 2) {
unsigned bx = c->regs[VCPU_REGS_RBX];
unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -822,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 6)
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, 2, c->eip);
break;
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, 2, c->eip);
break;
}
switch (c->modrm_rm) {
case 0:
- c->modrm_ea += bx + si;
+ modrm_ea += bx + si;
break;
case 1:
- c->modrm_ea += bx + di;
+ modrm_ea += bx + di;
break;
case 2:
- c->modrm_ea += bp + si;
+ modrm_ea += bp + si;
break;
case 3:
- c->modrm_ea += bp + di;
+ modrm_ea += bp + di;
break;
case 4:
- c->modrm_ea += si;
+ modrm_ea += si;
break;
case 5:
- c->modrm_ea += di;
+ modrm_ea += di;
break;
case 6:
if (c->modrm_mod != 0)
- c->modrm_ea += bp;
+ modrm_ea += bp;
break;
case 7:
- c->modrm_ea += bx;
+ modrm_ea += bx;
break;
}
if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
(c->modrm_rm == 6 && c->modrm_mod != 0))
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_SS);
- c->modrm_ea = (u16)c->modrm_ea;
+ c->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
if ((c->modrm_rm & 7) == 4) {
@@ -872,395 +745,105 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
scale = sib >> 6;
if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
else
- c->modrm_ea += c->regs[base_reg];
+ modrm_ea += c->regs[base_reg];
if (index_reg != 4)
- c->modrm_ea += c->regs[index_reg] << scale;
+ modrm_ea += c->regs[index_reg] << scale;
} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
c->rip_relative = 1;
} else
- c->modrm_ea += c->regs[c->modrm_rm];
+ modrm_ea += c->regs[c->modrm_rm];
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 5)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
break;
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, c->eip);
break;
}
}
+ op->addr.mem = modrm_ea;
done:
return rc;
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
+ op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- c->modrm_ea = insn_fetch(u16, 2, c->eip);
+ op->addr.mem = insn_fetch(u16, 2, c->eip);
break;
case 4:
- c->modrm_ea = insn_fetch(u32, 4, c->eip);
+ op->addr.mem = insn_fetch(u32, 4, c->eip);
break;
case 8:
- c->modrm_ea = insn_fetch(u64, 8, c->eip);
+ op->addr.mem = insn_fetch(u64, 8, c->eip);
break;
}
done:
return rc;
}
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static void fetch_bit_operand(struct decode_cache *c)
{
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, group;
+ long sv = 0, mask;
+ if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
+ mask = ~(c->dst.bytes * 8 - 1);
- /* we cannot decode insn before we complete previous rep insn */
- WARN_ON(ctxt->restart);
+ if (c->src.bytes == 2)
+ sv = (s16)c->src.val & (s16)mask;
+ else if (c->src.bytes == 4)
+ sv = (s32)c->src.val & (s32)mask;
- /* Shadow copy of register state. Committed on successful emulation. */
- memset(c, 0, sizeof(struct decode_cache));
- c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
- ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
- case 0x66: /* operand-size override */
- /* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- c->rex_prefix = c->b;
- continue;
- case 0xf0: /* LOCK */
- c->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- c->rex_prefix = 0;
- }
-
-done_prefixes:
-
- /* REX prefix. */
- if (c->rex_prefix)
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
-
- /* Opcode byte(s). */
- c->d = opcode_table[c->b];
- if (c->d == 0) {
- /* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- c->d = twobyte_table[c->b];
- }
- }
-
- if (c->d & Group) {
- group = c->d & GroupMask;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- group = (group << 3) + ((c->modrm >> 3) & 7);
- if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
- c->d = group2_table[group];
- else
- c->d = group_table[group];
- }
-
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
+ c->dst.addr.mem += (sv >> 3);
}
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
-
- /* ModRM and SIB bytes. */
- if (c->d & ModRM)
- rc = decode_modrm(ctxt, ops);
- else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
-
- if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, c);
-
- if (c->ad_bytes != 8)
- c->modrm_ea = (u32)c->modrm_ea;
-
- if (c->rip_relative)
- c->modrm_ea += c->eip;
+ /* only subword offset */
+ c->src.val &= (c->dst.bytes << 3) - 1;
+}
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (c->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(&c->src, c, 0);
- break;
- case SrcMem16:
- c->src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- c->src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- c->src.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->src.type = OP_REG;
- c->src.val = c->modrm_val;
- c->src.ptr = c->modrm_ptr;
- break;
- }
- c->src.type = OP_MEM;
- c->src.ptr = (unsigned long *)c->modrm_ea;
- c->src.val = 0;
- break;
- case SrcImm:
- case SrcImmU:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
- if ((c->d & SrcMask) == SrcImmU) {
- switch (c->src.bytes) {
- case 1:
- c->src.val &= 0xff;
- break;
- case 2:
- c->src.val &= 0xffff;
- break;
- case 4:
- c->src.val &= 0xffffffff;
- break;
- }
- }
- break;
- case SrcImmByte:
- case SrcImmUByte:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = 1;
- if ((c->d & SrcMask) == SrcImmByte)
- c->src.val = insn_fetch(s8, 1, c->eip);
- else
- c->src.val = insn_fetch(u8, 1, c->eip);
- break;
- case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
- break;
- case SrcSI:
- c->src.type = OP_MEM;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)
- register_address(c, seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]);
- c->src.val = 0;
- break;
- }
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->decode.mem_read;
+ u32 err;
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
- */
- switch (c->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 1;
- c->src2.val = insn_fetch(u8, 1, c->eip);
- break;
- case Src2Imm16:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 2;
- c->src2.val = insn_fetch(u16, 2, c->eip);
- break;
- case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
- break;
- case Src2Mem16:
- c->src2.type = OP_MEM;
- c->src2.bytes = 2;
- c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
- c->src2.val = 0;
- break;
- }
+ while (size) {
+ int n = min(size, 8u);
+ size -= n;
+ if (mc->pos < mc->end)
+ goto read_cached;
- /* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- return 0;
- case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
- break;
- case DstMem:
- case DstMem64:
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.type = OP_REG;
- c->dst.val = c->dst.orig_val = c->modrm_val;
- c->dst.ptr = c->modrm_ptr;
- break;
- }
- c->dst.type = OP_MEM;
- c->dst.ptr = (unsigned long *)c->modrm_ea;
- if ((c->d & DstMask) == DstMem64)
- c->dst.bytes = 8;
- else
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
+ rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ mc->end += n;
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- break;
- case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->dst.bytes) {
- case 1:
- c->dst.val = *(u8 *)c->dst.ptr;
- break;
- case 2:
- c->dst.val = *(u16 *)c->dst.ptr;
- break;
- case 4:
- c->dst.val = *(u32 *)c->dst.ptr;
- break;
- case 8:
- c->dst.val = *(u64 *)c->dst.ptr;
- break;
- }
- c->dst.orig_val = c->dst.val;
- break;
- case DstDI:
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)
- register_address(c, es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- c->dst.val = 0;
- break;
+ read_cached:
+ memcpy(dest, mc->data + mc->pos, n);
+ mc->pos += n;
+ dest += n;
+ addr += n;
}
-
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return X86EMUL_CONTINUE;
}
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1330,13 +913,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -1355,14 +938,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -1481,11 +1064,72 @@ load:
ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
return X86EMUL_CONTINUE;
exception:
- kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+ emulate_exception(ctxt, err_vec, err_code, true);
return X86EMUL_PROPAGATE_FAULT;
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static void write_register_operand(struct operand *op)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (op->bytes) {
+ case 1:
+ *(u8 *)op->addr.reg = (u8)op->val;
+ break;
+ case 2:
+ *(u16 *)op->addr.reg = (u16)op->val;
+ break;
+ case 4:
+ *op->addr.reg = (u32)op->val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *op->addr.reg = op->val;
+ break;
+ }
+}
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+ u32 err;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ write_register_operand(&c->dst);
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ c->dst.addr.mem,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ c->dst.addr.mem,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
@@ -1493,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
+ c->regs[VCPU_REGS_RSP]);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1504,9 +1148,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_emulated(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- dest, len, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+ c->regs[VCPU_REGS_RSP]),
+ dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1541,7 +1185,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
break;
case X86EMUL_MODE_VM86:
if (iopl < 3) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
change_mask |= EFLG_IF;
@@ -1554,18 +1198,20 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt);
+
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment segment;
- kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+ c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
- c->src.val = segment.selector;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1229,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
+
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
++reg;
}
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+
+ return rc;
}
static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1620,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
return rc;
}
+int emulate_int_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+ u32 err;
+
+ /* TODO: Add limit checks */
+ c->src.val = ctxt->eflags;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+
+ c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = c->eip;
+ emulate_push(ctxt, ops);
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ ops->get_idt(&dt, ctxt->vcpu);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = eip;
+
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_int_real(ctxt, ops, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
+ EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
+ EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
+ unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = temp_eip;
+
+
+ if (c->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (c->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+
+ return rc;
+}
+
+static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops* ops)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt, ops);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
@@ -1661,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
+ unsigned long *rax = &c->regs[VCPU_REGS_RAX];
+ unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+ u8 de = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
@@ -1672,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
case 3: /* neg */
emulate_1op("neg", c->dst, ctxt->eflags);
break;
+ case 4: /* mul */
+ emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 5: /* imul */
+ emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+ break;
+ case 6: /* div */
+ emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
+ case 7: /* idiv */
+ emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+ ctxt->eflags, de);
+ break;
default:
- return 0;
+ return X86EMUL_UNHANDLEABLE;
}
- return 1;
+ if (de)
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1516,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 4: /* jmp abs */
c->eip = c->src.val;
break;
case 6: /* push */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
return X86EMUL_CONTINUE;
@@ -1712,17 +1533,16 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- u64 old = c->dst.orig_val;
+ u64 old = c->dst.orig_val64;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-
c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
} else {
- c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
- (u32) c->regs[VCPU_REGS_RBX];
+ c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ (u32) c->regs[VCPU_REGS_RBX];
ctxt->eflags |= EFLG_ZF;
}
@@ -1748,145 +1568,99 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
- int rc;
struct decode_cache *c = &ctxt->decode;
+ unsigned short sel;
+ int rc;
- switch (c->dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
- }
- return X86EMUL_CONTINUE;
-}
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
- /*
- * an sti; sti; sequence only disable interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- ctxt->interruptibility = mask;
+ rc = load_segment_descriptor(ctxt, ops, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.val = c->src.val;
+ return rc;
}
static inline void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct kvm_segment *cs, struct kvm_segment *ss)
+ struct x86_emulate_ops *ops, struct desc_struct *cs,
+ struct desc_struct *ss)
{
- memset(cs, 0, sizeof(struct kvm_segment));
- kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct kvm_segment));
+ memset(cs, 0, sizeof(struct desc_struct));
+ ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+ memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
- cs->base = 0; /* flat segment */
+ set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
- cs->limit = 0xffffffff; /* 4GB limit */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
cs->type = 0x0b; /* Read, Execute, Accessed */
cs->s = 1;
cs->dpl = 0; /* will be adjusted later */
- cs->present = 1;
- cs->db = 1;
+ cs->p = 1;
+ cs->d = 1;
- ss->unusable = 0;
- ss->base = 0; /* flat segment */
- ss->limit = 0xffffffff; /* 4GB limit */
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->g = 1; /* 4kb granularity */
ss->s = 1;
ss->type = 0x03; /* Read/Write, Accessed */
- ss->db = 1; /* 32bit stack segment */
+ ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
- ss->present = 1;
+ ss->p = 1;
}
static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
msr_data >>= 32;
- cs.selector = (u16)(msr_data & 0xfffc);
- ss.selector = (u16)(msr_data + 8);
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
if (is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->regs[VCPU_REGS_RCX] = c->eip;
if (is_long_mode(ctxt->vcpu)) {
#ifdef CONFIG_X86_64
c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- kvm_x86_ops->get_msr(ctxt->vcpu,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
c->eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1670,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
}
static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -1912,67 +1687,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
}
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs.selector = (u16)msr_data;
- cs.selector &= ~SELECTOR_RPL_MASK;
- ss.selector = cs.selector + 8;
- ss.selector &= ~SELECTOR_RPL_MASK;
+ cs_sel = (u16)msr_data;
+ cs_sel &= ~SELECTOR_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ ss_sel &= ~SELECTOR_RPL_MASK;
if (ctxt->mode == X86EMUL_MODE_PROT64
|| is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
}
static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
int usermode;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
if ((c->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +1759,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.dpl = 3;
ss.dpl = 3;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
- cs.selector = (u16)(msr_data + 16);
+ cs_sel = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = (u16)(msr_data + 24);
+ ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
- cs.selector = (u16)(msr_data + 32);
+ cs_sel = (u16)(msr_data + 32);
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = cs.selector + 8;
- cs.db = 0;
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
cs.l = 1;
break;
}
- cs.selector |= SELECTOR_RPL_MASK;
- ss.selector |= SELECTOR_RPL_MASK;
+ cs_sel |= SELECTOR_RPL_MASK;
+ ss_sel |= SELECTOR_RPL_MASK;
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+ c->eip = c->regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
return X86EMUL_CONTINUE;
}
@@ -2030,25 +1810,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- struct kvm_segment tr_seg;
+ struct desc_struct tr_seg;
int r;
u16 io_bitmap_ptr;
u8 perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
- kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
- if (tr_seg.unusable)
+ ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+ if (!tr_seg.p)
return false;
- if (tr_seg.limit < 103)
+ if (desc_limit_scaled(&tr_seg) < 103)
return false;
- r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
- NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+ ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
- if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
- ctxt->vcpu, NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+ &perm, 1, ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -2060,21 +1840,16 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
+ if (ctxt->perm_ok)
+ return true;
+
if (emulator_bad_iopl(ctxt, ops))
if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
return false;
- return true;
-}
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- int seg)
-{
- struct desc_struct desc;
- if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
- return get_desc_base(&desc);
- else
- return ~0;
+ ctxt->perm_ok = true;
+
+ return true;
}
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
@@ -2165,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2175,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2183,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2196,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
}
@@ -2238,7 +2013,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- ops->set_cr(3, tss->cr3, ctxt->vcpu);
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2314,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2322,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
@@ -2335,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt);
return ret;
}
}
@@ -2352,7 +2130,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
int ret;
u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
ulong old_tss_base =
- get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
u32 desc_limit;
/* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2147,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
}
@@ -2378,8 +2156,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
desc_limit < 0x2b)) {
- kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
- tss_selector & 0xfffc);
+ emulate_ts(ctxt, tss_selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2425,32 +2202,30 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
c->lock_prefix = 0;
c->src.val = (unsigned long) error_code;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
return ret;
}
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
int rc;
- memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->eip;
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
c->dst.type = OP_NONE;
rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
has_error_code, error_code);
if (rc == X86EMUL_CONTINUE) {
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
rc = writeback(ctxt, ops);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = c->eip;
}
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2463,86 +2238,828 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
+ op->addr.mem = register_address(c, base, c->regs[reg]);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_push(ctxt, ctxt->ops);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = c->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ c->dst.val = al;
+ /* Set PF, ZF, SF */
+ c->src.type = OP_IMM;
+ c->src.val = 0;
+ c->src.bytes = 1;
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+
+ old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ old_eip = c->eip;
+
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+ if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+ return X86EMUL_CONTINUE;
+
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
+
+ c->src.val = old_cs;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->src.val = old_eip;
+ emulate_push(ctxt, ctxt->ops);
+ rc = writeback(ctxt, ctxt->ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->dst.type = OP_NONE;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &c->eip;
+ c->dst.bytes = c->op_bytes;
+ rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.val = c->src2.val;
+ return em_imul(ctxt);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->src.bytes;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+ c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
+ struct decode_cache *c = &ctxt->decode;
+ u64 tsc = 0;
+
+ if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
+ c->regs[VCPU_REGS_RAX] = (u32)tsc;
+ c->regs[VCPU_REGS_RDX] = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ c->dst.val = c->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define N D(0)
+#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+
+#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
+ D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
+ D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+
+
+static struct opcode group1[] = {
+ X7(D(Lock)), N
+};
+
+static struct opcode group1A[] = {
+ D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+};
+
+static struct opcode group3[] = {
+ D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
+ D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+ X4(D(SrcMem | ModRM)),
+};
+
+static struct opcode group4[] = {
+ D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+ N, N, N, N, N, N,
+};
+
+static struct opcode group5[] = {
+ D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
+ D(SrcMem | ModRM | Stack),
+ I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
+ D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
+ D(SrcMem | ModRM | Stack), N,
+};
+
+static struct group_dual group7 = { {
+ N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
+ D(SrcNone | ModRM | DstMem | Mov), N,
+ D(SrcMem16 | ModRM | Mov | Priv),
+ D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+}, {
+ D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+ D(SrcNone | ModRM | DstMem | Mov), N,
+ D(SrcMem16 | ModRM | Mov | Priv), N,
+} };
+
+static struct opcode group8[] = {
+ N, N, N, N,
+ D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
+ D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+};
+
+static struct group_dual group9 = { {
+ N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static struct opcode group11[] = {
+ I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+};
+
+static struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x08 - 0x0F */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), N,
+ /* 0x10 - 0x17 */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x18 - 0x1F */
+ D6ALU(Lock),
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ /* 0x20 - 0x27 */
+ D6ALU(Lock), N, N,
+ /* 0x28 - 0x2F */
+ D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ D6ALU(Lock), N, N,
+ /* 0x38 - 0x3F */
+ D6ALU(0), N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(D(DstReg | Stack)),
+ /* 0x60 - 0x67 */
+ D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ D2bv(DstDI | Mov | String), /* insb, insw/insd */
+ D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
+ G(DstMem | SrcImm | ModRM | Group, group1),
+ G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
+ G(DstMem | SrcImmByte | ModRM | Group, group1),
+ D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
+ D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+ /* 0x90 - 0x97 */
+ X8(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64, em_call_far), N,
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String, em_mov),
+ D2bv(SrcSI | DstDI | String),
+ /* 0xA8 - 0xAF */
+ D2bv(DstAcc | SrcImm),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ D2bv(SrcAcc | DstDI | String),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ D2bv(DstMem | SrcImmByte | ModRM),
+ I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
+ D(ImplicitOps | Stack),
+ D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ N, N, N, D(ImplicitOps | Stack),
+ D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+ /* 0xD0 - 0xD7 */
+ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
+ N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ X4(D(SrcImmByte)),
+ D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+ /* 0xE8 - 0xEF */
+ D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+ D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+ D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
+ /* 0xF0 - 0xF7 */
+ N, N, N, N,
+ D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ N, GD(0, &group7), N, N,
+ N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+ D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+ N, D(ImplicitOps | ModRM), N, N,
+ /* 0x10 - 0x1F */
+ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
+ /* 0x20 - 0x2F */
+ D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
+ D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+ N, N, N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x30 - 0x3F */
+ D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
+ D(ImplicitOps | Priv), N,
+ D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM | Mov)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x70 - 0x7F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ N, D(DstMem | SrcReg | ModRM | BitOp),
+ D(DstMem | SrcReg | Src2ImmByte | ModRM),
+ D(DstMem | SrcReg | Src2CL | ModRM), N, N,
+ /* 0xA8 - 0xAF */
+ D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstMem | SrcReg | Src2ImmByte | ModRM),
+ D(DstMem | SrcReg | Src2CL | ModRM),
+ D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ D2bv(DstMem | SrcReg | ModRM | Lock),
+ D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+ D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+ D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xCF */
+ D2bv(DstMem | SrcReg | ModRM | Lock),
+ N, D(DstMem | SrcReg | ModRM | Mov),
+ N, N, N, GD(0, &group9),
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+
+#undef D2bv
+#undef I2bv
+#undef D6ALU
+
+static unsigned imm_size(struct decode_cache *c)
+{
+ unsigned size;
+
+ size = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem = c->eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, 1, c->eip);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, 2, c->eip);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, 4, c->eip);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
}
int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt)
{
- u64 msr_data;
+ struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
- int saved_dst_type = c->dst.type;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, dual, goffset;
+ struct opcode opcode, *g_mod012, *g_mod3;
+ struct operand memop = { .type = OP_NONE };
+
+ c->eip = ctxt->eip;
+ c->fetch.start = c->fetch.end = c->eip;
+ ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ c->op_bytes = def_op_bytes;
+ c->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ case 0x66: /* operand-size override */
+ /* switch between 2/4 bytes */
+ c->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ c->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ c->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ case 0x2e: /* CS override */
+ case 0x36: /* SS override */
+ case 0x3e: /* DS override */
+ set_seg_override(c, (c->b >> 3) & 3);
+ break;
+ case 0x64: /* FS override */
+ case 0x65: /* GS override */
+ set_seg_override(c, c->b & 7);
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ c->rex_prefix = c->b;
+ continue;
+ case 0xf0: /* LOCK */
+ c->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ c->rep_prefix = REPNE_PREFIX;
+ break;
+ case 0xf3: /* REP/REPE/REPZ */
+ c->rep_prefix = REPE_PREFIX;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ c->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (c->rex_prefix & 8)
+ c->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ opcode = opcode_table[c->b];
+ /* Two-byte opcode? */
+ if (c->b == 0x0f) {
+ c->twobyte = 1;
+ c->b = insn_fetch(u8, 1, c->eip);
+ opcode = twobyte_table[c->b];
+ }
+ c->d = opcode.flags;
+
+ if (c->d & Group) {
+ dual = c->d & GroupDual;
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+
+ if (c->d & GroupDual) {
+ g_mod012 = opcode.u.gdual->mod012;
+ g_mod3 = opcode.u.gdual->mod3;
+ } else
+ g_mod012 = g_mod3 = opcode.u.group;
+
+ c->d &= ~(Group | GroupDual);
+
+ goffset = (c->modrm >> 3) & 7;
+
+ if ((c->modrm >> 6) == 3)
+ opcode = g_mod3[goffset];
+ else
+ opcode = g_mod012[goffset];
+ c->d |= opcode.flags;
+ }
+
+ c->execute = opcode.u.execute;
+
+ /* Unrecognised? */
+ if (c->d == 0 || (c->d & Undefined)) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
+ }
+
+ if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+ c->op_bytes = 8;
+
+ if (c->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ c->op_bytes = 8;
+ else
+ c->op_bytes = 4;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (c->d & ModRM) {
+ rc = decode_modrm(ctxt, ops, &memop);
+ if (!c->has_seg_override)
+ set_seg_override(c, c->modrm_seg);
+ } else if (c->d & MemAbs)
+ rc = decode_abs(ctxt, ops, &memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!c->has_seg_override)
+ set_seg_override(c, VCPU_SREG_DS);
+
+ if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
+ memop.addr.mem += seg_override_base(ctxt, ops, c);
+
+ if (memop.type == OP_MEM && c->ad_bytes != 8)
+ memop.addr.mem = (u32)memop.addr.mem;
- ctxt->interruptibility = 0;
+ if (memop.type == OP_MEM && c->rip_relative)
+ memop.addr.mem += c->eip;
- /* Shadow copy of register state. Committed on successful emulation.
- * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
- * modify them.
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & SrcMask) {
+ case SrcNone:
+ break;
+ case SrcReg:
+ decode_register_operand(&c->src, c, 0);
+ break;
+ case SrcMem16:
+ memop.bytes = 2;
+ goto srcmem_common;
+ case SrcMem32:
+ memop.bytes = 4;
+ goto srcmem_common;
+ case SrcMem:
+ memop.bytes = (c->d & ByteOp) ? 1 :
+ c->op_bytes;
+ srcmem_common:
+ c->src = memop;
+ break;
+ case SrcImmU16:
+ rc = decode_imm(ctxt, &c->src, 2, false);
+ break;
+ case SrcImm:
+ rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+ break;
+ case SrcImmU:
+ rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+ break;
+ case SrcImmByte:
+ rc = decode_imm(ctxt, &c->src, 1, true);
+ break;
+ case SrcImmUByte:
+ rc = decode_imm(ctxt, &c->src, 1, false);
+ break;
+ case SrcAcc:
+ c->src.type = OP_REG;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&c->src);
+ break;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
+ break;
+ case SrcSI:
+ c->src.type = OP_MEM;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.addr.mem =
+ register_address(c, seg_override_base(ctxt, ops, c),
+ c->regs[VCPU_REGS_RSI]);
+ c->src.val = 0;
+ break;
+ case SrcImmFAddr:
+ c->src.type = OP_IMM;
+ c->src.addr.mem = c->eip;
+ c->src.bytes = c->op_bytes + 2;
+ insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+ break;
+ case SrcMemFAddr:
+ memop.bytes = c->op_bytes + 2;
+ goto srcmem_common;
+ break;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
*/
+ switch (c->d & Src2Mask) {
+ case Src2None:
+ break;
+ case Src2CL:
+ c->src2.bytes = 1;
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+ break;
+ case Src2ImmByte:
+ rc = decode_imm(ctxt, &c->src2, 1, true);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
+ break;
+ case Src2Imm:
+ rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+ break;
+ }
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (c->d & DstMask) {
+ case DstReg:
+ decode_register_operand(&c->dst, c,
+ c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ break;
+ case DstImmUByte:
+ c->dst.type = OP_IMM;
+ c->dst.addr.mem = c->eip;
+ c->dst.bytes = 1;
+ c->dst.val = insn_fetch(u8, 1, c->eip);
+ break;
+ case DstMem:
+ case DstMem64:
+ c->dst = memop;
+ if ((c->d & DstMask) == DstMem64)
+ c->dst.bytes = 8;
+ else
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ if (c->d & BitOp)
+ fetch_bit_operand(c);
+ c->dst.orig_val = c->dst.val;
+ break;
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&c->dst);
+ c->dst.orig_val = c->dst.val;
+ break;
+ case DstDI:
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.addr.mem =
+ register_address(c, es_base(ctxt, ops),
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = 0;
+ break;
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ default:
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ return 0;
+ }
+
+done:
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if (((c->b == 0xa6) || (c->b == 0xa7) ||
+ (c->b == 0xae) || (c->b == 0xaf))
+ && (((c->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == 0))
+ || ((c->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+ return true;
+
+ return false;
+}
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+ struct x86_emulate_ops *ops = ctxt->ops;
+ u64 msr_data;
+ struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = c->dst.type;
+ int irq; /* Used for int 3, int, and into */
+
+ ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+ emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (c->rep_prefix && (c->d & String)) {
- ctxt->restart = true;
/* All REP prefixes have the same first termination condition */
if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
- string_done:
- ctxt->restart = false;
- kvm_rip_write(ctxt->vcpu, c->eip);
+ ctxt->eip = c->eip;
goto done;
}
- /* The second termination condition only applies for REPE
- * and REPNE. Test if the repeat string operation prefix is
- * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
- * corresponding termination condition according to:
- * - if REPE/REPZ and ZF = 0 then done
- * - if REPNE/REPNZ and ZF = 1 then done
- */
- if ((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf)) {
- if ((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0))
- goto string_done;
- if ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
- goto string_done;
- }
- c->eip = ctxt->eip;
}
- if (c->src.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
+ if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
+ rc = read_emulated(ctxt, ops, c->src.addr.mem,
+ c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
- c->src.orig_val = c->src.val;
+ c->src.orig_val64 = c->src.val64;
}
if (c->src2.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src2.ptr,
- &c->src2.val,
- c->src2.bytes,
- ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+ &c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2553,8 +3070,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+ &c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2562,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
special_insn:
+ if (c->execute) {
+ rc = c->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
if (c->twobyte)
goto twobyte_insn;
@@ -2571,43 +3095,37 @@ special_insn:
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
case 0x06: /* push es */
- emulate_push_sreg(ctxt, VCPU_SREG_ES);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0x08 ... 0x0d:
or: /* or */
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, VCPU_SREG_CS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0x18 ... 0x1d:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0x20 ... 0x25:
and: /* and */
@@ -2631,56 +3149,29 @@ special_insn:
case 0x48 ... 0x4f: /* dec r16/r32 */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
- case 0x50 ... 0x57: /* push reg */
- emulate_push(ctxt);
- break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0x60: /* pusha */
- emulate_pusha(ctxt);
+ rc = emulate_pusha(ctxt, ops);
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
c->dst.val = (s32) c->src.val;
break;
- case 0x68: /* push imm */
- case 0x6a: /* push imm8 */
- emulate_push(ctxt);
- break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- }
- if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
- c->regs[VCPU_REGS_RDX], &c->dst.val))
- goto done; /* IO is needed, skip writeback */
- break;
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ goto do_io_in;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- c->src.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- }
- ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
- &c->src.val, 1, ctxt->vcpu);
-
- c->dst.type = OP_NONE; /* nothing to writeback */
+ c->dst.val = c->regs[VCPU_REGS_RDX];
+ goto do_io_out;
break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
@@ -2707,48 +3198,30 @@ special_insn:
}
break;
case 0x84 ... 0x85:
+ test:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x86 ... 0x87: /* xchg */
xchg:
/* Write back the register source. */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *) c->src.ptr = (u8) c->dst.val;
- break;
- case 2:
- *(u16 *) c->src.ptr = (u16) c->dst.val;
- break;
- case 4:
- *c->src.ptr = (u32) c->dst.val;
- break; /* 64b reg: zero-extend */
- case 8:
- *c->src.ptr = c->dst.val;
- break;
- }
+ c->src.val = c->dst.val;
+ write_register_operand(&c->src);
/*
* Write back the memory destination with implicit LOCK
* prefix.
*/
- c->dst.val = c->src.val;
+ c->dst.val = c->src.orig_val;
c->lock_prefix = 1;
break;
- case 0x88 ... 0x8b: /* mov */
- goto mov;
- case 0x8c: { /* mov r/m, sreg */
- struct kvm_segment segreg;
-
- if (c->modrm_reg <= VCPU_SREG_GS)
- kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
- else {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ case 0x8c: /* mov r/m, sreg */
+ if (c->modrm_reg > VCPU_SREG_GS) {
+ emulate_ud(ctxt);
goto done;
}
- c->dst.val = segreg.selector;
+ c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
- }
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->modrm_ea;
+ c->dst.val = c->src.addr.mem;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@@ -2757,12 +3230,12 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
@@ -2771,80 +3244,87 @@ special_insn:
}
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
- case 0x90: /* nop / xchg r8,rax */
- if (!(c->rex_prefix & 1)) { /* nop */
- c->dst.type = OP_NONE;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
break;
- }
- case 0x91 ... 0x97: /* xchg reg,rax */
- c->src.type = c->dst.type = OP_REG;
- c->src.bytes = c->dst.bytes = c->op_bytes;
- c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
- c->src.val = *(c->src.ptr);
goto xchg;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (c->op_bytes) {
+ case 2: c->dst.val = (s8)c->dst.val; break;
+ case 4: c->dst.val = (s16)c->dst.val; break;
+ case 8: c->dst.val = (s32)c->dst.val; break;
+ }
+ break;
case 0x9c: /* pushf */
c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x9d: /* popf */
c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ c->dst.addr.reg = &ctxt->eflags;
c->dst.bytes = c->op_bytes;
rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
- case 0xa0 ... 0xa1: /* mov */
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- c->dst.val = c->src.val;
- break;
- case 0xa2 ... 0xa3: /* mov */
- c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
- break;
- case 0xa4 ... 0xa5: /* movs */
- goto mov;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
goto cmp;
- case 0xaa ... 0xab: /* stos */
- c->dst.val = c->regs[VCPU_REGS_RAX];
- break;
- case 0xac ... 0xad: /* lods */
- goto mov;
+ case 0xa8 ... 0xa9: /* test ax, imm */
+ goto test;
case 0xae ... 0xaf: /* scas */
- DPRINTF("Urk! I don't handle SCAS.\n");
- goto cannot_emulate;
- case 0xb0 ... 0xbf: /* mov r, imm */
- goto mov;
+ goto cmp;
case 0xc0 ... 0xc1:
emulate_grp2(ctxt);
break;
case 0xc3: /* ret */
c->dst.type = OP_REG;
- c->dst.ptr = &c->eip;
+ c->dst.addr.reg = &c->eip;
c->dst.bytes = c->op_bytes;
goto pop_instruction;
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
- mov:
- c->dst.val = c->src.val;
+ case 0xc4: /* les */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+ break;
+ case 0xc5: /* lds */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
break;
case 0xcb: /* ret far */
rc = emulate_ret_far(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ break;
+ case 0xcc: /* int3 */
+ irq = 3;
+ goto do_interrupt;
+ case 0xcd: /* int n */
+ irq = c->src.val;
+ do_interrupt:
+ rc = emulate_int(ctxt, ops, irq);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & EFLG_OF) {
+ irq = 4;
+ goto do_interrupt;
+ }
+ break;
+ case 0xcf: /* iret */
+ rc = emulate_iret(ctxt, ops);
break;
case 0xd0 ... 0xd1: /* Grp2 */
- c->src.val = 1;
emulate_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt);
break;
+ case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
+ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
+ (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
+ jmp_rel(c, c->src.val);
+ break;
+ case 0xe3: /* jcxz/jecxz/jrcxz */
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
+ jmp_rel(c, c->src.val);
+ break;
case 0xe4: /* inb */
case 0xe5: /* in */
goto do_io_in;
@@ -2855,19 +3335,23 @@ special_insn:
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: /* jmp far */
+ case 0xea: { /* jmp far */
+ unsigned short sel;
jump_far:
- if (load_segment_descriptor(ctxt, ops, c->src2.val,
- VCPU_SREG_CS))
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
goto done;
- c->eip = c->src.val;
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
break;
+ }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -2879,24 +3363,25 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
&c->dst.val))
goto done; /* IO is needed */
break;
- case 0xee: /* out al,dx */
- case 0xef: /* out (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
+ case 0xee: /* out dx,al */
+ case 0xef: /* out dx,(e/r)ax */
+ c->dst.val = c->regs[VCPU_REGS_RDX];
do_io_out:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ c->src.bytes = min(c->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->dst.val,
+ c->src.bytes)) {
+ emulate_gp(ctxt, 0);
goto done;
}
- ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
- ctxt->vcpu);
+ ops->pio_out_emulated(c->src.bytes, c->dst.val,
+ &c->src.val, 1, ctxt->vcpu);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -2905,53 +3390,53 @@ special_insn:
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf6 ... 0xf7: /* Grp3 */
- if (!emulate_grp3(ctxt, ops))
- goto cannot_emulate;
+ rc = emulate_grp3(ctxt, ops);
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= EFLG_CF;
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else
ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- }
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else {
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
}
break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfd: /* std */
ctxt->eflags |= EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfe: /* Grp4 */
grp45:
rc = emulate_grp45(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0xff: /* Grp5 */
if (c->modrm_reg == 5)
goto jump_far;
goto grp45;
+ default:
+ goto cannot_emulate;
}
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
writeback:
rc = writeback(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
@@ -2964,31 +3449,40 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
- &c->src);
+ string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+ string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ &c->dst);
if (c->rep_prefix && (c->d & String)) {
- struct read_cache *rc = &ctxt->decode.io_read;
+ struct read_cache *r = &ctxt->decode.io_read;
register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
- /*
- * Re-enter guest when pio read ahead buffer is empty or,
- * if it is not used, after each 1024 iteration.
- */
- if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
- (rc->end != 0 && rc->end == rc->pos))
- ctxt->restart = false;
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter guest when pio read ahead buffer is empty
+ * or, if it is not used, after each 1024 iteration.
+ */
+ if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset read cache. Usually happens before
+ * decode, but since instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->decode.mem_read.end = 0;
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
}
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
- ops->set_rflags(ctxt->vcpu, ctxt->eflags);
+ ctxt->eip = c->eip;
done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
switch (c->b) {
@@ -3011,7 +3505,7 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.ptr,
+ rc = read_descriptor(ctxt, ops, c->src.addr.mem,
&size, &address, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3024,14 +3518,12 @@ twobyte_insn:
switch (c->modrm_rm) {
case 1:
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
default:
goto cannot_emulate;
}
} else {
- rc = read_descriptor(ctxt, ops, c->src.ptr,
+ rc = read_descriptor(ctxt, ops, c->src.addr.mem,
&size, &address,
c->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -3046,15 +3538,15 @@ twobyte_insn:
c->dst.val = ops->get_cr(0, ctxt->vcpu);
break;
case 6: /* lmsw */
- ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+ ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
(c->src.val & 0x0f), ctxt->vcpu);
c->dst.type = OP_NONE;
break;
case 5: /* not defined */
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, c->modrm_ea);
+ emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -3063,91 +3555,86 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- rc = emulate_syscall(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
+ rc = emulate_syscall(ctxt, ops);
break;
case 0x06:
emulate_clts(ctxt->vcpu);
- c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
- c->dst.type = OP_NONE;
break;
case 0x20: /* mov cr, reg */
switch (c->modrm_reg) {
case 1:
case 5 ... 7:
case 9 ... 15:
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
- c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
- c->dst.type = OP_NONE; /* no writeback */
+ c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
break;
case 0x21: /* mov from dr to reg */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
- emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
- c->dst.type = OP_NONE; /* no writeback */
+ ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
break;
case 0x22: /* mov reg, cr */
- ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+ if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ }
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (ops->set_dr(c->modrm_reg, c->src.val &
+ ((ctxt->mode == X86EMUL_MODE_PROT64) ?
+ ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ /* #UD condition is already handled by the code above */
+ emulate_gp(ctxt, 0);
goto done;
}
- emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
}
rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
break;
case 0x32:
/* rdmsr */
- if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
}
rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
+ rc = emulate_sysenter(ctxt, ops);
break;
case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
+ rc = emulate_sysexit(ctxt, ops);
break;
case 0x40 ... 0x4f: /* cmov */
c->dst.val = c->dst.orig_val = c->src.val;
@@ -3157,15 +3644,15 @@ twobyte_insn:
case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE;
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ c->dst.val = test_cc(c->b, ctxt->eflags);
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, VCPU_SREG_FS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0xa3:
bt: /* bt */
@@ -3179,17 +3666,13 @@ twobyte_insn:
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, VCPU_SREG_GS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
case 0xab:
bts: /* bts */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
break;
case 0xac: /* shrd imm8, r, r/m */
@@ -3212,15 +3695,22 @@ twobyte_insn:
} else {
/* Failure: write the value we saw to EAX. */
c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
}
break;
+ case 0xb2: /* lss */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+ break;
case 0xb3:
btr: /* btr */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
break;
+ case 0xb4: /* lfs */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+ break;
+ case 0xb5: /* lgs */
+ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+ break;
case 0xb6 ... 0xb7: /* movzx */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3240,15 +3730,43 @@ twobyte_insn:
break;
case 0xbb:
btc: /* btc */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
break;
+ case 0xbc: { /* bsf */
+ u8 zf;
+ __asm__ ("bsf %2, %0; setz %1"
+ : "=r"(c->dst.val), "=q"(zf)
+ : "r"(c->src.val));
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ }
+ case 0xbd: { /* bsr */
+ u8 zf;
+ __asm__ ("bsr %2, %0; setz %1"
+ : "=r"(c->dst.val), "=q"(zf)
+ : "r"(c->src.val));
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ }
+ break;
+ }
case 0xbe ... 0xbf: /* movsx */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
(s16) c->src.val;
break;
+ case 0xc0 ... 0xc1: /* xadd */
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ /* Write back the register source. */
+ c->src.val = c->dst.orig_val;
+ write_register_operand(&c->src);
+ break;
case 0xc3: /* movnti */
c->dst.bytes = c->op_bytes;
c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
@@ -3256,10 +3774,14 @@ twobyte_insn:
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
rc = emulate_grp9(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
break;
+ default:
+ goto cannot_emulate;
}
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
goto writeback;
cannot_emulate:
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "irq.h"
#include "i8254.h"
@@ -230,24 +232,26 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
- if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
- return atomic_read(&pit->pit_state.pit_timer.pending);
- return 0;
-}
-
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- raw_spin_lock(&ps->inject_lock);
- if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ int value;
+
+ spin_lock(&ps->inject_lock);
+ value = atomic_dec_return(&ps->pit_timer.pending);
+ if (value < 0)
+ /* spurious acks can be generated if, for example, the
+ * PIC is being reset. Handle it gracefully here
+ */
atomic_inc(&ps->pit_timer.pending);
+ else if (value > 0)
+ /* in this case, we had multiple outstanding pit interrupts
+ * that we needed to inject. Reinject
+ */
+ queue_work(ps->pit->wq, &ps->pit->expired);
ps->irq_ack = 1;
- raw_spin_unlock(&ps->inject_lock);
+ spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +267,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
{
- pr_debug("execute del timer!\n");
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ cancel_work_sync(&pit->expired);
}
static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +284,60 @@ static struct kvm_timer_ops kpit_ops = {
.is_periodic = kpit_is_periodic,
};
+static void pit_do_work(struct work_struct *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ int inject = 0;
+
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
+ }
+ spin_unlock(&ps->inject_lock);
+ if (inject) {
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ queue_work(pt->wq, &pt->expired);
+ }
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +349,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
+ cancel_work_sync(&ps->pit->expired);
pt->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = kvm_timer_fn;
+ pt->timer.function = pit_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -346,7 +404,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
}
break;
default:
- destroy_pit_timer(&ps->pit_timer);
+ destroy_pit_timer(kvm->arch.vpit);
}
}
@@ -625,7 +683,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- raw_spin_lock_init(&pit->pit_state.inject_lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
+
+ pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+ if (!pit->wq) {
+ mutex_unlock(&pit->pit_state.lock);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
+ return NULL;
+ }
+ INIT_WORK(&pit->expired, pit_do_work);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -667,7 +734,7 @@ fail:
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
-
+ destroy_workqueue(pit->wq);
kfree(pit);
return NULL;
}
@@ -677,6 +744,9 @@ void kvm_free_pit(struct kvm *kvm)
struct hrtimer *timer;
if (kvm->arch.vpit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &kvm->arch.vpit->speaker_dev);
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +754,10 @@ void kvm_free_pit(struct kvm *kvm)
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
+ cancel_work_sync(&kvm->arch.vpit->expired);
kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ destroy_workqueue(kvm->arch.vpit->wq);
kfree(kvm->arch.vpit);
}
}
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kpit_state *ps;
-
- if (pit) {
- int inject = 0;
- ps = &pit->pit_state;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- raw_spin_lock(&ps->inject_lock);
- if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- raw_spin_unlock(&ps->inject_lock);
- if (inject)
- __inject_pit_timer_intr(kvm);
- }
-}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- raw_spinlock_t inject_lock;
+ spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -40,6 +40,8 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
+ struct workqueue_struct *wq;
+ struct work_struct expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,26 +34,41 @@
#include <linux/kvm_host.h>
#include "trace.h"
+static void pic_irq_request(struct kvm *kvm, int level);
+
static void pic_lock(struct kvm_pic *s)
__acquires(&s->lock)
{
- raw_spin_lock(&s->lock);
+ spin_lock(&s->lock);
}
static void pic_unlock(struct kvm_pic *s)
__releases(&s->lock)
{
bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu;
+ struct kvm_vcpu *vcpu, *found = NULL;
+ int i;
s->wakeup_needed = false;
- raw_spin_unlock(&s->lock);
+ spin_unlock(&s->lock);
if (wakeup) {
- vcpu = s->kvm->bsp_vcpu;
- if (vcpu)
- kvm_vcpu_kick(vcpu);
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = vcpu;
+ break;
+ }
+ }
+
+ if (!found)
+ found = s->kvm->bsp_vcpu;
+
+ if (!found)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, found);
+ kvm_vcpu_kick(found);
}
}
@@ -173,10 +189,7 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
+ pic_irq_request(s->kvm, irq >= 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +274,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq;
- struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+ struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
u8 irr = s->irr, isr = s->imr;
s->last_irr = 0;
@@ -297,14 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- kvm_pic_reset(s); /* init */
- /*
- * deassert a pending interrupt
- */
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
- s->init_state = 1;
s->init4 = val & 1;
+ s->last_irr = 0;
+ s->imr = 0;
+ s->priority_add = 0;
+ s->special_mask = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
if (val & 0x02)
printk(KERN_ERR "single mode not supported");
if (val & 0x08)
@@ -356,10 +371,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: /* normal mode */
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
+ }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -518,9 +543,8 @@ static int picdev_read(struct kvm_io_device *this,
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
@@ -545,12 +569,10 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
- raw_spin_lock_init(&s->lock);
+ spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -32,12 +33,7 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- int ret;
-
- ret = pit_has_pending_timer(vcpu);
- ret |= apic_has_pending_timer(vcpu);
-
- return ret;
+ return apic_has_pending_timer(vcpu);
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -89,7 +85,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
- kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ba910d149410 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,14 +38,11 @@
struct kvm;
struct kvm_vcpu;
-typedef void irq_request_func(void *opaque, int level);
-
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
- u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -58,17 +55,16 @@ struct kvm_kpic_state {
u8 init4; /* true if 4 byte init */
u8 elcr; /* PIIX edge/trigger selection */
u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
struct kvm_pic *pics_state;
};
struct kvm_pic {
- raw_spinlock_t lock;
+ spinlock_t lock;
bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,11 +36,20 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
+ might_sleep(); /* on svm */
+
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
- return vcpu->arch.pdptrs[index];
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
+{
+ load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
+
+ return mmu->pdptrs[index];
}
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
@@ -69,4 +78,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, ~0UL);
}
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..413f8973a855 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -258,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static void apic_update_ppr(struct kvm_lapic *apic)
{
- u32 tpr, isrv, ppr;
+ u32 tpr, isrv, ppr, old_ppr;
int isr;
+ old_ppr = apic_get_reg(apic, APIC_PROCPRI);
tpr = apic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -273,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
apic, ppr, isr, isrv);
- apic_set_reg(apic, APIC_PROCPRI, ppr);
+ if (old_ppr != ppr) {
+ apic_set_reg(apic, APIC_PROCPRI, ppr);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ }
}
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -328,7 +333,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
"dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
- ASSERT(!target);
+ ASSERT(target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
if (dest_mode == 0)
@@ -390,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
}
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
break;
@@ -415,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -429,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
break;
@@ -474,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_EDGE_TRIG;
if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -533,7 +542,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -1055,14 +1064,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL);
+ apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
apic->regs = page_address(apic->regs_page);
- memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
@@ -1106,13 +1114,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (kvm_vcpu_is_bsp(vcpu)) {
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
+ if (!apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
return r;
}
@@ -1153,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -47,15 +49,25 @@
*/
bool tdp_enabled = false;
-#undef MMU_DEBUG
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE,
+ AUDIT_PRE_SYNC,
+ AUDIT_POST_SYNC
+};
-#undef AUDIT
+char *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
#ifdef MMU_DEBUG
@@ -69,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif
-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
@@ -87,11 +99,11 @@ module_param(oos_shadow, bool, 0644);
}
#endif
+#define PTE_PREFETCH_NUM 8
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -173,11 +185,12 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -281,13 +294,70 @@ static gfn_t pse36_gfn_delta(u32 gpte)
static void __set_spte(u64 *sptep, u64 spte)
{
+ set_64bit(sptep, spte);
+}
+
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
+ return xchg(sptep, new_spte);
#else
- set_64bit((unsigned long long *)sptep, spte);
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
#endif
}
+static bool spte_has_volatile_bits(u64 spte)
+{
+ if (!shadow_accessed_mask)
+ return false;
+
+ if (!is_shadow_present_pte(spte))
+ return false;
+
+ if ((spte & shadow_accessed_mask) &&
+ (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+ return false;
+
+ return true;
+}
+
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 mask, old_spte = *sptep;
+
+ WARN_ON(!is_rmap_spte(new_spte));
+
+ new_spte |= old_spte & shadow_dirty_mask;
+
+ mask = shadow_accessed_mask;
+ if (is_writable_pte(old_spte))
+ mask |= shadow_dirty_mask;
+
+ if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+ __set_spte(sptep, new_spte);
+ else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!shadow_accessed_mask)
+ return;
+
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -304,10 +374,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
{
while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -341,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4);
+ rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -355,10 +426,11 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +451,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
- kfree(pc);
+ kmem_cache_free(pte_chain_cache, pc);
}
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +462,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
- kfree(rd);
+ kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (sp->role.direct)
+ BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+ else
+ sp->gfns[index] = gfn;
}
/*
@@ -403,8 +491,8 @@ static int *slot_largepage_idx(gfn_t gfn,
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
}
@@ -414,9 +502,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +516,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +532,7 @@ static int has_wrprotected_page(struct kvm *kvm,
struct kvm_memory_slot *slot;
int *largepage_idx;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
@@ -501,7 +585,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
/*
* Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +596,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].rmap_pde;
}
@@ -541,9 +624,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
if (!is_rmap_spte(*spte))
return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -554,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc->sptes[0] = (u64 *)*rmapp;
desc->sptes[1] = spte;
*rmapp = (unsigned long)desc | 1;
+ ++count;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -566,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
- ;
+ ++count;
desc->sptes[i] = spte;
}
return count;
@@ -600,32 +683,25 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
+ gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+ printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
BUG();
} else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+ rmap_printk("rmap_remove: %p 1->0\n", spte);
if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
+ printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
BUG();
}
*rmapp = 0;
} else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+ rmap_printk("rmap_remove: %p many->many\n", spte);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -639,11 +715,37 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
- pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
+ pr_err("rmap_remove: %p many->many\n", spte);
BUG();
}
}
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __set_spte(sptep, new_spte);
+ else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!is_rmap_spte(old_spte))
+ return;
+
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+ kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ set_spte_track_bits(sptep, new_spte);
+ rmap_remove(kvm, sptep);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
@@ -676,7 +778,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
u64 *spte;
int i, write_protected = 0;
- gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
@@ -685,18 +786,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
}
- if (write_protected) {
- pfn_t pfn;
-
- spte = rmap_next(kvm, rmapp, NULL);
- pfn = spte_to_pfn(*spte);
- kvm_set_pfn_dirty(pfn);
- }
/* check for huge page mappings */
for (i = PT_DIRECTORY_LEVEL;
@@ -709,9 +803,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -731,8 +825,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -754,8 +847,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +855,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ new_spte &= ~shadow_accessed_mask;
+ set_spte_track_bits(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -799,8 +890,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ unsigned long idx;
+ int sh;
+
+ sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+ idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+ (memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
@@ -863,7 +958,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -891,14 +985,28 @@ static int is_empty_shadow_page(u64 *spt)
}
#endif
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
- ++kvm->arch.n_free_mmu_pages;
+ if (!sp->role.direct)
+ __free_page(virt_to_page(sp->gfns));
+ kmem_cache_free(mmu_page_header_cache, sp);
+ kvm_mod_used_mmu_pages(kvm, -1);
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -907,19 +1015,21 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
+ u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
- --vcpu->kvm->arch.n_free_mmu_pages;
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -998,7 +1108,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
-
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1117,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ fn(parent_sp, sp->parent_pte);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
+ mmu_parent_walk(sp, mark_unsync);
}
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
+ unsigned int index;
- if (!sp->parent_pte)
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ if (sp->unsync_children++)
return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1160,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+ struct kvm_mmu_page *sp, bool clear_unsync)
{
return 1;
}
@@ -1123,35 +1206,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+ goto clear_child_bitmap;
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ goto clear_child_bitmap;
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ goto clear_child_bitmap;
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
+ continue;
+
+clear_child_bitmap:
+ __clear_bit(i, sp->unsync_child_bitmap);
+ sp->unsync_children--;
+ WARN_ON((int)sp->unsync_children < 0);
}
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
return nr_unsync_leaf;
}
@@ -1166,26 +1254,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
return __mmu_unsync_walk(sp, pvec);
}
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
@@ -1194,20 +1262,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn)) {} else
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn) || (sp)->role.direct || \
+ (sp)->role.invalid) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list, bool clear_unsync)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (clear_unsync)
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+ if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1215,6 +1299,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
}
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ LIST_HEAD(invalid_list);
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+ if (ret)
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+ (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+ continue;
+ }
+ kvm_unlink_unsync_page(vcpu->kvm, s);
+ flush = true;
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1411,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1424,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
+ kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1310,11 +1442,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
- unsigned index;
unsigned quadrant;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *node;
+ bool need_sync = false;
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -1322,40 +1453,46 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!vcpu->arch.mmu.direct_map
+ && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
+ for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+ if (!need_sync && sp->unsync)
+ need_sync = true;
- if (sp->role.word != role.word)
- continue;
+ if (sp->role.word != role.word)
+ continue;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(sp);
- }
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
+ if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+ break;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_mmu_mark_parents_unsync(sp);
+ } else if (sp->unsync)
+ kvm_mmu_mark_parents_unsync(sp);
+
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
if (!sp)
return sp;
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ kvm_sync_pages(vcpu, gfn);
+
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1372,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->addr = addr;
iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+ if (iterator->level == PT64_ROOT_LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ --iterator->level;
+
if (iterator->level == PT32E_ROOT_LEVEL) {
iterator->shadow_addr
= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1402,6 +1545,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
--iterator->level;
}
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -1422,7 +1606,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1649,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
{
int i, zapped = 0;
struct mmu_page_path parents;
@@ -1478,7 +1664,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -1488,112 +1674,113 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
int ret;
- trace_kvm_mmu_zap_page(sp);
+ trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
} else {
- sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
}
+
+ sp->role.invalid = 1;
kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ kvm_flush_remote_tlbs(kvm);
+
+ do {
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(kvm, sp);
+ } while (!list_empty(invalid_list));
+
+}
+
/*
* Changing the number of mmu pages allocated to the vm
- * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
{
- int used_pages;
-
- used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
- used_pages = max(0, used_pages);
-
+ LIST_HEAD(invalid_list);
/*
* If we set the number of mmu pages to be smaller be than the
* number of actived pages , we must to free some mmu pages before we
* change the value
*/
- if (used_pages > kvm_nr_mmu_pages) {
- while (used_pages > kvm_nr_mmu_pages &&
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
!list_empty(&kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_zap_page(kvm, page);
- used_pages--;
+ kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
- kvm_nr_mmu_pages = used_pages;
- kvm->arch.n_free_mmu_pages = 0;
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
- else
- kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
- - kvm->arch.n_alloc_mmu_pages;
- kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
}
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
int r;
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
return r;
}
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: zap %llx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1910,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
-
mmu_convert_notrap(sp);
- return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (s->unsync)
+ continue;
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ __kvm_unsync_page(vcpu, s);
+ }
}
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
- struct kvm_mmu_page *shadow;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ bool need_unsync = false;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
+
+ if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+
+ if (!need_unsync && !s->unsync) {
+ if (!oos_shadow)
+ return 1;
+ need_unsync = true;
+ }
}
+ if (need_unsync)
+ kvm_unsync_pages(vcpu, gfn);
return 0;
}
@@ -1781,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = shadow_base_present_pte | shadow_dirty_mask;
+ spte = shadow_base_present_pte;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -1804,18 +1995,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+ || (!vcpu->arch.mmu.direct_map && write_fault
+ && !is_write_protection(vcpu) && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
- if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+ if (!vcpu->arch.mmu.direct_map
+ && !(pte_access & ACC_WRITE_MASK))
spte &= ~PT_USER_MASK;
/*
@@ -1828,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto set_pte;
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
+ pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
@@ -1841,7 +2034,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ update_spte(sptep, spte);
+done:
return ret;
}
@@ -1853,11 +2047,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
+ " user_fault %d gfn %llx\n",
__func__, *sptep, pt_access,
write_fault, user_fault, gfn);
@@ -1876,10 +2069,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__set_spte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %lx new %lx\n",
+ pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -1890,11 +2082,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
*sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
*sptep, sptep);
@@ -1904,15 +2096,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
- kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
- } else {
- if (was_writable)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
}
+ kvm_release_pfn_clean(pfn);
if (speculative) {
vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
@@ -1923,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+ (no_dirty_log && slot->dirty_bitmap))
+ slot = NULL;
+
+ return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+
+ slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+ if (!slot) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ hva = gfn_to_hva_memslot(slot, gfn);
+
+ return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ unsigned access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
+ return -1;
+
+ ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since it's no accessed bit on EPT, it's no way to
+ * distinguish between actually accessed translations
+ * and prefetched, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
{
@@ -1936,12 +2222,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
+ direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ u64 base_addr = iterator.addr;
+
+ base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+ pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
@@ -1954,12 +2244,38 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
__set_spte(iterator.sptep,
__pa(sp->spt)
| PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
}
}
return pt_write;
}
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_MCEERR_AR;
+ info.si_addr = (void __user *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
+
+ send_sig_info(SIGBUS, &info, tsk);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+ kvm_release_pfn_clean(pfn);
+ if (is_hwpoison_pfn(pfn)) {
+ kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+ return 0;
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
+ return 1;
+}
+
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
@@ -1983,10 +2299,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn = gfn_to_pfn(vcpu->kvm, gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,17 +2323,22 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
+ (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
+ vcpu->arch.mmu.direct_map)) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
--sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (!sp->root_count && sp->role.invalid) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ }
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -2032,10 +2351,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -2045,85 +2366,165 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
return ret;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- int i;
- gfn_t root_gfn;
struct kvm_mmu_page *sp;
- int direct = 0;
- u64 pdptr;
-
- root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ unsigned i;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
+ 1, ACC_ALL, NULL);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+ PT32_ROOT_LEVEL, 1, ACC_ALL,
+ NULL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ } else
+ BUG();
+
+ return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ u64 pdptr, pm_mask;
+ gfn_t root_gfn;
+ int i;
+
+ root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+ * write-protect the guests page table root.
+ */
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = 0;
- }
+
spin_lock(&vcpu->kvm->mmu_lock);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
+ kvm_mmu_free_some_pages(vcpu);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
+ 0, ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = root;
return 0;
}
- direct = !is_paging(vcpu);
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = kvm_pdptr_read(vcpu, i);
+ pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
root_gfn = pdptr >> PAGE_SHIFT;
- } else if (vcpu->arch.mmu.root_level == 0)
- root_gfn = 0;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = i << 30;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, direct,
+ PT32_ROOT_LEVEL, 0,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[i] = root | pm_mask;
}
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+ /*
+ * If we shadow a 32 bit page table with a long mode page
+ * table we enter this path.
+ */
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ if (vcpu->arch.mmu.lm_root == NULL) {
+ /*
+ * The additional page necessary for this is only
+ * allocated on demand.
+ */
+
+ u64 *lm_root;
+
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ if (lm_root == NULL)
+ return 1;
+
+ lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+ vcpu->arch.mmu.lm_root = lm_root;
+ }
+
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ }
+
return 0;
}
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.direct_map)
+ return mmu_alloc_direct_roots(vcpu);
+ else
+ return mmu_alloc_shadow_roots(vcpu);
+}
+
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ if (vcpu->arch.mmu.direct_map)
+ return;
+
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -2138,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2155,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
return vaddr;
}
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
+{
+ if (error)
+ *error = 0;
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
@@ -2198,10 +2608,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2223,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+static int nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2237,13 +2644,15 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->nx = false;
return 0;
}
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2252,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
- kvm_inject_page_fault(vcpu, addr, err_code);
+ return vcpu->arch.cr3;
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu.inject_page_fault(vcpu);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2264,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
{
int bit7;
bit7 = (gpte >> 7) & 1;
- return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
#define PTTYPE 64
@@ -2280,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
#include "paging_tmpl.h"
#undef PTTYPE
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
- if (!is_nx(vcpu))
+ if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
switch (level) {
case PT32_ROOT_LEVEL:
@@ -2341,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
}
}
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ context->nx = is_nx(vcpu);
+
+ reset_rsvds_bits_mask(vcpu, context, level);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -2356,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->root_level = level;
context->shadow_root_level = level;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
return 0;
}
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
}
-static int paging32_init_context(struct kvm_vcpu *vcpu)
+static int paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ context->nx = false;
+
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2380,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
+ context->direct_map = false;
return 0;
}
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
+static int paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+ return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = &vcpu->arch.mmu;
+ struct kvm_mmu *context = vcpu->arch.walk_mmu;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
@@ -2401,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
+ context->direct_map = true;
+ context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ context->get_cr3 = get_cr3;
+ context->inject_page_fault = kvm_inject_page_fault;
+ context->nx = is_nx(vcpu);
if (!is_paging(vcpu)) {
+ context->nx = false;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->root_level = 0;
} else if (is_long_mode(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT64_ROOT_LEVEL;
} else if (is_pae(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
+ context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
context->gva_to_gpa = paging64_gva_to_gpa;
context->root_level = PT32E_ROOT_LEVEL;
} else {
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
+ context->nx = false;
+ reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
context->gva_to_gpa = paging32_gva_to_gpa;
context->root_level = PT32_ROOT_LEVEL;
}
@@ -2422,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
return 0;
}
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
int r;
-
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu);
+ r = nonpaging_init_context(vcpu, context);
else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu);
+ r = paging64_init_context(vcpu, context);
else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu);
+ r = paging32E_init_context(vcpu, context);
else
- r = paging32_init_context(vcpu);
+ r = paging32_init_context(vcpu, context);
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+ vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
+
+ vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
+ vcpu->arch.walk_mmu->get_cr3 = get_cr3;
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
return r;
}
+static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ g_context->get_cr3 = get_cr3;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
+ * translation of l2_gpa to l1_gpa addresses is done using the
+ * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
+ * functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu)) {
+ g_context->nx = false;
+ g_context->root_level = 0;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_long_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
+ g_context->root_level = PT64_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else if (is_pae(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
+ g_context->root_level = PT32E_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else {
+ g_context->nx = false;
+ reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
+ g_context->root_level = PT32_ROOT_LEVEL;
+ g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ }
+
+ return 0;
+}
+
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
vcpu->arch.update_pte.pfn = bad_pfn;
- if (tdp_enabled)
+ if (mmu_is_nested(vcpu))
+ return init_kvm_nested_mmu(vcpu);
+ else if (tdp_enabled)
return init_kvm_tdp_mmu(vcpu);
else
return init_kvm_softmmu(vcpu);
@@ -2457,10 +2937,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ /* mmu.free() should set root_hpa = INVALID_PAGE */
vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2956,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
r = mmu_alloc_roots(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
@@ -2487,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
/* set_cr3() should ensure TLB has been flushed */
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
return r;
}
@@ -2497,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
@@ -2508,7 +2985,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +3006,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+ return;
+
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +3029,15 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+ bool remote_flush, bool local_flush)
{
- if (need_remote_flush(old, new))
+ if (zap_page)
+ return;
+
+ if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
- else
+ else if (local_flush)
kvm_mmu_flush_tlb(vcpu);
}
@@ -2603,10 +3087,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
+ union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
@@ -2619,6 +3103,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
int invlpg_counter;
+ bool remote_flush, local_flush, zap_page;
+
+ zap_page = remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -2661,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
if (guest_initiated) {
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
@@ -2674,13 +3161,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pte_updated = NULL;
}
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
@@ -2697,8 +3180,8 @@ restart:
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- goto restart;
+ zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -2722,17 +3205,23 @@ restart:
if (quadrant != sp->role.quadrant)
continue;
}
+ local_flush = true;
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry)
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mask.word))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+ if (!remote_flush && need_remote_flush(entry, *spte))
+ remote_flush = true;
++spte;
}
}
- kvm_mmu_audit(vcpu, "post pte write");
+ mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2745,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- if (tdp_enabled)
+ if (vcpu->arch.mmu.direct_map)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2759,13 +3248,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ LIST_HEAD(invalid_list);
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
}
@@ -2795,11 +3287,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
- return 0;
+ /* fall through */
case EMULATE_FAIL:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -2832,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ if (vcpu->arch.mmu.lm_root != NULL)
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -2873,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
return init_kvm_mmu(vcpu);
}
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
@@ -2896,7 +3378,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
+ if (is_writable_pte(pt[i]))
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
@@ -2905,50 +3387,53 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_zap_page(kvm, page) + 1;
+ return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
struct kvm *kvm;
struct kvm *kvm_freed = NULL;
- int cache_count = 0;
+
+ if (nr_to_scan == 0)
+ goto out;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages, idx, freed_pages;
+ int idx, freed_pages;
+ LIST_HEAD(invalid_list);
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- npages = kvm->arch.n_alloc_mmu_pages -
- kvm->arch.n_free_mmu_pages;
- cache_count += npages;
- if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
- cache_count -= freed_pages;
+ if (!kvm_freed && nr_to_scan > 0 &&
+ kvm->arch.n_used_mmu_pages > 0) {
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
kvm_freed = kvm;
}
nr_to_scan--;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -2957,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
spin_unlock(&kvm_lock);
- return cache_count;
+out:
+ return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
static struct shrinker mmu_shrinker = {
@@ -2978,6 +3464,7 @@ static void mmu_destroy_caches(void)
void kvm_mmu_module_exit(void)
{
mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
}
@@ -3000,6 +3487,9 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+ goto nomem;
+
register_shrinker(&mmu_shrinker);
return 0;
@@ -3074,7 +3564,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
return 1;
}
@@ -3170,273 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
- gva = (long long)(gva << 16) >> 16;
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
#endif
- return gva;
-}
-
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- }
- }
- return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 ent = pt[i];
-
- if (ent == shadow_trap_nonpresent_pte)
- continue;
-
- va = canonicalize(va);
- if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
- audit_mappings_page(vcpu, ent, va, level - 1);
- else {
- gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
- gfn_t gfn = gpa >> PAGE_SHIFT;
- pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
- hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- continue;
- }
-
- if (is_shadow_present_pte(ent)
- && (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx %d\n",
- audit_msg, vcpu->arch.mmu.root_level,
- va, gpa, hpa, ent,
- is_shadow_present_pte(ent));
- else if (ent == shadow_notrap_nonpresent_pte
- && !is_error_hpa(hpa))
- printk(KERN_ERR "audit: (%s) notrap shadow,"
- " valid guest gva %lx\n", audit_msg, va);
- kvm_release_pfn_clean(pfn);
-
- }
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_memslots *slots;
- int nmaps = 0;
- int i, j, k, idx;
-
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &slots->memslots[i];
- struct kvm_rmap_desc *d;
-
- for (j = 0; j < m->npages; ++j) {
- unsigned long *rmapp = &m->rmap[j];
-
- if (!*rmapp)
- continue;
- if (!(*rmapp & 1)) {
- ++nmaps;
- continue;
- }
- d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (d) {
- for (k = 0; k < RMAP_EXT; ++k)
- if (d->sptes[k])
- ++nmaps;
- else
- break;
- d = d->more;
- }
- }
- }
- srcu_read_unlock(&kvm->srcu, idx);
- return nmaps;
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
- if (*sptep & PT_WRITABLE_MASK) {
- rev_sp = page_header(__pa(sptep));
- gfn = rev_sp->gfns[sptep - rev_sp->spt];
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no memslot for gfn %ld\n",
- audit_msg, gfn);
- printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
- audit_msg, (long int)(sptep - rev_sp->spt),
- rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- rev_sp->role.level);
- if (!*rmapp) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
- audit_msg, *sptep);
- dump_stack();
- }
- }
-
-}
-
-void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- int i;
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
- if (!(ent & PT_WRITABLE_MASK))
- continue;
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
- }
- return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- check_writable_mappings_rmap(vcpu);
- count_rmaps(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
- gfn_t gfn;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
-
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[gfn - slot->base_gfn];
-
- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (*spte & PT_WRITABLE_MASK)
- printk(KERN_ERR "%s: (%s) shadow page has "
- "writable mappings: gfn %lx role %x\n",
- __func__, audit_msg, sp->gfn,
- sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
- int olddbg = dbg;
+ ASSERT(vcpu);
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_writable_sptes_have_rmaps(vcpu);
- dbg = olddbg;
+ destroy_kvm_mmu(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+ mmu_audit_disable();
}
-
-#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
#define PFERR_FETCH_MASK (1U << 4)
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
+static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+}
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+ if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
__kvm_mmu_free_some_pages(vcpu);
}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..ba2bcdde6221
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,299 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/ratelimit.h>
+
+static int audit_point;
+
+#define audit_printk(fmt, args...) \
+ printk(KERN_ERR "audit: (%s) error: " \
+ fmt, audit_point_name[audit_point], ##args)
+
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
+
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
+ }
+ }
+
+ return;
+}
+
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ audit_printk("unsync sp: %p level = %d\n", sp, level);
+ return;
+ }
+
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ audit_printk("notrap spte in unsync sp: %p\n", sp);
+ return;
+ }
+ }
+
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ audit_printk("notrap spte in direct sp: %p\n", sp);
+ return;
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
+
+ hpa = pfn << PAGE_SHIFT;
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
+ vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+}
+
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ audit_printk("no memslot for gfn %llx\n", gfn);
+ audit_printk("index %ld of sp (gfn=%llx)\n",
+ (long int)(sptep - rev_sp->spt), rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ audit_printk("no rmap for writable spte %llx\n", *sptep);
+ dump_stack();
+ }
+}
+
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
+}
+
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+}
+
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int i;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
+ continue;
+
+ inspect_spte_has_rmap(kvm, sp->spt + i);
+ }
+}
+
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ u64 *spte;
+
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;
+
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ audit_printk("shadow page has writable mappings: gfn "
+ "%llx role %x\n", sp->gfn, sp->role.word);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+}
+
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+ audit_spte_after_sync(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
+ audit_point = point;
+ audit_all_active_sps(vcpu->kvm);
+ audit_vcpu_spte(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+ int ret;
+
+ if (mmu_audit)
+ return;
+
+ ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ WARN_ON(ret);
+
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ tracepoint_synchronize_unregister();
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = strict_strtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,11 +190,30 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
TP_ARGS(sp)
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_PROTO(struct kvm_mmu_page *sp),
TP_ARGS(sp)
);
+
+TRACE_EVENT(
+ kvm_mmu_audit,
+ TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+ TP_ARGS(vcpu, audit_point),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(int, audit_point)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->audit_point = audit_point;
+ ),
+
+ TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+ audit_point_name[__entry->audit_point])
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -66,6 +67,7 @@ struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
@@ -103,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
- if (is_nx(vcpu))
+ if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
#endif
return access;
@@ -112,32 +114,42 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
/*
* Fetch a guest pte for a guest virtual address
*/
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- int write_fault, int user_fault, int fetch_fault)
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gva_t addr, u32 access)
{
pt_element_t pte;
gfn_t table_gfn;
- unsigned index, pt_access, pte_access;
+ unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- int rsvd_fault = 0;
+ bool eperm, present, rsvd_fault;
+ int offset, write_fault, user_fault, fetch_fault;
+
+ write_fault = access & PFERR_WRITE_MASK;
+ user_fault = access & PFERR_USER_MASK;
+ fetch_fault = access & PFERR_FETCH_MASK;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
walk:
- walker->level = vcpu->arch.mmu.root_level;
- pte = vcpu->arch.cr3;
+ present = true;
+ eperm = rsvd_fault = false;
+ walker->level = mmu->root_level;
+ pte = mmu->get_cr3(vcpu);
+
#if PTTYPE == 64
- if (!is_long_mode(vcpu)) {
- pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ goto error;
+ }
--walker->level;
}
#endif
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+ (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
pt_access = ACC_ALL;
@@ -145,42 +157,49 @@ walk:
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
- pte_gpa = gfn_to_gpa(table_gfn);
- pte_gpa += index * sizeof(pt_element_t);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
- goto not_present;
+ if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
+ offset, sizeof(pte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK)) {
+ present = false;
+ break;
+ }
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ break;
+ }
- rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
- if (rsvd_fault)
- goto access_error;
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
+ rsvd_fault = true;
+ break;
+ }
if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
- goto access_error;
+ eperm = true;
if (user_fault && !(pte & PT_USER_MASK))
- goto access_error;
+ eperm = true;
#if PTTYPE == 64
if (fetch_fault && (pte & PT64_NX_MASK))
- goto access_error;
+ eperm = true;
#endif
- if (!(pte & PT_ACCESSED_MASK)) {
+ if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
@@ -194,17 +213,28 @@ walk:
(PTTYPE == 64 || is_pse(vcpu))) ||
((walker->level == PT_PDPE_LEVEL) &&
is_large_pte(pte) &&
- is_long_mode(vcpu))) {
+ mmu->root_level == PT64_ROOT_LEVEL)) {
int lvl = walker->level;
+ gpa_t real_gpa;
+ gfn_t gfn;
+ u32 ac;
- walker->gfn = gpte_to_gfn_lvl(pte, lvl);
- walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
- >> PAGE_SHIFT;
+ gfn = gpte_to_gfn_lvl(pte, lvl);
+ gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
if (PTTYPE == 32 &&
walker->level == PT_DIRECTORY_LEVEL &&
is_cpuid_PSE36())
- walker->gfn += pse36_gfn_delta(pte);
+ gfn += pse36_gfn_delta(pte);
+
+ ac = write_fault | fetch_fault | user_fault;
+
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
+ ac);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
break;
}
@@ -213,15 +243,18 @@ walk:
--walker->level;
}
+ if (!present || eperm || rsvd_fault)
+ goto error;
+
if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
if (ret)
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
@@ -229,30 +262,44 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pt_access, pte_access);
+ __func__, (u64)pte, pte_access, pt_access);
return 1;
-not_present:
+error:
walker->error_code = 0;
- goto err;
+ if (present)
+ walker->error_code |= PFERR_PRESENT_MASK;
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
+ walker->error_code |= write_fault | user_fault;
-err:
- if (write_fault)
- walker->error_code |= PFERR_WRITE_MASK;
- if (user_fault)
- walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
+ if (fetch_fault && mmu->nx)
walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
+
+ vcpu->arch.fault.address = addr;
+ vcpu->arch.fault.error_code = walker->error_code;
+
trace_kvm_mmu_walker_error(walker->error_code);
return 0;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+ access);
+}
+
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+ addr, access);
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
@@ -263,7 +310,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!is_present_gpte(gpte)) {
- if (page->unsync)
+ if (sp->unsync)
new_spte = shadow_trap_nonpresent_pte;
else
new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +319,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +332,95 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
* we call mmu_set_spte() with reset_host_protection = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ pt_element_t gpte;
+ unsigned pte_access;
+ gfn_t gfn;
+ pfn_t pfn;
+ bool dirty;
+
+ if (spte == sptep)
+ continue;
+
+ if (*spte != shadow_trap_nonpresent_pte)
+ continue;
+
+ gpte = gptep[i];
+
+ if (!is_present_gpte(gpte) ||
+ is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ if (!sp->unsync)
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
+ continue;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ continue;
+
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ gfn = gpte_to_gfn(gpte);
+ dirty = is_dirty_gpte(gpte);
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ (pte_access & ACC_WRITE_MASK) && dirty);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ pfn, true, true);
+ }
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -299,75 +430,87 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
- struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep = NULL;
- int direct;
- gfn_t table_gfn;
- int r;
- int level;
- pt_element_t curr_pte;
- struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp = NULL;
+ bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+ int top_level;
+ unsigned direct_access;
+ struct kvm_shadow_walk_iterator it;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- for_each_shadow_entry(vcpu, addr, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
- if (iterator.level == hlevel) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, level,
- gw->gfn, pfn, false, true);
- break;
- }
+ direct_access = gw->pt_access & gw->pte_access;
+ if (!dirty)
+ direct_access &= ~ACC_WRITE_MASK;
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- continue;
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
- if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
- if (level <= gw->level) {
- int delta = level - gw->level + 1;
- direct = 1;
- if (!is_dirty_gpte(gw->ptes[level - delta]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
- /* advance table_gfn when emulating 1gb pages with 4k */
- if (delta == 0)
- table_gfn += PT_INDEX(addr, level);
- access &= gw->pte_access;
- } else {
- direct = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- direct, access, sptep);
- if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access, it.sptep);
}
- spte = __pa(shadow_page->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(it.sptep, sp);
+ }
+
+ for (;
+ shadow_walk_okay(&it) && it.level > hlevel;
+ shadow_walk_next(&it)) {
+ gfn_t direct_gfn;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+ true, direct_access, it.sptep);
+ link_shadow_page(it.sptep, sp);
}
- return sptep;
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ user_fault, write_fault, dirty, ptwrite, it.level,
+ gw->gfn, pfn, false, true);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+
+ return it.sptep;
+
+out_gpte_changed:
+ if (sp)
+ kvm_mmu_put_page(sp, it.sptep);
+ kvm_release_pfn_clean(pfn);
+ return NULL;
}
/*
@@ -389,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
- int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
u64 *sptep;
int write_pt = 0;
@@ -399,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
unsigned long mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- kvm_mmu_audit(vcpu, "pre page fault");
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -408,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* Look up the guest pte for the faulting address.
*/
- r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
- fetch_fault);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu, addr, walker.error_code);
+ inject_page_fault(vcpu);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
return 0;
}
@@ -431,18 +571,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- pgprintk("gfn %lx is mmio\n", walker.gfn);
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
+
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
+ (void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -450,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
@@ -464,6 +604,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
gpa_t pte_gpa = -1;
int level;
u64 *sptep;
@@ -475,10 +616,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
+ sp = page_header(__pa(sptep));
if (is_last_spte(*sptep, level)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
int offset, shift;
+ if (!sp->unsync)
+ break;
+
shift = PAGE_SHIFT -
(PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
offset = sp->role.quadrant << shift;
@@ -487,16 +631,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
+ drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte);
need_flush = 1;
- }
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
- if (!is_shadow_present_pte(*sptep))
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
@@ -522,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr,
- !!(access & PFERR_WRITE_MASK),
- !!(access & PFERR_USER_MASK),
- !!(access & PFERR_FETCH_MASK));
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ } else if (error)
+ *error = walker.error_code;
+
+ return gpa;
+}
+
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+ u32 access, u32 *error)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
@@ -570,9 +730,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ bool clear_unsync)
{
int i, offset, nr_present;
bool reset_host_protection;
@@ -580,6 +740,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
offset = nr_present = 0;
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -589,7 +752,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
- gfn_t gfn = sp->gfns[i];
+ gfn_t gfn;
if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -600,16 +763,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
- !(gpte & PT_ACCESSED_MASK)) {
+ gfn = gpte_to_gfn(gpte);
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
+ || gfn != sp->gfns[i] || !is_present_gpte(gpte)
+ || !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
- rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_gpte(gpte))
+ if (is_present_gpte(gpte) || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- __set_spte(&sp->spt[i], nonpresent);
+ drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -87,6 +88,14 @@ struct nested_state {
/* A VMEXIT is required but not yet emulated */
bool exit_required;
+ /*
+ * If we vmexit during an instruction emulation we need this to restore
+ * the l1 guest rip after the emulation
+ */
+ unsigned long vmexit_rip;
+ unsigned long vmexit_rsp;
+ unsigned long vmexit_rax;
+
/* cache for intercepts of the guest */
u16 intercept_cr_read;
u16 intercept_cr_write;
@@ -95,6 +104,8 @@ struct nested_state {
u32 intercept_exceptions;
u64 intercept;
+ /* Nested Paging related state */
+ u64 nested_cr3;
};
#define MSRPM_OFFSETS 16
@@ -130,7 +141,7 @@ static struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
- { .index = MSR_K6_STAR, .always = true },
+ { .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
@@ -283,13 +294,22 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
force_new_asid(vcpu);
}
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return PT64_ROOT_LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ vcpu->arch.efer = efer;
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.efer = efer;
}
static int is_external_interrupt(u32 info)
@@ -383,8 +403,7 @@ static void svm_init_erratum_383(void)
int err;
u64 val;
- /* Only Fam10h is affected */
- if (boot_cpu_data.x86 != 0x10)
+ if (!cpu_has_amd_erratum(amd_erratum_383))
return;
/* Use _safe variants to not break nested virtualization */
@@ -640,7 +659,7 @@ static __init int svm_hardware_setup(void)
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME);
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
for_each_possible_cpu(cpu) {
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 g_tsc_offset = 0;
+
+ if (is_nested(svm)) {
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = offset;
+ }
+
+ svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+}
+
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.tsc_offset += adjustment;
+ if (is_nested(svm))
+ svm->nested.hsave->control.tsc_offset += adjustment;
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -766,7 +808,6 @@ static void init_vmcb(struct vcpu_svm *svm)
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
- control->tsc_offset = 0;
control->int_ctl = V_INTR_MASKING_MASK;
init_seg(&save->es);
@@ -794,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- save->efer = EFER_SVME;
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
save->rflags = 2;
@@ -805,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm)
* This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
- svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+ svm->vcpu.arch.cr0 = 0;
+ (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -902,14 +943,20 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
+ kvm_write_tsc(&svm->vcpu, 0);
+
+ err = fx_init(&svm->vcpu);
+ if (err)
+ goto free_page4;
- fx_init(&svm->vcpu);
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
+free_page4:
+ __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -942,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int i;
if (unlikely(cpu != vcpu->cpu)) {
- u64 delta;
-
- if (check_tsc_unstable()) {
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- delta = vcpu->arch.host_tsc - native_read_tsc();
- svm->vmcb->control.tsc_offset += delta;
- if (is_nested(svm))
- svm->nested.hsave->control.tsc_offset += delta;
- }
- vcpu->cpu = cpu;
- kvm_migrate_timers(vcpu);
svm->asid_generation = 0;
}
@@ -971,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
++vcpu->stat.host_state_reload;
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- vcpu->arch.host_tsc = native_read_tsc();
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -990,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
break;
default:
BUG();
@@ -1201,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (old == new) {
/* cr0 write with ts and mp unchanged */
svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+ if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
+ svm->nested.vmexit_rip = kvm_rip_read(vcpu);
+ svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
return;
+ }
}
}
@@ -1488,7 +1523,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
return;
}
@@ -1535,7 +1570,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1576,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm)
return 1;
}
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.nested_cr3;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+ unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = root;
+ force_new_asid(vcpu);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
+ svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+
+ nested_svm_vmexit(svm);
+}
+
+static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
+
+ vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu.shadow_root_level = get_npt_level();
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+
+ return r;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1624,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
return false;
+ /*
+ * if vmexit was already requested (by intercepted exception
+ * for instance) do not overwrite it with "external interrupt"
+ * vmexit.
+ */
+ if (svm->nested.exit_required)
+ return false;
+
svm->vmcb->control.exit_code = SVM_EXIT_INTR;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
@@ -1891,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.efer = svm->vcpu.arch.efer;
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
nested_vmcb->save.cr2 = vmcb->save.cr2;
@@ -1912,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1942,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
+ svm->nested.nested_cr3 = 0;
+
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
svm->vmcb->save.cs = hsave->save.cs;
@@ -1957,7 +2052,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = hsave->save.cr3;
svm->vcpu.arch.cr3 = hsave->save.cr3;
} else {
- kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -1968,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_unmap(page);
+ nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -2007,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
return true;
}
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+ if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ return false;
+
+ if (vmcb->control.asid == 0)
+ return false;
+
+ if (vmcb->control.nested_ctl && !npt_enabled)
+ return false;
+
+ return true;
+}
+
static bool nested_svm_vmrun(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
@@ -2021,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (!nested_vmcb)
return false;
- trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
nested_vmcb->save.rip,
nested_vmcb->control.int_ctl,
nested_vmcb->control.event_inj,
@@ -2050,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = vmcb->save.rflags;
- hsave->save.rip = svm->next_rip;
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
if (npt_enabled)
@@ -2065,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+ if (nested_vmcb->control.nested_ctl) {
+ kvm_mmu_unload(&svm->vcpu);
+ svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+ nested_svm_init_mmu_context(&svm->vcpu);
+ }
+
/* Load the nested guest state */
svm->vmcb->save.es = nested_vmcb->save.es;
svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2080,7 +2207,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
} else
- kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
/* Guest paging mode is active - reset mmu */
kvm_mmu_reset_context(&svm->vcpu);
@@ -2222,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
+ /* Save rip after vmrun instruction */
+ kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
if (!nested_svm_vmrun(svm))
return 1;
@@ -2252,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
enable_gif(svm);
@@ -2386,16 +2514,29 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
+static int cr0_write_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ int r;
+
+ r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+
+ if (svm->nested.vmexit_rip) {
+ kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
+ svm->nested.vmexit_rip = 0;
+ }
+
+ return r == EMULATE_DONE;
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2572,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = tsc_offset + native_read_tsc();
break;
}
- case MSR_K6_STAR:
+ case MSR_STAR:
*data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
@@ -2541,21 +2682,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
- case MSR_IA32_TSC: {
- u64 tsc_offset = data - native_read_tsc();
- u64 g_tsc_offset = 0;
-
- if (is_nested(svm)) {
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = tsc_offset;
- }
-
- svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
-
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, data);
break;
- }
- case MSR_K6_STAR:
+ case MSR_STAR:
svm->vmcb->save.star = data;
break;
#ifdef CONFIG_X86_64
@@ -2642,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
@@ -2671,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR4] = emulate_on_interception,
[SVM_EXIT_READ_CR8] = emulate_on_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -2726,6 +2857,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_NPF] = pf_interception,
};
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("cr_read: %04x\n", control->intercept_cr_read);
+ pr_err("cr_write: %04x\n", control->intercept_cr_write);
+ pr_err("dr_read: %04x\n", control->intercept_dr_read);
+ pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("exceptions: %08x\n", control->intercept_exceptions);
+ pr_err("intercepts: %016llx\n", control->intercept);
+ pr_err("pause filter count: %d\n", control->pause_filter_count);
+ pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
+ pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
+ pr_err("tsc_offset: %016llx\n", control->tsc_offset);
+ pr_err("asid: %d\n", control->asid);
+ pr_err("tlb_ctl: %d\n", control->tlb_ctl);
+ pr_err("int_ctl: %08x\n", control->int_ctl);
+ pr_err("int_vector: %08x\n", control->int_vector);
+ pr_err("int_state: %08x\n", control->int_state);
+ pr_err("exit_code: %08x\n", control->exit_code);
+ pr_err("exit_info1: %016llx\n", control->exit_info_1);
+ pr_err("exit_info2: %016llx\n", control->exit_info_2);
+ pr_err("exit_int_info: %08x\n", control->exit_int_info);
+ pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
+ pr_err("nested_ctl: %lld\n", control->nested_ctl);
+ pr_err("nested_cr3: %016llx\n", control->nested_cr3);
+ pr_err("event_inj: %08x\n", control->event_inj);
+ pr_err("event_inj_err: %08x\n", control->event_inj_err);
+ pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
+ pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("cr0: %016llx cr2: %016llx\n",
+ save->cr0, save->cr2);
+ pr_err("cr3: %016llx cr4: %016llx\n",
+ save->cr3, save->cr4);
+ pr_err("dr6: %016llx dr7: %016llx\n",
+ save->dr6, save->dr7);
+ pr_err("rip: %016llx rflags: %016llx\n",
+ save->rip, save->rflags);
+ pr_err("rsp: %016llx rax: %016llx\n",
+ save->rsp, save->rax);
+ pr_err("star: %016llx lstar: %016llx\n",
+ save->star, save->lstar);
+ pr_err("cstar: %016llx sfmask: %016llx\n",
+ save->cstar, save->sfmask);
+ pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
+ save->kernel_gs_base, save->sysenter_cs);
+ pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
+ save->sysenter_esp, save->sysenter_eip);
+ pr_err("gpat: %016llx dbgctl: %016llx\n",
+ save->g_pat, save->dbgctl);
+ pr_err("br_from: %016llx br_to: %016llx\n",
+ save->br_from, save->br_to);
+ pr_err("excp_from: %016llx excp_to: %016llx\n",
+ save->last_excp_from, save->last_excp_to);
+
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,12 +2994,15 @@ static int handle_exit(struct kvm_vcpu *vcpu)
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
return 0;
}
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+ exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
"exit_code 0x%x\n",
__func__, svm->vmcb->control.exit_int_info,
@@ -2826,9 +3053,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- trace_kvm_inj_virq(irq);
-
- ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +3066,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
BUG_ON(!(gif_set(svm)));
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
@@ -2992,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->int3_injected = 0;
- if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+ if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
svm->vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(&svm->vcpu);
@@ -3002,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
if (!(exitintinfo & SVM_EXITINTINFO_VALID))
return;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
@@ -3038,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
}
}
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(svm);
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#else
@@ -3067,13 +3309,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
save_host_msrs(vcpu);
- fs_selector = kvm_read_fs();
- gs_selector = kvm_read_gs();
+ savesegment(fs, fs_selector);
+ savesegment(gs, gs_selector);
ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
- /* required for live migration with NPT */
- if (npt_enabled)
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
clgi();
@@ -3155,10 +3394,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- kvm_load_fs(fs_selector);
- kvm_load_gs(gs_selector);
- kvm_load_ldt(ldt_selector);
load_host_msrs(vcpu);
+ loadsegment(fs, fs_selector);
+#ifdef CONFIG_X86_64
+ load_gs_index(gs_selector);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, gs_selector);
+#endif
+ kvm_load_ldt(ldt_selector);
reload_tss(vcpu);
@@ -3190,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
- return;
- }
-
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
}
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = root;
+
+ /* Also sync guest cr3 here in case we live migrate */
+ svm->vmcb->save.cr3 = vcpu->arch.cr3;
+
+ force_new_asid(vcpu);
+}
+
static int is_disabled(void)
{
u64 vm_cr;
@@ -3232,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
-#else
- return PT32E_ROOT_LEVEL;
-#endif
-}
-
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
@@ -3253,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
+ case 0x80000001:
+ if (nested)
+ entry->ecx |= (1 << 2); /* Set SVM bit */
+ break;
case 0x8000000A:
entry->eax = 1; /* SVM revision 1 */
entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
ASID emulation to nested SVM */
entry->ecx = 0; /* Reserved */
- entry->edx = 0; /* Do not support any additional features */
+ entry->edx = 0; /* Per default do not support any
+ additional features */
+
+ /* Support next_rip if host supports it */
+ if (svm_has(SVM_FEATURE_NRIP))
+ entry->edx |= SVM_FEATURE_NRIP;
+
+ /* Support NPT for the guest if enabled */
+ if (npt_enabled)
+ entry->edx |= SVM_FEATURE_NPT;
break;
}
@@ -3327,6 +3581,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3391,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
.queue_exception = svm_queue_exception,
+ .cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
@@ -3411,6 +3671,13 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .write_tsc_offset = svm_write_tsc_offset,
+ .adjust_tsc_offset = svm_adjust_tsc_offset,
+
+ .set_tdp_cr3 = set_tdp_cr3,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
/* FIXME: this code should not know anything about vcpus */
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}
if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
#include "trace.h"
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -119,6 +125,7 @@ struct vcpu_vmx {
unsigned long host_rsp;
int launched;
u8 fail;
+ u32 exit_intr_info;
u32 idt_vectoring_info;
struct shared_msr_entry *guest_msrs;
int nmsrs;
@@ -148,11 +155,6 @@ struct vcpu_vmx {
u32 limit;
u32 ar;
} tr, es, ds, fs, gs;
- struct {
- bool pending;
- u8 vector;
- unsigned rip;
- } irq;
} rmode;
int vpid;
bool emulation_required;
@@ -173,10 +175,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +236,14 @@ static u64 host_efer;
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
@@ -334,6 +339,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +359,16 @@ static inline bool cpu_has_vmx_invept_global(void)
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +409,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -453,6 +479,19 @@ static void vmcs_clear(struct vmcs *vmcs)
vmcs, phys_addr);
}
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc", "memory");
+ if (error)
+ printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ vmcs, phys_addr);
+}
+
static void __vcpu_clear(void *arg)
{
struct vcpu_vmx *vmx = arg;
@@ -462,7 +501,6 @@ static void __vcpu_clear(void *arg)
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- rdtscll(vmx->vcpu.arch.host_tsc);
list_del(&vmx->local_vcpus_link);
vmx->vcpu.cpu = -1;
vmx->launched = 0;
@@ -475,12 +513,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
if (vmx->vpid == 0)
return;
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vmx);
+ else
+ vpid_sync_vcpu_global();
}
static inline void ept_sync_global(void)
@@ -648,11 +701,10 @@ static void reload_tss(void)
/*
* VT restores TR but not its size. Useless.
*/
- struct desc_ptr gdt;
+ struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
struct desc_struct *descs;
- native_store_gdt(&gdt);
- descs = (void *)gdt.address;
+ descs = (void *)gdt->address;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
}
@@ -695,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr gdt;
+ struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
struct desc_struct *d;
unsigned long table_base;
unsigned long v;
@@ -703,8 +755,7 @@ static unsigned long segment_base(u16 selector)
if (!(selector & ~3))
return 0;
- native_store_gdt(&gdt);
- table_base = gdt.address;
+ table_base = gdt->address;
if (selector & 4) { /* from ldt */
u16 ldt_selector = kvm_read_ldt();
@@ -745,7 +796,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
- vmx->host_state.fs_sel = kvm_read_fs();
+ savesegment(fs, vmx->host_state.fs_sel);
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
@@ -753,7 +804,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
- vmx->host_state.gs_sel = kvm_read_gs();
+ savesegment(gs, vmx->host_state.gs_sel);
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
@@ -783,27 +834,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
- unsigned long flags;
-
if (!vmx->host_state.loaded)
return;
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
if (vmx->host_state.fs_reload_needed)
- kvm_load_fs(vmx->host_state.fs_sel);
+ loadsegment(fs, vmx->host_state.fs_sel);
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
- /*
- * If we have to reload gs, we must take care to
- * preserve our gs base.
- */
- local_irq_save(flags);
- kvm_load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+ load_gs_index(vmx->host_state.gs_sel);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, vmx->host_state.gs_sel);
#endif
- local_irq_restore(flags);
}
reload_tss();
#ifdef CONFIG_X86_64
@@ -812,6 +857,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
}
#endif
+ if (current_thread_info()->status & TS_USEDFPU)
+ clts();
+ load_gdt(&__get_cpu_var(host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,62 +876,47 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(vmx->vmcs);
- u64 tsc_this, delta, new_offset;
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- if (vcpu->cpu != cpu) {
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+ else if (vcpu->cpu != cpu)
vcpu_clear(vmx);
- kvm_migrate_timers(vcpu);
- set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
- local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
- }
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- u8 error;
-
per_cpu(current_vmcs, cpu) = vmx->vmcs;
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmx->vmcs, phys_addr);
+ vmcs_load(vmx->vmcs);
}
if (vcpu->cpu != cpu) {
- struct desc_ptr dt;
+ struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
unsigned long sysenter_esp;
- vcpu->cpu = cpu;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ local_irq_disable();
+ list_add(&vmx->local_vcpus_link,
+ &per_cpu(vcpus_on_cpu, cpu));
+ local_irq_enable();
+
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors.
*/
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- native_store_gdt(&dt);
- vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
+ vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
- /*
- * Make sure the time stamp counter is monotonous.
- */
- rdtscll(tsc_this);
- if (tsc_this < vcpu->arch.host_tsc) {
- delta = vcpu->arch.host_tsc - tsc_this;
- new_offset = vmcs_read64(TSC_OFFSET) + delta;
- vmcs_write64(TSC_OFFSET, new_offset);
- }
}
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
+ if (!vmm_exclusive) {
+ __vcpu_clear(to_vmx(vcpu));
+ kvm_cpu_vmxoff();
+ }
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -990,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
}
if (vmx->rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = nr;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (kvm_exception_is_soft(nr))
- vmx->rmode.irq.rip +=
- vmx->vcpu.arch.event_exit_inst_len;
- intr_info |= INTR_TYPE_SOFT_INTR;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -1057,10 +1082,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
- * MSR_K6_STAR is only needed on long mode guests, and only
+ * MSR_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
- index = __find_msr_index(vmx, MSR_K6_STAR);
+ index = __find_msr_index(vmx, MSR_STAR);
if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
@@ -1095,12 +1120,17 @@ static u64 guest_read_tsc(void)
}
/*
- * writes 'guest_tsc' into guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
+ * writes 'offset' into guest's timestamp counter offset register
*/
-static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
+ vmcs_write64(TSC_OFFSET, offset);
+}
+
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ u64 offset = vmcs_read64(TSC_OFFSET);
+ vmcs_write64(TSC_OFFSET, offset + adjustment);
}
/*
@@ -1173,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
- u64 host_tsc;
int ret = 0;
switch (msr_index) {
@@ -1203,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TSC:
- rdtscll(host_tsc);
- guest_write_tsc(data, host_tsc);
+ kvm_write_tsc(vcpu, data);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1286,6 +1314,13 @@ static __init int vmx_disabled_by_bios(void)
/* locked but not enabled */
}
+static void kvm_cpu_vmxon(u64 addr)
+{
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
@@ -1308,11 +1343,13 @@ static int hardware_enable(void *garbage)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&phys_addr), "m"(phys_addr)
- : "memory", "cc");
- ept_sync_global();
+ if (vmm_exclusive) {
+ kvm_cpu_vmxon(phys_addr);
+ ept_sync_global();
+ }
+
+ store_gdt(&__get_cpu_var(host_gdt));
return 0;
}
@@ -1334,13 +1371,15 @@ static void vmclear_local_vcpus(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static void hardware_disable(void *garbage)
{
- vmclear_local_vcpus();
- kvm_cpu_vmxoff();
+ if (vmm_exclusive) {
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+ }
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1578,8 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept()) {
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
}
@@ -1628,7 +1668,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
gfn_t base_gfn;
slots = kvm_memslots(kvm);
- base_gfn = kvm->memslots->memslots[0].base_gfn +
+ base_gfn = slots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
@@ -1759,9 +1799,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
- vpid_sync_vcpu_all(to_vmx(vcpu));
- if (enable_ept)
+ vpid_sync_context(to_vmx(vcpu));
+ if (enable_ept) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ }
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -1787,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
return;
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
- vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
- vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
- vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+ vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
}
}
static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
__set_bit(VCPU_EXREG_PDPTR,
@@ -2446,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
- u64 host_pat, tsc_this, tsc_base;
+ u64 host_pat;
unsigned long a;
struct desc_ptr dt;
int i;
@@ -2507,15 +2550,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
+ vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
- vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
#ifdef CONFIG_X86_64
rdmsrl(MSR_FS_BASE, a);
@@ -2587,33 +2630,34 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
- tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
- rdtscll(tsc_this);
- if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
- tsc_base = tsc_this;
-
- guest_write_tsc(0, tsc_base);
+ kvm_write_tsc(&vmx->vcpu, 0);
return 0;
}
static int init_rmode(struct kvm *kvm)
{
+ int idx, ret = 0;
+
+ idx = srcu_read_lock(&kvm->srcu);
if (!init_rmode_tss(kvm))
- return 0;
+ goto exit;
if (!init_rmode_identity_map(kvm))
- return 0;
- return 1;
+ goto exit;
+
+ ret = 1;
+exit:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
}
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret, idx;
+ int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2630,7 +2674,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- fx_init(&vmx->vcpu);
+ ret = fx_init(&vmx->vcpu);
+ if (ret != 0)
+ goto out;
seg_setup(VCPU_SREG_CS);
/*
@@ -2713,7 +2759,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
- vpid_sync_vcpu_all(vmx);
+ vpid_sync_context(vmx);
ret = 0;
@@ -2721,7 +2767,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2758,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = irq;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (vcpu->arch.interrupt.soft)
- vmx->rmode.irq.rip +=
- vmx->vcpu.arch.event_exit_inst_len;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
@@ -2799,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
++vcpu->stat.nmi_injections;
if (vmx->rmode.vm86_active) {
- vmx->rmode.irq.pending = true;
- vmx->rmode.irq.vector = NMI_VECTOR;
- vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- NMI_VECTOR | INTR_TYPE_SOFT_INTR |
- INTR_INFO_VALID_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2826,9 +2857,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
- else
- return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- GUEST_INTR_STATE_NMI);
+ return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3099,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3090,11 +3119,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
+static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ skip_emulated_instruction(vcpu);
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
int cr;
int reg;
+ int err;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -3105,16 +3143,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr0(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 3:
- kvm_set_cr3(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr3(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 4:
- kvm_set_cr4(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr4(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3263,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -3275,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
++vcpu->stat.irq_window_exits;
/*
@@ -3321,30 +3362,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
-static int handle_apic_access(struct kvm_vcpu *vcpu)
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
- enum emulation_result er;
- unsigned long offset;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- offset = exit_qualification & 0xffful;
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- er = emulate_instruction(vcpu, 0, 0, 0);
-
- if (er != EMULATE_DONE) {
- printk(KERN_ERR
- "Fail to handle apic access vmexit! Offset is 0x%lx\n",
- offset);
- return -ENOEXEC;
- }
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ skip_emulated_instruction(vcpu);
return 1;
}
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3536,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -3545,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
enum emulation_result err = EMULATE_DONE;
int ret = 1;
+ u32 cpu_exec_ctrl;
+ bool intr_window_requested;
+
+ cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
while (!guest_state_valid(vcpu)) {
+ if (intr_window_requested
+ && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+ return handle_interrupt_window(&vmx->vcpu);
+
err = emulate_instruction(vcpu, 0, 0, 0);
if (err == EMULATE_DO_MMIO) {
@@ -3554,13 +3600,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- ret = 0;
- goto out;
- }
+ if (err != EMULATE_DONE)
+ return 0;
if (signal_pending(current))
goto out;
@@ -3623,6 +3664,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3698,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
if (unlikely(vmx->fail)) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3709,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
vmcs_write32(TPR_THRESHOLD, irr);
}
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info;
- u32 idt_vectoring_info = vmx->idt_vectoring_info;
- bool unblock_nmi;
- u8 vector;
- int type;
- bool idtv_info_valid;
-
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ u32 exit_intr_info = vmx->exit_intr_info;
/* Handle machine checks before interrupts are enabled */
if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3735,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
asm("int $2");
kvm_after_handle_nmi(&vmx->vcpu);
}
+}
- idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info = vmx->exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
if (cpu_has_virtual_nmis()) {
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3758,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
vmx->vcpu.arch.nmi_injected = false;
kvm_clear_exception_queue(&vmx->vcpu);
@@ -3766,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
if (!idtv_info_valid)
return;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+
vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
@@ -3782,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ u32 err = vmcs_read32(error_code_field);
kvm_queue_exception_e(&vmx->vcpu, vector, err);
} else
kvm_queue_exception(&vmx->vcpu, vector);
break;
case INTR_TYPE_SOFT_INTR:
vmx->vcpu.arch.event_exit_inst_len =
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs_read32(instr_len_field);
/* fall through */
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3804,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
}
}
-/*
- * Failure to inject an interrupt should give us the information
- * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
- * when fetching the interrupt redirection bitmap in the real-mode
- * tss, this doesn't happen. So we do it ourselves.
- */
-static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
- vmx->rmode.irq.pending = 0;
- if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
- return;
- kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
- if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
- vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
- vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
- return;
- }
- vmx->idt_vectoring_info =
- VECTORING_INFO_VALID_MASK
- | INTR_TYPE_EXT_INTR
- | vmx->rmode.irq.vector;
+ __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(to_vmx(vcpu),
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
#ifdef CONFIG_X86_64
@@ -3861,11 +3917,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- /*
- * Loading guest fpu may have cleared host cr0.ts
- */
- vmcs_writel(HOST_CR0, read_cr0());
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3956,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory"
- , R"bx", R"di", R"si"
+ , R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#endif
@@ -3967,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if (vmx->rmode.irq.pending)
- fixup_rmode_irq(vmx);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
+ vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ vmx_complete_atomic_exit(vmx);
+ vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
@@ -4001,6 +4055,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vmx);
}
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+
+ vmcs_clear(vmcs);
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
+}
+
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
@@ -4026,10 +4093,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->vmcs)
goto free_msrs;
- vmcs_clear(vmx->vmcs);
+ vmcs_init(vmx->vmcs);
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx->vcpu.cpu = cpu;
err = vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
@@ -4245,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_irq = vmx_inject_irq,
.set_nmi = vmx_inject_nmi,
.queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
@@ -4265,6 +4334,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .adjust_tsc_offset = vmx_adjust_tsc_offset,
+
+ .set_tdp_cr3 = vmx_set_cr3,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,21 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/uaccess.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#include <asm/debugreg.h>
-#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/pvclock.h>
+#include <asm/div64.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -62,12 +67,13 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
@@ -147,6 +153,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -271,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
u32 prev_nr;
int class1, class2;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
if (!vcpu->arch.exception.pending) {
queue:
vcpu->arch.exception.pending = true;
@@ -285,7 +300,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
class1 = exception_class(prev_nr);
@@ -316,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
- u32 error_code)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
{
+ unsigned error_code = vcpu->arch.fault.error_code;
+
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = addr;
+ vcpu->arch.cr2 = vcpu->arch.fault.address;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
+void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+{
+ if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+ else
+ vcpu->arch.mmu.inject_page_fault(vcpu);
+
+ vcpu->arch.fault.nested = false;
+}
+
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -356,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
EXPORT_SYMBOL_GPL(kvm_require_cpl);
/*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t ngfn, void *data, int offset, int len,
+ u32 access)
+{
+ gfn_t real_gfn;
+ gpa_t ngpa;
+
+ ngpa = gfn_to_gpa(ngfn);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+ if (real_gfn == UNMAPPED_GVA)
+ return -EFAULT;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ void *data, int offset, int len, u32 access)
+{
+ return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+ data, offset, len, access);
+}
+
+/*
* Load the pae pdptrs. Return true is they are all valid.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
int ret;
- u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
- ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte));
+ ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
if (ret < 0) {
ret = 0;
goto out;
@@ -381,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
}
ret = 1;
- memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
@@ -394,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
bool changed = true;
+ int offset;
+ gfn_t gfn;
int r;
if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -405,130 +465,177 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+ gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
+ offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+ r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+ PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
goto out;
- changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+ changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:
return changed;
}
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+ X86_CR0_CD | X86_CR0_NW;
+
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
#endif
cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
- if (!is_pae(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!is_pae(vcpu))
+ return 1;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- kvm_inject_gp(vcpu, 0);
- return;
-
- }
+ if (cs_l)
+ return 1;
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
-
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.cr3))
+ return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
- kvm_mmu_reset_context(vcpu);
- return;
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ u64 xcr0;
+
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ xcr0 = xcr;
+ if (kvm_x86_ops->get_cpl(vcpu) != 0)
+ return 1;
+ if (!(xcr0 & XSTATE_FP))
+ return 1;
+ if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ return 1;
+ if (xcr0 & ~host_xcr0)
+ return 1;
+ vcpu->arch.xcr0 = xcr0;
+ vcpu->guest_xcr0_loaded = 0;
+ return 0;
+}
- if (cr4 & CR4_RESERVED_BITS) {
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
return;
+
+ /* Update OSXSAVE bit */
+ if (cpu_has_xsave && best->function == 0x1) {
+ best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+ if (cr4 & CR4_RESERVED_BITS)
+ return 1;
+
+ if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+ return 1;
if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+ return 1;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
- if (cr4 & X86_CR4_VMXE) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->arch.cr4 = cr4;
- kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & pdptr_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
- return;
+ return 0;
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
} else {
if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_PAE_RESERVED_BITS)
+ return 1;
+ if (is_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
}
/*
* We don't check reserved bits in nonpae mode, because
@@ -546,24 +653,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- kvm_inject_gp(vcpu, 0);
- else {
- vcpu->arch.cr3 = cr3;
- vcpu->arch.mmu.new_cr3(vcpu);
- }
+ return 1;
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- if (cr8 & CR8_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
+ return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (__kvm_set_cr8(vcpu, cr8))
+ kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
@@ -576,7 +687,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
case 0 ... 3:
@@ -585,29 +696,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +721,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ int res;
+
+ res = __kvm_set_dr(vcpu, dr, val);
+ if (res > 0)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ else if (res < 0)
+ kvm_inject_gp(vcpu, 0);
+
+ return res;
+}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
case 6:
*val = vcpu->arch.dr6;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
@@ -648,12 +760,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
- return 1 << (bitno & 31);
+ if (_kvm_get_dr(vcpu, dr, val)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ return 0;
}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,21 +787,25 @@ static u32 msrs_to_save[] = {
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
+ MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
};
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ u64 old_efer = vcpu->arch.efer;
+
if (efer & efer_reserved_bits)
return 1;
@@ -714,11 +834,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.efer = efer;
-
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
return 0;
}
@@ -770,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
/*
* The guest calculates current wall clock time by adding
- * system time (updated by kvm_write_guest_time below) to the
+ * system time (updated by kvm_guest_time_update below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
@@ -798,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
return quotient;
}
-static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ s8 *pshift, u32 *pmultiplier)
{
- uint64_t nsecs = 1000000000LL;
+ uint64_t scaled64;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
- tps64 = tsc_khz * 1000LL;
- while (tps64 > nsecs*2) {
+ tps64 = base_khz * 1000LL;
+ scaled64 = scaled_khz * 1000LL;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
- while (tps32 <= (uint32_t)nsecs) {
- tps32 <<= 1;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
shift++;
}
- hv_clock->tsc_shift = shift;
- hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
- pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __func__, tsc_khz, hv_clock->tsc_shift,
- hv_clock->tsc_to_system_mul);
+ pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
+ __func__, base_khz, scaled_khz, shift, *pmultiplier);
+}
+
+static inline u64 get_kernel_ns(void)
+{
+ struct timespec ts;
+
+ WARN_ON(preemptible());
+ ktime_get_ts(&ts);
+ monotonic_to_bootbased(&ts);
+ return timespec_to_ns(&ts);
}
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+unsigned long max_tsc_khz;
-static void kvm_write_guest_time(struct kvm_vcpu *v)
+static inline int kvm_tsc_changes_freq(void)
+{
+ int cpu = get_cpu();
+ int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ cpufreq_quick_get(cpu) != 0;
+ put_cpu();
+ return ret;
+}
+
+static inline u64 nsec_to_cycles(u64 nsec)
+{
+ u64 ret;
+
+ WARN_ON(preemptible());
+ if (kvm_tsc_changes_freq())
+ printk_once(KERN_WARNING
+ "kvm: unreliable cycle conversion on adjustable rate TSC\n");
+ ret = nsec * __get_cpu_var(cpu_tsc_khz);
+ do_div(ret, USEC_PER_SEC);
+ return ret;
+}
+
+static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+{
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+ &kvm->arch.virtual_tsc_shift,
+ &kvm->arch.virtual_tsc_mult);
+ kvm->arch.virtual_tsc_khz = this_tsc_khz;
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+ vcpu->kvm->arch.virtual_tsc_mult,
+ vcpu->kvm->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.last_tsc_write;
+ return tsc;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ s64 sdiff;
+
+ spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = data - native_read_tsc();
+ ns = get_kernel_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+ sdiff = data - kvm->arch.last_tsc_write;
+ if (sdiff < 0)
+ sdiff = -sdiff;
+
+ /*
+ * Special case: close write to TSC within 5 seconds of
+ * another CPU is interpreted as an attempt to synchronize
+ * The 5 seconds is to accomodate host load / swapping as
+ * well as any reset of TSC during the boot process.
+ *
+ * In that case, for a reliable TSC, we can match TSC offsets,
+ * or make a best guest using elapsed value.
+ */
+ if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+ elapsed < 5ULL * NSEC_PER_SEC) {
+ if (!check_tsc_unstable()) {
+ offset = kvm->arch.last_tsc_offset;
+ pr_debug("kvm: matched tsc offset for %llu\n", data);
+ } else {
+ u64 delta = nsec_to_cycles(elapsed);
+ offset += delta;
+ pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+ }
+ ns = kvm->arch.last_tsc_nsec;
+ }
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = data;
+ kvm->arch.last_tsc_offset = offset;
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ /* Reset of TSC must disable overshoot protection below */
+ vcpu->arch.hv_clock.tsc_timestamp = 0;
+ vcpu->arch.last_tsc_write = data;
+ vcpu->arch.last_tsc_nsec = ns;
+}
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- struct timespec ts;
unsigned long flags;
struct kvm_vcpu_arch *vcpu = &v->arch;
void *shared_kaddr;
unsigned long this_tsc_khz;
+ s64 kernel_ns, max_kernel_ns;
+ u64 tsc_timestamp;
- if ((!vcpu->time_page))
- return;
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+ kernel_ns = get_kernel_ns();
+ this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
- this_tsc_khz = get_cpu_var(cpu_tsc_khz);
- if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
- kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = this_tsc_khz;
+ if (unlikely(this_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
}
- put_cpu_var(cpu_tsc_khz);
- /* Keep irq disabled to prevent changes to the clock */
- local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
- ktime_get_ts(&ts);
- monotonic_to_bootbased(&ts);
local_irq_restore(flags);
- /* With all the info we got, fill in the values */
+ if (!vcpu->time_page)
+ return 0;
+
+ /*
+ * Time as measured by the TSC may go backwards when resetting the base
+ * tsc_timestamp. The reason for this is that the TSC resolution is
+ * higher than the resolution of the other clock scales. Thus, many
+ * possible measurments of the TSC correspond to one measurement of any
+ * other clock, and so a spread of values is possible. This is not a
+ * problem for the computation of the nanosecond clock; with TSC rates
+ * around 1GHZ, there can only be a few cycles which correspond to one
+ * nanosecond value, and any path through this code will inevitably
+ * take longer than that. However, with the kernel_ns value itself,
+ * the precision may be much lower, down to HZ granularity. If the
+ * first sampling of TSC against kernel_ns ends in the low part of the
+ * range, and the second in the high end of the range, we can get:
+ *
+ * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+ *
+ * As the sampling errors potentially range in the thousands of cycles,
+ * it is possible such a time value has already been observed by the
+ * guest. To protect against this, we must compute the system time as
+ * observed by the guest and ensure the new system time is greater.
+ */
+ max_kernel_ns = 0;
+ if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ max_kernel_ns = vcpu->last_guest_tsc -
+ vcpu->hv_clock.tsc_timestamp;
+ max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+ vcpu->hv_clock.tsc_to_system_mul,
+ vcpu->hv_clock.tsc_shift);
+ max_kernel_ns += vcpu->last_kernel_ns;
+ }
+
+ if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = this_tsc_khz;
+ }
- vcpu->hv_clock.system_time = ts.tv_nsec +
- (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+ if (max_kernel_ns > kernel_ns)
+ kernel_ns = max_kernel_ns;
+ /* With all the info we got, fill in the values */
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_kernel_ns = kernel_ns;
+ vcpu->last_guest_tsc = tsc_timestamp;
vcpu->hv_clock.flags = 0;
/*
@@ -874,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
kunmap_atomic(shared_kaddr, KM_USER0);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
-}
-
-static int kvm_request_guest_time_update(struct kvm_vcpu *v)
-{
- struct kvm_vcpu_arch *vcpu = &v->arch;
-
- if (!vcpu->time_page)
- return 0;
- set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
- return 1;
+ return 0;
}
static bool msr_mtrr_valid(unsigned msr)
@@ -1209,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
vcpu->arch.time = data;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
if (!(data & 1))
@@ -1224,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_release_page_clean(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
-
- kvm_request_guest_time_update(vcpu);
break;
}
case MSR_IA32_MCG_CTL:
@@ -1262,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but a recommended workaround from
+ * AMD for these chips. It is possible to speicify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1454,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case 0xcd: /* fsb frequency */
data = 3;
break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ data = 1 << 24;
+ break;
case MSR_IA32_APICBASE:
data = kvm_get_apic_base(vcpu);
break;
@@ -1487,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
return get_msr_mce(vcpu, msr, pdata);
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide expected ramp-up count for K7. All other
+ * are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ data = 0x20000000;
+ break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1524,16 +1836,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
{
int i, idx;
- vcpu_load(vcpu);
-
idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- vcpu_put(vcpu);
-
return i;
}
@@ -1618,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1950,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
+ case KVM_CAP_XCRS:
+ r = cpu_has_xsave;
+ break;
default:
r = 0;
break;
@@ -1717,22 +2029,51 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
- unsigned long khz = cpufreq_quick_get(cpu);
- if (!khz)
- khz = tsc_khz;
- per_cpu(cpu_tsc_khz, cpu) = khz;
+ if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+ /* Make sure TSC doesn't go backwards */
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ native_read_tsc() - vcpu->arch.last_host_tsc;
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+ if (check_tsc_unstable()) {
+ kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+ vcpu->arch.tsc_catchup = 1;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ if (vcpu->cpu != cpu)
+ kvm_migrate_timers(vcpu);
+ vcpu->cpu = cpu;
}
- kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_fpu(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
+ vcpu->arch.last_host_tsc = native_read_tsc();
}
static int is_efer_nx(void)
@@ -1781,7 +2122,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
- vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +2139,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1820,11 +2160,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
- vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
return 0;
out:
@@ -1837,7 +2176,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
{
int r;
- vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1849,7 +2187,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
out:
cpuid->nent = vcpu->arch.cpuid_nent;
- vcpu_put(vcpu);
return r;
}
@@ -1901,19 +2238,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
- F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved, XSAVE, OSXSAVE */;
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C);
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
- 0 /* SKINIT */ | 0 /* WDT */;
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -1922,7 +2260,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
switch (function) {
case 0:
- entry->eax = min(entry->eax, (u32)0xb);
+ entry->eax = min(entry->eax, (u32)0xd);
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2318,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 0xd: {
+ int i;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ for (i = 1; *nent < maxnent; ++i) {
+ if (entry[i - 1].eax == 0 && i != 2)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
char signature[12] = "KVMKVMKVM\0\0";
u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2433,7 @@ out:
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
- vcpu_put(vcpu);
return 0;
}
@@ -2091,11 +2441,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2107,20 +2455,16 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
- vcpu_load(vcpu);
kvm_queue_interrupt(vcpu, irq->irq, false);
-
- vcpu_put(vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2140,7 +2484,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
- vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2155,7 +2498,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
out:
- vcpu_put(vcpu);
return r;
}
@@ -2188,7 +2530,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2555,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- vcpu_load(vcpu);
-
events->exception.injected =
vcpu->arch.exception.pending &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2579,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW);
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2589,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
- vcpu_load(vcpu);
-
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,7 +2611,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
vcpu->arch.sipi_vector = events->sipi_vector;
- vcpu_put(vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -2283,14 +2619,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- vcpu_load(vcpu);
-
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2631,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- vcpu_load(vcpu);
-
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
- vcpu_put(vcpu);
+ return 0;
+}
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (cpu_has_xsave)
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->xsave,
+ xstate_size);
+ else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->fxsave,
+ sizeof(struct i387_fxsave_struct));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XSTATE_FPSSE;
+ }
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+
+ if (cpu_has_xsave)
+ memcpy(&vcpu->arch.guest_fpu.state->xsave,
+ guest_xsave->region, xstate_size);
+ else {
+ if (xstate_bv & ~XSTATE_FPSSE)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+ guest_xsave->region, sizeof(struct i387_fxsave_struct));
+ }
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!cpu_has_xsave) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!cpu_has_xsave)
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[0].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
- struct kvm_lapic_state *lapic = NULL;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+ u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
@@ -2341,14 +2746,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
r = -EFAULT;
- if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+ if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
if (r)
goto out;
r = 0;
@@ -2464,9 +2869,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof mce))
goto out;
- vcpu_load(vcpu);
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
- vcpu_put(vcpu);
break;
}
case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2916,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
break;
}
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xcrs, argp,
+ sizeof(struct kvm_xcrs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
default:
r = -EINVAL;
}
out:
- kfree(lapic);
+ kfree(u.buffer);
return r;
}
@@ -2557,116 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
- return kvm->arch.n_alloc_mmu_pages;
-}
-
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (alias->flags & KVM_ALIAS_INVALID)
- continue;
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
- struct kvm_mem_aliases *aliases, *old_aliases;
-
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out;
-
- mutex_lock(&kvm->slots_lock);
-
- /* invalidate any gfn reference in case of deletion/shrinking */
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
- aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kvm_mmu_zap_all(kvm);
- kfree(old_aliases);
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out_unlock;
-
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
- p = &aliases->aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
- p->flags &= ~(KVM_ALIAS_INVALID);
-
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (aliases->aliases[n - 1].npages)
- break;
- aliases->naliases = n;
-
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kfree(old_aliases);
- r = 0;
-
-out_unlock:
- mutex_unlock(&kvm->slots_lock);
-out:
- return r;
+ return kvm->arch.n_max_mmu_pages;
}
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2702,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
- raw_spin_lock(&pic_irqchip(kvm)->lock);
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[0],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- raw_spin_unlock(&pic_irqchip(kvm)->lock);
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
- raw_spin_lock(&pic_irqchip(kvm)->lock);
+ spin_lock(&pic_irqchip(kvm)->lock);
memcpy(&pic_irqchip(kvm)->pics[1],
&chip->chip.pic,
sizeof(struct kvm_pic_state));
- raw_spin_unlock(&pic_irqchip(kvm)->lock);
+ spin_unlock(&pic_irqchip(kvm)->lock);
break;
case KVM_IRQCHIP_IOAPIC:
r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2797,7 +3147,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
unsigned long n;
unsigned long is_dirty = 0;
- unsigned long *dirty_bitmap = NULL;
mutex_lock(&kvm->slots_lock);
@@ -2812,27 +3161,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
- memset(dirty_bitmap, 0, n);
-
for (i = 0; !is_dirty && i < n/sizeof(long); i++)
is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
struct kvm_memslots *slots, *old_slots;
+ unsigned long *dirty_bitmap;
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots)
- goto out_free;
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
+ goto out;
+ memset(dirty_bitmap, 0, n);
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
@@ -2841,13 +3193,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
synchronize_srcu_expedited(&kvm->srcu);
dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
+ vfree(dirty_bitmap);
+ } else {
+ r = -EFAULT;
+ if (clear_user(log->dirty_bitmap, n))
+ goto out;
}
r = 0;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- r = -EFAULT;
-out_free:
- vfree(dirty_bitmap);
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -2867,7 +3226,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
- struct kvm_memory_alias alias;
struct kvm_pit_config pit_config;
} u;
@@ -2888,22 +3246,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
break;
}
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
- struct kvm_userspace_memory_region kvm_userspace_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- kvm_userspace_mem.slot = kvm_mem.slot;
- kvm_userspace_mem.flags = kvm_mem.flags;
- kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
- kvm_userspace_mem.memory_size = kvm_mem.memory_size;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
- break;
- }
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
if (r)
@@ -2912,14 +3254,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS:
- r = -EFAULT;
- if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
- if (r)
- goto out;
- break;
case KVM_CREATE_IRQCHIP: {
struct kvm_pic *vpic;
@@ -3123,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
case KVM_SET_CLOCK: {
- struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
s64 delta;
@@ -3137,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = 0;
- ktime_get_ts(&now);
- now_ns = timespec_to_ns(&now);
+ local_irq_disable();
+ now_ns = get_kernel_ns();
delta = user_ns.clock - now_ns;
+ local_irq_enable();
kvm->arch.kvmclock_offset = delta;
break;
}
case KVM_GET_CLOCK: {
- struct timespec now;
struct kvm_clock_data user_ns;
u64 now_ns;
- ktime_get_ts(&now);
- now_ns = timespec_to_ns(&now);
+ local_irq_disable();
+ now_ns = get_kernel_ns();
user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+ local_irq_enable();
user_ns.flags = 0;
r = -EFAULT;
@@ -3214,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_segment(vcpu, var, seg);
}
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+ return gpa;
+}
+
+static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+ gpa_t t_gpa;
+ u32 error;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
+ if (t_gpa == UNMAPPED_GVA)
+ vcpu->arch.fault.nested = true;
+
+ return t_gpa;
+}
+
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
}
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
- return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3248,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+ error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -3259,7 +3615,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
}
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3303,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
- PFERR_WRITE_MASK, error);
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+ PFERR_WRITE_MASK,
+ error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -3315,7 +3672,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
}
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3330,10 +3687,10 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3700,10 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3725,12 @@ mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_IO_NEEDED;
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3748,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3774,11 @@ mmio:
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+ memcpy(vcpu->run->mmio.data, val, bytes);
return X86EMUL_CONTINUE;
}
@@ -3431,6 +3786,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3438,16 +3794,17 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ vcpu);
}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
#define CMPXCHG_TYPE(t, ptr, old, new) \
(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3820,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3484,6 +3842,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ goto emul_write;
+ }
kaddr = kmap_atomic(page, KM_USER0);
kaddr += offset_in_page(gpa);
@@ -3516,7 +3878,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, vcpu);
+ return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3541,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
if (vcpu->arch.pio.count)
goto data_avail;
- trace_kvm_pio(1, port, size, 1);
+ trace_kvm_pio(0, port, size, 1);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 1;
@@ -3569,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
const void *val, unsigned int count,
struct kvm_vcpu *vcpu)
{
- trace_kvm_pio(0, port, size, 1);
+ trace_kvm_pio(1, port, size, 1);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 0;
@@ -3604,42 +3966,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- kvm_x86_ops->fpu_activate(vcpu);
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ }
+ wbinvd();
return X86EMUL_CONTINUE;
}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
{
- return kvm_get_dr(ctxt->vcpu, dr, dest);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
+ return X86EMUL_CONTINUE;
}
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
- return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+ return _kvm_get_dr(vcpu, dr, dest);
}
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
{
- u8 opcodes[4];
- unsigned long rip = kvm_rip_read(vcpu);
- unsigned long rip_linear;
- if (!printk_ratelimit())
- return;
-
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+ return __kvm_set_dr(vcpu, dr, value);
}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
@@ -3674,27 +4032,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
return value;
}
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
{
+ int res = 0;
+
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
- kvm_set_cr3(vcpu, val);
+ res = kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
+ res = __kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
}
+
+ return res;
}
static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +4070,17 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
kvm_x86_ops->get_gdt(vcpu, dt);
}
+static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->get_idt(vcpu, dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(int seg,
+ struct kvm_vcpu *vcpu)
+{
+ return get_segment_base(vcpu, seg);
+}
+
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
struct kvm_vcpu *vcpu)
{
@@ -3779,11 +4153,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
kvm_set_segment(vcpu, &kvm_seg, seg);
}
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
@@ -3797,11 +4166,16 @@ static struct x86_emulate_ops emulate_ops = {
.set_cached_descriptor = emulator_set_cached_descriptor,
.get_segment_selector = emulator_get_segment_selector,
.set_segment_selector = emulator_set_segment_selector,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
- .set_rflags = emulator_set_rflags,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr = kvm_set_msr,
+ .get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +4186,126 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = ~0;
}
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception == PF_VECTOR)
+ kvm_propagate_fault(vcpu);
+ else if (ctxt->error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception);
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int cs_db, cs_l;
+
+ cache_all_regs(vcpu);
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
+ vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
+ vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+ ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+
+ if (ret != X86EMUL_CONTINUE)
+ return EMULATE_FAIL;
+
+ vcpu->arch.emulate_ctxt.eip = c->eip;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+ if (irq == NMI_VECTOR)
+ vcpu->arch.nmi_pending = false;
+ else
+ vcpu->arch.interrupt.pending = false;
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return EMULATE_FAIL;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+
+ if (tdp_enabled)
+ return false;
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-entetr the
+ * guest to let CPU execute the instruction.
+ */
+ if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+ return true;
+
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+ if (gpa == UNMAPPED_GVA)
+ return true; /* let cpu generate fault */
+
+ if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ return true;
+
+ return false;
+}
+
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
{
- int r, shadow_mask;
- struct decode_cache *c;
- struct kvm_run *run = vcpu->run;
+ int r;
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,28 +4317,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
*/
cache_all_regs(vcpu);
- vcpu->mmio_is_write = 0;
-
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- int cs_db, cs_l;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_VM86 : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ init_emulate_ctxt(vcpu);
+ vcpu->arch.emulate_ctxt.interruptibility = 0;
+ vcpu->arch.emulate_ctxt.exception = -1;
+ vcpu->arch.emulate_ctxt.perm_ok = false;
+
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ goto done;
+
trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
- c = &vcpu->arch.emulate_ctxt.decode;
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
@@ -3880,11 +4358,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu);
}
}
@@ -3893,52 +4371,44 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
-restart:
- r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
-
- if (r == 0)
- kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+ /* this is needed for vmware backdor interface to work since it
+ changes registers values during IO operation */
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
- if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- return EMULATE_DO_MMIO;
- }
+restart:
+ r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
- if (r || vcpu->mmio_is_write) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
- }
+ if (r == EMULATION_FAILED) {
+ if (reexecute_instruction(vcpu, cr2))
+ return EMULATE_DONE;
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- goto done;
- if (!vcpu->mmio_needed) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
- return EMULATE_DO_MMIO;
- }
-
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
- return EMULATE_DO_MMIO;
+ return handle_emulation_failure(vcpu);
}
done:
- if (vcpu->arch.exception.pending)
- vcpu->arch.emulate_ctxt.restart = false;
-
- if (vcpu->arch.emulate_ctxt.restart)
+ if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ inject_emulated_exception(vcpu);
+ r = EMULATE_DONE;
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
+ r = EMULATE_DO_MMIO;
+ } else if (vcpu->mmio_needed) {
+ if (vcpu->mmio_is_write)
+ vcpu->mmio_needed = 0;
+ r = EMULATE_DO_MMIO;
+ } else if (r == EMULATION_RESTART)
goto restart;
+ else
+ r = EMULATE_DONE;
- return EMULATE_DONE;
+ toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+ return r;
}
EXPORT_SYMBOL_GPL(emulate_instruction);
@@ -3952,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
-static void bounce_off(void *info)
+static void tsc_bad(void *info)
{
- /* nothing */
+ __get_cpu_var(cpu_tsc_khz) = 0;
+}
+
+static void tsc_khz_changed(void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long khz = 0;
+
+ if (data)
+ khz = freq->new;
+ else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ __get_cpu_var(cpu_tsc_khz) = khz;
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3965,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
+
if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
return 0;
if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
return 0;
- per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
continue;
- if (!kvm_request_guest_time_update(vcpu))
- continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != smp_processor_id())
- send_ipi++;
+ send_ipi = 1;
}
}
spin_unlock(&kvm_lock);
@@ -3997,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
}
return 0;
}
static struct notifier_block kvmclock_cpufreq_notifier_block = {
- .notifier_call = kvmclock_cpufreq_notifier
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+ break;
+ case CPU_DOWN_PREPARE:
+ smp_call_function_single(cpu, tsc_bad, NULL, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvmclock_cpu_notifier_block = {
+ .notifier_call = kvmclock_cpu_notifier,
+ .priority = -INT_MAX
};
static void kvm_timer_init(void)
{
int cpu;
+ max_tsc_khz = tsc_khz;
+ register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+ struct cpufreq_policy policy;
+ memset(&policy, 0, sizeof(policy));
+ cpufreq_get_policy(&policy, get_cpu());
+ if (policy.cpuinfo.max_freq)
+ max_tsc_khz = policy.cpuinfo.max_freq;
+#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- for_each_online_cpu(cpu) {
- unsigned long khz = cpufreq_get(cpu);
- if (!khz)
- khz = tsc_khz;
- per_cpu(cpu_tsc_khz, cpu) = khz;
- }
- } else {
- for_each_possible_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
}
+ pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
}
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4108,6 +4656,9 @@ int kvm_arch_init(void *opaque)
perf_register_guest_info_callbacks(&kvm_guest_cbs);
+ if (cpu_has_xsave)
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
return 0;
out:
@@ -4121,6 +4672,7 @@ void kvm_arch_exit(void)
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
+ unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -4270,7 +4822,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, vcpu);
+ return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,78 +5058,103 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
}
}
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
- kvm_mmu_unload(vcpu);
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
if (vcpu->requests) {
- if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
- if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
- kvm_write_guest_time(vcpu);
- if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_x86_ops->tlb_flush(vcpu);
- if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
- &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
}
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ inject_pending_event(vcpu);
+
+ /* enable NMI/IRQ window open exits if needed */
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_xcr0(vcpu);
- local_irq_disable();
+ atomic_set(&vcpu->guest_mode, 1);
+ smp_wmb();
- clear_bit(KVM_REQ_KICK, &vcpu->requests);
- smp_mb__after_clear_bit();
+ local_irq_disable();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+ || need_resched() || signal_pending(current)) {
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
preempt_enable();
+ kvm_x86_ops->cancel_injection(vcpu);
r = 1;
goto out;
}
- inject_pending_event(vcpu);
-
- /* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
-
- if (kvm_lapic_enabled(vcpu)) {
- update_cr8_intercept(vcpu);
- kvm_lapic_sync_to_vapic(vcpu);
- }
-
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
kvm_guest_enter();
@@ -4603,7 +5180,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
++vcpu->stat.exits;
@@ -4665,7 +5245,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
@@ -4717,8 +5297,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- vcpu_load(vcpu);
-
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -4733,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (!irqchip_in_kernel(vcpu->kvm))
kvm_set_cr8(vcpu, kvm_run->cr8);
- if (vcpu->arch.pio.count || vcpu->mmio_needed ||
- vcpu->arch.emulate_ctxt.restart) {
+ if (vcpu->arch.pio.count || vcpu->mmio_needed) {
if (vcpu->mmio_needed) {
memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
vcpu->mmio_read_completed = 1;
@@ -4743,7 +5320,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r == EMULATE_DO_MMIO) {
+ if (r != EMULATE_DONE) {
r = 0;
goto out;
}
@@ -4759,14 +5336,11 @@ out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +5363,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,7 +5392,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
- vcpu_put(vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -4842,8 +5412,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
{
struct desc_ptr dt;
- vcpu_load(vcpu);
-
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,55 +5443,43 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
- vcpu_put(vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
- int cs_db, cs_l, ret;
- cache_all_regs(vcpu);
-
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ int ret;
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_VM86 : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ init_emulate_ctxt(vcpu);
- ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+ ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
tss_selector, reason, has_error_code,
error_code);
if (ret)
return EMULATE_FAIL;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -4935,8 +5491,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits;
struct desc_ptr dt;
- vcpu_load(vcpu);
-
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4961,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
mmu_reset_needed = 1;
}
@@ -4996,7 +5550,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- vcpu_put(vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -5007,12 +5561,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
- vcpu_load(vcpu);
-
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto unlock_out;
+ goto out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5054,34 +5606,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
-unlock_out:
- vcpu_put(vcpu);
+out:
return r;
}
/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5621,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
- vcpu_load(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5628,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5119,16 +5646,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5139,61 +5663,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
{
- unsigned after_mxcsr_mask;
+ int err;
+
+ err = fpu_alloc(&vcpu->arch.guest_fpu);
+ if (err)
+ return err;
+
+ fpu_finit(&vcpu->arch.guest_fpu);
/*
- * Touch the fpu the first time in non atomic context as if
- * this is the first fpu instruction the exception handler
- * will fire before the instruction returns and it'll have to
- * allocate ram with GFP_KERNEL.
+ * Ensure guest xcr0 is valid for loading
*/
- if (!used_math())
- kvm_fx_save(&vcpu->arch.host_fx_image);
-
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_finit();
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
- preempt_enable();
+ vcpu->arch.xcr0 = XSTATE_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(fx_init);
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+ fpu_free(&vcpu->arch.guest_fpu);
+}
+
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;
+ /*
+ * Restore all possible states in the guest,
+ * and assume host would use all available bits.
+ * Guest xcr0 would be loaded later.
+ */
+ kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ unlazy_fpu(current);
+ fpu_restore_checking(&vcpu->arch.guest_fpu);
trace_kvm_fpu(1);
}
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ kvm_put_guest_xcr0(vcpu);
+
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
+ fpu_save_init(&vcpu->arch.guest_fpu);
++vcpu->stat.fpu_reload;
- set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
trace_kvm_fpu(0);
}
@@ -5204,12 +5730,18 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
+ if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ printk_once(KERN_WARNING
+ "kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
return kvm_x86_ops->vcpu_create(kvm, id);
}
@@ -5217,9 +5749,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5770,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5254,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
int kvm_arch_hardware_enable(void *garbage)
{
- /*
- * Since this may be called from a hotplug notifcation,
- * we can't get the CPU frequency directly.
- */
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- int cpu = raw_smp_processor_id();
- per_cpu(cpu_tsc_khz, cpu) = 0;
- }
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
kvm_shared_msr_cpu_online();
-
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return kvm_x86_ops->hardware_enable(garbage);
}
@@ -5303,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.translate_gpa = translate_gpa;
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -5316,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
+ if (!kvm->arch.virtual_tsc_khz)
+ kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_pio_data;
@@ -5334,7 +5871,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ goto fail_free_mce_banks;
+
return 0;
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -5364,19 +5906,13 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
- kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!kvm->arch.aliases) {
- kfree(kvm);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- rdtscll(kvm->arch.vm_init_tsc);
+ spin_lock_init(&kvm->arch.tsc_write_lock);
return kvm;
}
@@ -5412,12 +5948,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
+ kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
- kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
@@ -5427,7 +5963,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm->arch.aliases);
kfree(kvm);
}
@@ -5438,6 +5973,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
+ int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+ /* Prevent internal slot pages from being moved by fork()/COW. */
+ if (memslot->id >= KVM_MEMORY_SLOTS)
+ map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5990,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS,
+ map_flags,
0);
up_write(&current->mm->mmap_sem);
@@ -5523,7 +6063,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ if (atomic_xchg(&vcpu->guest_mode, 0))
smp_send_reschedule(cpu);
put_cpu();
}
@@ -5559,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..2cea414489f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -65,14 +70,10 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
- return rcu_dereference_check(kvm->arch.aliases,
- srcu_read_lock_held(&kvm->srcu)
- || lockdep_is_held(&kvm->slots_lock));
-}
-
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9257510b4836..73b1e1a1f489 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -324,9 +324,8 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
}
/*
- * For a single GDT entry which changes, we do the lazy thing: alter our GDT,
- * then tell the Host to reload the entire thing. This operation is so rare
- * that this naive implementation is reasonable.
+ * For a single GDT entry which changes, we simply change our copy and
+ * then tell the host about it.
*/
static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
const void *desc, int type)
@@ -338,9 +337,13 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
}
/*
- * OK, I lied. There are three "thread local storage" GDT entries which change
+ * There are three "thread local storage" GDT entries which change
* on every context switch (these three entries are how glibc implements
- * __thread variables). So we have a hypercall specifically for this case.
+ * __thread variables). As an optimization, we have a hypercall
+ * specifically for this case.
+ *
+ * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
+ * which took a range of entries?
*/
static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
{
@@ -788,22 +791,22 @@ static void lguest_flush_tlb_kernel(void)
* simple as setting a bit. We don't actually "ack" interrupts as such, we
* just mask and unmask them. I wonder if we should be cleverer?
*/
-static void disable_lguest_irq(unsigned int irq)
+static void disable_lguest_irq(struct irq_data *data)
{
- set_bit(irq, lguest_data.blocked_interrupts);
+ set_bit(data->irq, lguest_data.blocked_interrupts);
}
-static void enable_lguest_irq(unsigned int irq)
+static void enable_lguest_irq(struct irq_data *data)
{
- clear_bit(irq, lguest_data.blocked_interrupts);
+ clear_bit(data->irq, lguest_data.blocked_interrupts);
}
/* This structure describes the lguest IRQ controller. */
static struct irq_chip lguest_irq_controller = {
.name = "lguest",
- .mask = disable_lguest_irq,
- .mask_ack = disable_lguest_irq,
- .unmask = enable_lguest_irq,
+ .irq_mask = disable_lguest_irq,
+ .irq_mask_ack = disable_lguest_irq,
+ .irq_unmask = enable_lguest_irq,
};
/*
@@ -835,12 +838,12 @@ static void __init lguest_init_IRQ(void)
* rather than set them in lguest_init_IRQ we are called here every time an
* lguest device needs an interrupt.
*
- * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
* pass that up!
*/
void lguest_setup_irq(unsigned int irq)
{
- irq_to_desc_alloc_node(irq, 0);
+ irq_alloc_desc_at(irq, 0);
set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f871e04b6965..e10cf070ede0 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
+ lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 4a5979aa6883..2cda60a06e65 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -25,150 +25,172 @@
CFI_ADJUST_CFA_OFFSET -4
.endm
-.macro BEGIN func reg
-$v = \reg
-
-ENTRY(atomic64_\func\()_386)
- CFI_STARTPROC
- LOCK $v
-
-.macro RETURN
- UNLOCK $v
+#define BEGIN(op) \
+.macro endp; \
+ CFI_ENDPROC; \
+ENDPROC(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+ENTRY(atomic64_##op##_386); \
+ CFI_STARTPROC; \
+ LOCK v;
+
+#define ENDP endp
+
+#define RET \
+ UNLOCK v; \
ret
-.endm
-
-.macro END_
- CFI_ENDPROC
-ENDPROC(atomic64_\func\()_386)
-.purgem RETURN
-.purgem END_
-.purgem END
-.endm
-
-.macro END
-RETURN
-END_
-.endm
-.endm
-BEGIN read %ecx
- movl ($v), %eax
- movl 4($v), %edx
-END
-
-BEGIN set %esi
- movl %ebx, ($v)
- movl %ecx, 4($v)
-END
-
-BEGIN xchg %esi
- movl ($v), %eax
- movl 4($v), %edx
- movl %ebx, ($v)
- movl %ecx, 4($v)
-END
-
-BEGIN add %ecx
- addl %eax, ($v)
- adcl %edx, 4($v)
-END
-
-BEGIN add_return %ecx
- addl ($v), %eax
- adcl 4($v), %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN sub %ecx
- subl %eax, ($v)
- sbbl %edx, 4($v)
-END
-
-BEGIN sub_return %ecx
+#define RET_ENDP \
+ RET; \
+ ENDP
+
+#define v %ecx
+BEGIN(read)
+ movl (v), %eax
+ movl 4(v), %edx
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(set)
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(xchg)
+ movl (v), %eax
+ movl 4(v), %edx
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add)
+ addl %eax, (v)
+ adcl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add_return)
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub)
+ subl %eax, (v)
+ sbbl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
- addl ($v), %eax
- adcl 4($v), %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN inc %esi
- addl $1, ($v)
- adcl $0, 4($v)
-END
-
-BEGIN inc_return %esi
- movl ($v), %eax
- movl 4($v), %edx
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc)
+ addl $1, (v)
+ adcl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_return)
+ movl (v), %eax
+ movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
-
-BEGIN dec %esi
- subl $1, ($v)
- sbbl $0, 4($v)
-END
-
-BEGIN dec_return %esi
- movl ($v), %eax
- movl 4($v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec)
+ subl $1, (v)
+ sbbl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_return)
+ movl (v), %eax
+ movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
-END
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
-BEGIN add_unless %ecx
+#define v %ecx
+BEGIN(add_unless)
addl %eax, %esi
adcl %edx, %edi
- addl ($v), %eax
- adcl 4($v), %edx
+ addl (v), %eax
+ adcl 4(v), %edx
cmpl %eax, %esi
je 3f
1:
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
movl $1, %eax
2:
-RETURN
+ RET
3:
cmpl %edx, %edi
jne 1b
xorl %eax, %eax
jmp 2b
-END_
+ENDP
+#undef v
-BEGIN inc_not_zero %esi
- movl ($v), %eax
- movl 4($v), %edx
+#define v %esi
+BEGIN(inc_not_zero)
+ movl (v), %eax
+ movl 4(v), %edx
testl %eax, %eax
je 3f
1:
addl $1, %eax
adcl $0, %edx
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
movl $1, %eax
2:
-RETURN
+ RET
3:
testl %edx, %edx
jne 1b
jmp 2b
-END_
+ENDP
+#undef v
-BEGIN dec_if_positive %esi
- movl ($v), %eax
- movl 4($v), %edx
+#define v %esi
+BEGIN(dec_if_positive)
+ movl (v), %eax
+ movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
js 1f
- movl %eax, ($v)
- movl %edx, 4($v)
+ movl %eax, (v)
+ movl %edx, 4(v)
1:
-END
+RET_ENDP
+#undef v
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ebeafcce04a9..aa4326bfb24a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -52,7 +52,7 @@ ENDPROC(clear_page)
.align 8
.quad clear_page
.quad 1b
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lclear_page_end - clear_page
.byte 2b - 1b
.previous
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/lib/cmpxchg.c
index 2056ccf572cc..5d619f6df3ee 100644
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ b/arch/x86/lib/cmpxchg.c
@@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
}
EXPORT_SYMBOL(cmpxchg_386_u32);
#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 727a5d46d2fc..6fec2d1cebe1 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -113,7 +113,7 @@ ENDPROC(copy_page)
.align 8
.quad copy_page
.quad 1b
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lcopy_page_end - copy_page
.byte 2b - 1b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 71100c98e337..a460158b5ac5 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -29,7 +29,7 @@
.align 8
.quad 0b
.quad 2b
- .byte \feature /* when feature is set */
+ .word \feature /* when feature is set */
.byte 5
.byte 5
.previous
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
void *memmove(void *dest, const void *src, size_t n)
{
- int d0, d1, d2;
-
- if (dest < src) {
- memcpy(dest, src, n);
- } else {
- __asm__ __volatile__(
- "std\n\t"
- "rep\n\t"
- "movsb\n\t"
- "cld"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2)
- :"0" (n),
- "1" (n-1+src),
- "2" (n-1+dest)
- :"memory");
- }
- return dest;
+ int d0,d1,d2,d3,d4,d5;
+ char *ret = dest;
+
+ __asm__ __volatile__(
+ /* Handle more 16bytes in loop */
+ "cmp $0x10, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movs instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movs instruction is only good for aligned case.
+ */
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 4f\n\t"
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16byts forward in each loop.
+ */
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov 2*4(%1), %3\n\t"
+ "mov 3*4(%1), %4\n\t"
+ "mov %3, 2*4(%2)\n\t"
+ "mov %4, 3*4(%2)\n\t"
+ "lea 0x10(%1), %1\n\t"
+ "lea 0x10(%2), %2\n\t"
+ "jae 3b\n\t"
+ "add $0x10, %0\n\t"
+ "jmp 1f\n\t"
+
+ /*
+ * Handle data forward by movs.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "mov -4(%1, %0), %3\n\t"
+ "lea -4(%2, %0), %4\n\t"
+ "shr $2, %0\n\t"
+ "rep movsl\n\t"
+ "mov %3, (%4)\n\t"
+ "jmp 11f\n\t"
+ /*
+ * Handle data backward by movs.
+ */
+ ".p2align 4\n\t"
+ "6:\n\t"
+ "mov (%1), %3\n\t"
+ "mov %2, %4\n\t"
+ "lea -4(%1, %0), %1\n\t"
+ "lea -4(%2, %0), %2\n\t"
+ "shr $2, %0\n\t"
+ "std\n\t"
+ "rep movsl\n\t"
+ "mov %3,(%4)\n\t"
+ "cld\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 5f\n\t"
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 6b\n\t"
+
+ /*
+ * Calculate copy position to tail.
+ */
+ "5:\n\t"
+ "add %0, %1\n\t"
+ "add %0, %2\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16byts backward in each loop.
+ */
+ "7:\n\t"
+ "sub $0x10, %0\n\t"
+
+ "mov -1*4(%1), %3\n\t"
+ "mov -2*4(%1), %4\n\t"
+ "mov %3, -1*4(%2)\n\t"
+ "mov %4, -2*4(%2)\n\t"
+ "mov -3*4(%1), %3\n\t"
+ "mov -4*4(%1), %4\n\t"
+ "mov %3, -3*4(%2)\n\t"
+ "mov %4, -4*4(%2)\n\t"
+ "lea -0x10(%1), %1\n\t"
+ "lea -0x10(%2), %2\n\t"
+ "jae 7b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "add $0x10, %0\n\t"
+ "sub %0, %1\n\t"
+ "sub %0, %2\n\t"
+
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ ".p2align 4\n\t"
+ "1:\n\t"
+ "cmp $8, %0\n\t"
+ "jb 8f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov -2*4(%1, %0), %5\n\t"
+ "mov -1*4(%1, %0), %1\n\t"
+
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov %5, -2*4(%2, %0)\n\t"
+ "mov %1, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ ".p2align 4\n\t"
+ "8:\n\t"
+ "cmp $4, %0\n\t"
+ "jb 9f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov -1*4(%1, %0), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 10f\n\t"
+ "movw 0*2(%1), %%dx\n\t"
+ "movw -1*2(%1, %0), %%bx\n\t"
+ "movw %%dx, 0*2(%2)\n\t"
+ "movw %%bx, -1*2(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data for 1 byte.
+ */
+ ".p2align 4\n\t"
+ "10:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 11f\n\t"
+ "movb (%1), %%cl\n\t"
+ "movb %%cl, (%2)\n\t"
+ ".p2align 4\n\t"
+ "11:"
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+ "=r" (d3),"=r" (d4), "=r"(d5)
+ :"0" (n),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f82e884928af..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
+ movq %rdi, %rax
/*
- * Put the number of full 64-byte blocks into %ecx.
- * Tail portion is handled at the end:
+ * Use 32bit CMP here to avoid long NOP padding.
*/
- movq %rdi, %rax
- movl %edx, %ecx
- shrl $6, %ecx
- jz .Lhandle_tail
+ cmp $0x20, %edx
+ jb .Lhandle_tail
- .p2align 4
-.Lloop_64:
/*
- * We decrement the loop index here - and the zero-flag is
- * checked at the end of the loop (instructions inbetween do
- * not change the zero flag):
+ * We check whether memory false dependece could occur,
+ * then jump to corresponding copy mode.
*/
- decl %ecx
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subl $0x20, %edx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
/*
- * Move in blocks of 4x16 bytes:
+ * Move in blocks of 4x8 bytes:
*/
- movq 0*8(%rsi), %r11
- movq 1*8(%rsi), %r8
- movq %r11, 0*8(%rdi)
- movq %r8, 1*8(%rdi)
-
- movq 2*8(%rsi), %r9
- movq 3*8(%rsi), %r10
- movq %r9, 2*8(%rdi)
- movq %r10, 3*8(%rdi)
-
- movq 4*8(%rsi), %r11
- movq 5*8(%rsi), %r8
- movq %r11, 4*8(%rdi)
- movq %r8, 5*8(%rdi)
-
- movq 6*8(%rsi), %r9
- movq 7*8(%rsi), %r10
- movq %r9, 6*8(%rdi)
- movq %r10, 7*8(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz .Lloop_64
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addq $0x20, %rdx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16bytes trunk.
+ */
+ .p2align 4
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
+ /*
+ * Calculate copy position to head.
+ */
+ addq $0x20, %rdx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
.Lhandle_tail:
- movl %edx, %ecx
- andl $63, %ecx
- shrl $3, %ecx
- jz .Lhandle_7
+ cmpq $16, %rdx
+ jb .Lless_16bytes
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
.p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi), %r8
- movq %r8, (%rdi)
- leaq 8(%rdi), %rdi
- leaq 8(%rsi), %rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx, %ecx
- andl $7, %ecx
- jz .Lend
+.Lless_16bytes:
+ cmpq $8, %rdx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_8bytes:
+ cmpq $4, %rdx
+ jb .Lless_3bytes
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
.p2align 4
+.Lless_3bytes:
+ cmpl $0, %edx
+ je .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
- decl %ecx
+ decl %edx
jnz .Lloop_1
.Lend:
- ret
+ retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
@@ -131,7 +179,7 @@ ENDPROC(__memcpy)
.align 8
.quad memcpy
.quad .Lmemcpy_c
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
/*
* Replace only beginning, memcpy is used to apply alternatives,
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf122..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
- if (dest < src) {
- return memcpy(dest, src, count);
- } else {
- char *p = dest + count;
- const char *s = src + count;
- while (count--)
- *--p = *--s;
- }
- return dest;
+ unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+ char *ret;
+
+ __asm__ __volatile__(
+ /* Handle more 32bytes in loop */
+ "mov %2, %3\n\t"
+ "cmp $0x20, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movsq instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+ "cmpb %%dil, %%sil\n\t"
+ "je 4f\n\t"
+ "3:\n\t"
+ "sub $0x20, %0\n\t"
+ /*
+ * We gobble 32byts forward in each loop.
+ */
+ "5:\n\t"
+ "sub $0x20, %0\n\t"
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq 2*8(%1), %6\n\t"
+ "movq 3*8(%1), %7\n\t"
+ "leaq 4*8(%1), %1\n\t"
+
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, 2*8(%2)\n\t"
+ "movq %7, 3*8(%2)\n\t"
+ "leaq 4*8(%2), %2\n\t"
+ "jae 5b\n\t"
+ "addq $0x20, %0\n\t"
+ "jmp 1f\n\t"
+ /*
+ * Handle data forward by movsq.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "movq %0, %8\n\t"
+ "movq -8(%1, %0), %4\n\t"
+ "lea -8(%2, %0), %5\n\t"
+ "shrq $3, %8\n\t"
+ "rep movsq\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+ /*
+ * Handle data backward by movsq.
+ */
+ ".p2align 4\n\t"
+ "7:\n\t"
+ "movq %0, %8\n\t"
+ "movq (%1), %4\n\t"
+ "movq %2, %5\n\t"
+ "leaq -8(%1, %0), %1\n\t"
+ "leaq -8(%2, %0), %2\n\t"
+ "shrq $3, %8\n\t"
+ "std\n\t"
+ "rep movsq\n\t"
+ "cld\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 6f \n\t"
+ "cmp %%dil, %%sil\n\t"
+ "je 7b \n\t"
+ "6:\n\t"
+ /*
+ * Calculate copy position to tail.
+ */
+ "addq %0, %1\n\t"
+ "addq %0, %2\n\t"
+ "subq $0x20, %0\n\t"
+ /*
+ * We gobble 32byts backward in each loop.
+ */
+ "8:\n\t"
+ "subq $0x20, %0\n\t"
+ "movq -1*8(%1), %4\n\t"
+ "movq -2*8(%1), %5\n\t"
+ "movq -3*8(%1), %6\n\t"
+ "movq -4*8(%1), %7\n\t"
+ "leaq -4*8(%1), %1\n\t"
+
+ "movq %4, -1*8(%2)\n\t"
+ "movq %5, -2*8(%2)\n\t"
+ "movq %6, -3*8(%2)\n\t"
+ "movq %7, -4*8(%2)\n\t"
+ "leaq -4*8(%2), %2\n\t"
+ "jae 8b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "addq $0x20, %0\n\t"
+ "subq %0, %1\n\t"
+ "subq %0, %2\n\t"
+ "1:\n\t"
+ "cmpq $16, %0\n\t"
+ "jb 9f\n\t"
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq -2*8(%1, %0), %6\n\t"
+ "movq -1*8(%1, %0), %7\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, -2*8(%2, %0)\n\t"
+ "movq %7, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmpq $8, %0\n\t"
+ "jb 10f\n\t"
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq -1*8(%1, %0), %5\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "10:\n\t"
+ "cmpq $4, %0\n\t"
+ "jb 11f\n\t"
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ "movl (%1), %4d\n\t"
+ "movl -4(%1, %0), %5d\n\t"
+ "movl %4d, (%2)\n\t"
+ "movl %5d, -4(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "11:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 12f\n\t"
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ "movw (%1), %4w\n\t"
+ "movw -2(%1, %0), %5w\n\t"
+ "movw %4w, (%2)\n\t"
+ "movw %5w, -2(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "12:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 13f\n\t"
+ /*
+ * Move data for 1 byte.
+ */
+ "movb (%1), %4b\n\t"
+ "movb %4b, (%2)\n\t"
+ "13:\n\t"
+ : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+ "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+ :"0" (count),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e88d3b81644a..09d344269652 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -121,7 +121,7 @@ ENDPROC(__memset)
.align 8
.quad memset
.quad .Lmemset_c
- .byte X86_FEATURE_REP_GOOD
+ .word X86_FEATURE_REP_GOOD
.byte .Lfinal - memset
.byte .Lmemset_e - .Lmemset_c
.previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..55543397a8a7 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a725b7f760ae..0002a3a33081 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -37,6 +37,28 @@ struct addr_marker {
const char *name;
};
+/* indices for address_markers; keep sync'd w/ address_markers below */
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
+#ifdef CONFIG_X86_64
+ KERNEL_SPACE_NR,
+ LOW_KERNEL_NR,
+ VMALLOC_START_NR,
+ VMEMMAP_START_NR,
+ HIGH_KERNEL_NR,
+ MODULES_VADDR_NR,
+ MODULES_END_NR,
+#else
+ KERNEL_SPACE_NR,
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+# ifdef CONFIG_HIGHMEM
+ PKMAP_BASE_NR,
+# endif
+ FIXADDR_START_NR,
+#endif
+};
+
/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
@@ -331,14 +353,12 @@ static int pt_dump_init(void)
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
- address_markers[2].start_address = VMALLOC_START;
- address_markers[3].start_address = VMALLOC_END;
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
- address_markers[4].start_address = PKMAP_BASE;
- address_markers[5].start_address = FIXADDR_START;
-# else
- address_markers[4].start_address = FIXADDR_START;
+ address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
+ address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f62777940dfb..7d90ceb882a4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
+#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk)
+ struct task_struct *tsk, int fault)
{
+ unsigned lsb = 0;
siginfo_t info;
info.si_signo = si_signo;
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
- info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+ info.si_addr_lsb = lsb;
force_sig_info(si_signo, &info, tsk);
}
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void)
spin_lock_irqsave(&pgd_lock, flags);
list_for_each_entry(page, &pgd_list, lru) {
- if (!vmalloc_sync_one(page_address(page), address))
+ spinlock_t *pgt_lock;
+ pmd_t *ret;
+
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ ret = vmalloc_sync_one(page_address(page), address);
+ spin_unlock(pgt_lock);
+
+ if (!ret)
break;
}
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
+ WARN_ON_ONCE(in_nmi());
+
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
void vmalloc_sync_all(void)
{
- unsigned long address;
-
- for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
- address += PGDIR_SIZE) {
-
- const pgd_t *pgd_ref = pgd_offset_k(address);
- unsigned long flags;
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- }
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
+ WARN_ON_ONCE(in_nmi());
+
/*
* Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
return;
}
@@ -802,8 +799,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
up_read(&mm->mmap_sem);
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & PF_USER))
+ if (!(error_code & PF_USER)) {
no_context(regs, error_code, address);
+ return;
+ }
/* User-space => ok to do another page fault: */
if (is_prefetch(regs, error_code, address))
@@ -814,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
tsk->thread.trap_no = 14;
#ifdef CONFIG_MEMORY_FAILURE
- if (fault & VM_FAULT_HWPOISON) {
+ if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
printk(KERN_ERR
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
code = BUS_MCEERR_AR;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk);
+ force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
static noinline void
@@ -831,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address);
} else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
else
BUG();
@@ -892,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
+ /*
+ * Note: don't use pte_present() here, since it returns true
+ * if the _PAGE_PROTNONE bit is set. However, this aliases the
+ * _PAGE_GLOBAL bit, which for kernel pages give false positives
+ * when CONFIG_DEBUG_PAGEALLOC is used.
+ */
pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
+ if (!(pte_flags(*pte) & _PAGE_PRESENT))
return 0;
ret = spurious_fault_check(error_code, pte);
@@ -913,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
int show_unhandled_signals = 1;
static inline int
-access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+access_error(unsigned long error_code, struct vm_area_struct *vma)
{
- if (write) {
+ if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
@@ -950,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
struct task_struct *tsk;
unsigned long address;
struct mm_struct *mm;
- int write;
int fault;
+ int write = error_code & PF_WRITE;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
+ (write ? FAULT_FLAG_WRITE : 0);
tsk = current;
mm = tsk->mm;
@@ -1062,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
bad_area_nosemaphore(regs, error_code, address);
return;
}
+retry:
down_read(&mm->mmap_sem);
} else {
/*
@@ -1105,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
* we can handle it..
*/
good_area:
- write = error_code & PF_WRITE;
-
- if (unlikely(access_error(error_code, write, vma))) {
+ if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address);
return;
}
@@ -1117,21 +1124,34 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+ fault = handle_mm_fault(mm, vma, address, flags);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
- regs, address);
+ /*
+ * Major/minor page fault accounting is only done on the
+ * initial attempt. If we go through a retry, it is extremely
+ * likely that the page will be found in page cache at that point.
+ */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, address);
+ }
+ if (fault & VM_FAULT_RETRY) {
+ /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+ * of starvation. */
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ goto retry;
+ }
}
check_v8086_mode(regs, address, tsk);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 63a6ba66cbe0..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page)
return page_address(page);
return kmap_high(page);
}
+EXPORT_SYMBOL(kmap);
void kunmap(struct page *page)
{
@@ -18,6 +19,7 @@ void kunmap(struct page *page)
return;
kunmap_high(page);
}
+EXPORT_SYMBOL(kunmap);
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
@@ -27,10 +29,10 @@ void kunmap(struct page *page)
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
- enum fixed_addresses idx;
unsigned long vaddr;
+ int idx, type;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
if (!PageHighMem(page))
return page_address(page);
- debug_kmap_atomic(type);
-
+ type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
return (void *)vaddr;
}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *__kmap_atomic(struct page *page)
+{
+ return kmap_atomic_prot(page, kmap_prot);
+}
+EXPORT_SYMBOL(__kmap_atomic);
-void *kmap_atomic(struct page *page, enum km_type type)
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn)
{
- return kmap_atomic_prot(page, type, kmap_prot);
+ return kmap_atomic_prot_pfn(pfn, kmap_prot);
}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void __kunmap_atomic(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
-
- /*
- * Force other mappings to Oops if they'll try to access this pte
- * without first remap it. Keeping stale mappings around is a bad idea
- * also, in case the page changes cacheability attributes or becomes
- * a protected page in a hypervisor.
- */
- if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
kpte_clear_flush(kmap_pte-idx, vaddr);
- else {
+ kmap_atomic_idx_pop();
+ }
#ifdef CONFIG_DEBUG_HIGHMEM
+ else {
BUG_ON(vaddr < PAGE_OFFSET);
BUG_ON(vaddr >= (unsigned long)high_memory);
-#endif
}
+#endif
pagefault_enable();
}
-
-/*
- * This is the same as kmap_atomic() but can map memory that doesn't
- * have a struct page associated with it.
- */
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
-{
- return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
-}
-EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
+EXPORT_SYMBOL(__kunmap_atomic);
struct page *kmap_atomic_to_page(void *ptr)
{
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr)
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
return pte_page(*pte);
}
-
-EXPORT_SYMBOL(kmap);
-EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_prot);
EXPORT_SYMBOL(kmap_atomic_to_page);
void __init set_highmem_pages_init(void)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..c0e28a13de7d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
+#include <linux/memblock.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
@@ -33,6 +34,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
unsigned long puds, pmds, ptes, tables, start;
+ phys_addr_t base;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
@@ -75,12 +77,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
#else
start = 0x8000;
#endif
- e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
tables, PAGE_SIZE);
- if (e820_table_start == -1UL)
+ if (base == MEMBLOCK_ERROR)
panic("Cannot find space for the kernel page tables");
- e820_table_start >>= PAGE_SHIFT;
+ e820_table_start = base >> PAGE_SHIFT;
e820_table_end = e820_table_start;
e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
@@ -299,7 +301,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
__flush_tlb_all();
if (!after_bootmem && e820_table_end > e820_table_start)
- reserve_early(e820_table_start << PAGE_SHIFT,
+ memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
e820_table_end << PAGE_SHIFT, "PGTABLE");
if (!after_bootmem)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..0e969f9f401b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
@@ -67,7 +68,7 @@ static __init void *alloc_low_page(void)
panic("alloc_low_page: ran out of memory");
adr = __va(pfn * PAGE_SIZE);
- memset(adr, 0, PAGE_SIZE);
+ clear_page(adr);
return adr;
}
@@ -422,49 +423,28 @@ static void __init add_one_highpage_init(struct page *page)
totalhigh_pages++;
}
-struct add_highpages_data {
- unsigned long start_pfn;
- unsigned long end_pfn;
-};
-
-static int __init add_highpages_work_fn(unsigned long start_pfn,
- unsigned long end_pfn, void *datax)
+void __init add_highpages_with_active_regions(int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
{
- int node_pfn;
- struct page *page;
- unsigned long final_start_pfn, final_end_pfn;
- struct add_highpages_data *data;
+ struct range *range;
+ int nr_range;
+ int i;
- data = (struct add_highpages_data *)datax;
+ nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
- final_start_pfn = max(start_pfn, data->start_pfn);
- final_end_pfn = min(end_pfn, data->end_pfn);
- if (final_start_pfn >= final_end_pfn)
- return 0;
+ for (i = 0; i < nr_range; i++) {
+ struct page *page;
+ int node_pfn;
- for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
- node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page);
+ for (node_pfn = range[i].start; node_pfn < range[i].end;
+ node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page);
+ }
}
-
- return 0;
-
}
-
-void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct add_highpages_data data;
-
- data.start_pfn = start_pfn;
- data.end_pfn = end_pfn;
-
- work_with_active_regions(nid, add_highpages_work_fn, &data);
-}
-
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
@@ -548,48 +528,6 @@ static void __init pagetable_init(void)
permanent_kmaps_init(pgd_base);
}
-#ifdef CONFIG_ACPI_SLEEP
-/*
- * ACPI suspend needs this for resume, because things like the intel-agp
- * driver might have split up a kernel 4MB mapping.
- */
-char swsusp_pg_dir[PAGE_SIZE]
- __attribute__ ((aligned(PAGE_SIZE)));
-
-static inline void save_pg_dir(void)
-{
- memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
-}
-#else /* !CONFIG_ACPI_SLEEP */
-static inline void save_pg_dir(void)
-{
-}
-#endif /* !CONFIG_ACPI_SLEEP */
-
-void zap_low_mappings(bool early)
-{
- int i;
-
- /*
- * Zap initial low-memory mappings.
- *
- * Note that "pgd_clear()" doesn't do it for
- * us, because pgd_clear() is a no-op on i386.
- */
- for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
-#ifdef CONFIG_X86_PAE
- set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
-#else
- set_pgd(swapper_pg_dir+i, __pgd(0));
-#endif
- }
-
- if (early)
- __flush_tlb();
- else
- flush_tlb_all();
-}
-
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
@@ -712,14 +650,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- e820_register_active_regions(0, 0, highend_pfn);
+ memblock_x86_register_active_regions(0, 0, highend_pfn);
sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- e820_register_active_regions(0, 0, max_low_pfn);
+ memblock_x86_register_active_regions(0, 0, max_low_pfn);
sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
@@ -750,68 +688,12 @@ static void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
-#ifndef CONFIG_NO_BOOTMEM
-static unsigned long __init setup_node_bootmem(int nodeid,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long bootmap)
-{
- unsigned long bootmap_size;
-
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
- bootmap >> PAGE_SHIFT,
- start_pfn, end_pfn);
- printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
- nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
- nodeid, bootmap, bootmap + bootmap_size);
- free_bootmem_with_active_regions(nodeid, end_pfn);
-
- return bootmap + bootmap_size;
-}
-#endif
-
void __init setup_bootmem_allocator(void)
{
-#ifndef CONFIG_NO_BOOTMEM
- int nodeid;
- unsigned long bootmap_size, bootmap;
- /*
- * Initialize the boot-time allocator (with low memory only):
- */
- bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
- PAGE_SIZE);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
- reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
-#endif
-
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-#ifndef CONFIG_NO_BOOTMEM
- for_each_online_node(nodeid) {
- unsigned long start_pfn, end_pfn;
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- start_pfn = node_start_pfn[nodeid];
- end_pfn = node_end_pfn[nodeid];
- if (start_pfn > max_low_pfn)
- continue;
- if (end_pfn > max_low_pfn)
- end_pfn = max_low_pfn;
-#else
- start_pfn = 0;
- end_pfn = max_low_pfn;
-#endif
- bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
- bootmap);
- }
-#endif
-
after_bootmem = 1;
}
@@ -958,9 +840,6 @@ void __init mem_init(void)
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
-
- save_pg_dir();
- zap_low_mappings(true);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1070,8 +949,3 @@ void mark_rodata_ro(void)
}
#endif
-int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
- int flags)
-{
- return reserve_bootmem(phys, len, flags);
-}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee41bba315d1..71a59296af80 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -2,7 +2,7 @@
* linux/arch/x86_64/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
- * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
@@ -21,6 +21,7 @@
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
@@ -50,9 +51,6 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
-#include <linux/bootmem.h>
-
-static unsigned long dma_reserve __initdata;
static int __init parse_direct_gbpages_off(char *arg)
{
@@ -98,6 +96,43 @@ static int __init nonx32_setup(char *str)
__setup("noexec32=", nonx32_setup);
/*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ unsigned long address;
+
+ for (address = start; address <= end; address += PGDIR_SIZE) {
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ unsigned long flags;
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd)
+ != pgd_page_vaddr(*pgd_ref));
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+}
+
+/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
*/
@@ -293,7 +328,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
panic("alloc_low_page: ran out of memory");
adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- memset(adr, 0, PAGE_SIZE);
+ clear_page(adr);
*phys = pfn * PAGE_SIZE;
return adr;
}
@@ -534,11 +569,13 @@ kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
-
+ bool pgd_changed = false;
unsigned long next, last_map_addr = end;
+ unsigned long addr;
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
+ addr = start;
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +600,12 @@ kernel_physical_mapping_init(unsigned long start,
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, __va(pud_phys));
spin_unlock(&init_mm.page_table_lock);
+ pgd_changed = true;
}
+
+ if (pgd_changed)
+ sync_global_pgds(addr, end);
+
__flush_tlb_all();
return last_map_addr;
@@ -573,23 +615,7 @@ kernel_physical_mapping_init(unsigned long start,
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
-#ifndef CONFIG_NO_BOOTMEM
- unsigned long bootmap_size, bootmap;
-
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
- PAGE_SIZE);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n", bootmap_size);
- reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
- /* don't touch min_low_pfn */
- bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
- 0, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
- free_bootmem_with_active_regions(0, end_pfn);
-#else
- e820_register_active_regions(0, start_pfn, end_pfn);
-#endif
+ memblock_x86_register_active_regions(0, start_pfn, end_pfn);
}
#endif
@@ -799,52 +825,6 @@ void mark_rodata_ro(void)
#endif
-int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
- int flags)
-{
-#ifdef CONFIG_NUMA
- int nid, next_nid;
- int ret;
-#endif
- unsigned long pfn = phys >> PAGE_SHIFT;
-
- if (pfn >= max_pfn) {
- /*
- * This can happen with kdump kernels when accessing
- * firmware tables:
- */
- if (pfn < max_pfn_mapped)
- return -EFAULT;
-
- printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
- phys, len);
- return -EFAULT;
- }
-
- /* Should check here against the e820 map to avoid double free */
-#ifdef CONFIG_NUMA
- nid = phys_to_nid(phys);
- next_nid = phys_to_nid(phys + len - 1);
- if (nid == next_nid)
- ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
- else
- ret = reserve_bootmem(phys, len, flags);
-
- if (ret != 0)
- return ret;
-
-#else
- reserve_bootmem(phys, len, flags);
-#endif
-
- if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
- dma_reserve += len / PAGE_SIZE;
- set_dma_reserve(dma_reserve);
- }
-
- return 0;
-}
-
int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -1003,6 +983,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
}
}
+ sync_global_pgds((unsigned long)start_page, end);
return 0;
}
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 84e236ce76ba..7b179b499fa3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
}
EXPORT_SYMBOL_GPL(iomap_create_wc);
-void
-iomap_free(resource_size_t base, unsigned long size)
+void iomap_free(resource_size_t base, unsigned long size)
{
io_free_memtype(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
- enum fixed_addresses idx;
unsigned long vaddr;
+ int idx, type;
pagefault_disable();
- debug_kmap_atomic(type);
+ type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
}
/*
- * Map 'pfn' using fixed map 'type' and protections 'prot'
+ * Map 'pfn' using protections 'prot'
*/
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
- return kmap_atomic_prot_pfn(pfn, type, prot);
+ return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
void
-iounmap_atomic(void *kvaddr, enum km_type type)
+iounmap_atomic(void __iomem *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
- /*
- * Force other mappings to Oops if they'll try to access this pte
- * without first remap it. Keeping stale mappings around is a bad idea
- * also, in case the page changes cacheability attributes or becomes
- * a protected page in a hypervisor.
- */
- if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
kpte_clear_flush(kmap_pte-idx, vaddr);
+ kmap_atomic_idx_pop();
+ }
pagefault_enable();
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 12e4d2d3c110..0369843511dc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, unsigned long prot_val, void *caller)
{
- unsigned long pfn, offset, vaddr;
- resource_size_t last_addr;
+ unsigned long offset, vaddr;
+ resource_size_t pfn, last_pfn, last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
struct vm_struct *area;
@@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- for (pfn = phys_addr >> PAGE_SHIFT;
- (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK);
- pfn++) {
-
+ last_pfn = last_addr >> PAGE_SHIFT;
+ for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
int is_ram = page_is_ram(pfn);
if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
@@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
+ phys_addr &= PHYSICAL_PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
@@ -364,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
return &bm_pte[pte_index(addr)];
}
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+ return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
void __init early_ioremap_init(void)
@@ -613,7 +616,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
return;
}
offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e4..804a3b6c6e14 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -11,6 +11,8 @@
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nodemask.h>
+#include <linux/memblock.h>
+
#include <asm/io.h>
#include <linux/pci_ids.h>
#include <linux/acpi.h>
@@ -22,7 +24,7 @@
#include <asm/numa.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
static struct bootnode __initdata nodes[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +56,8 @@ static __init int find_northbridge(void)
static __init void early_get_boot_cpu_id(void)
{
/*
- * need to get boot_cpu_id so can use that to create apicid_to_node
- * in k8_scan_nodes()
+ * need to get the APIC ID of the BSP so can use that to
+ * create apicid_to_node in k8_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
@@ -212,7 +214,7 @@ int __init k8_scan_nodes(void)
bits = boot_cpu_data.x86_coreid_bits;
cores = (1<<bits);
apicid_base = 0;
- /* need to get boot_cpu_id early for system with apicid lifting */
+ /* get the APIC ID of the BSP early for systems with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
@@ -222,7 +224,7 @@ int __init k8_scan_nodes(void)
for_each_node_mask(i, node_possible_map) {
int j;
- e820_register_active_regions(i,
+ memblock_x86_register_active_regions(i,
nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
for (j = apicid_base; j < cores + apicid_base; j++)
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
if (!pte)
return false;
+ WARN_ON_ONCE(in_nmi());
+
if (error_code & 2)
kmemcheck_access(regs, address, KMEMCHECK_WRITE);
else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
b == 0xf0 || b == 0xf2 || b == 0xf3
/* Group 2 */
|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+ || b == 0x64 || b == 0x65
/* Group 3 */
|| b == 0x66
/* Group 4 */
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 5d0e67fff1a6..e5d5e2ce9f77 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -45,6 +45,8 @@ struct kmmio_fault_page {
* Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
+
+ bool scheduled_for_release;
};
struct kmmio_delayed_release {
@@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page,
BUG_ON(f->count < 0);
if (!f->count) {
disarm_kmmio_fault_page(f);
- f->release_next = *release_list;
- *release_list = f;
+ if (!f->scheduled_for_release) {
+ f->release_next = *release_list;
+ *release_list = f;
+ f->scheduled_for_release = true;
+ }
}
}
@@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
prevp = &f->release_next;
} else {
*prevp = f->release_next;
+ f->release_next = NULL;
+ f->scheduled_for_release = false;
}
- f = f->release_next;
+ f = *prevp;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
kmmio_count--;
spin_unlock_irqrestore(&kmmio_lock, flags);
+ if (!release_list)
+ return;
+
drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
if (!drelease) {
pr_crit("leaking kmmio_fault_page objects.\n");
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
new file mode 100644
index 000000000000..aa1169392b83
--- /dev/null
+++ b/arch/x86/mm/memblock.c
@@ -0,0 +1,348 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/range.h>
+
+/* Check for already reserved areas */
+static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ struct memblock_region *r;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ bool changed = false;
+
+again:
+ last = addr + size;
+ for_each_memblock(reserved, r) {
+ if (last > r->base && addr < r->base) {
+ size = r->base - addr;
+ changed = true;
+ goto again;
+ }
+ if (last > (r->base + r->size) && addr < (r->base + r->size)) {
+ addr = round_up(r->base + r->size, align);
+ size = last - addr;
+ changed = true;
+ goto again;
+ }
+ if (last <= (r->base + r->size) && addr >= r->base) {
+ *sizep = 0;
+ return false;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find next free range after start, and size is returned in *sizep
+ */
+u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
+{
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ u64 ei_start = r->base;
+ u64 ei_last = ei_start + r->size;
+ u64 addr;
+
+ addr = round_up(ei_start, align);
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (check_with_memblock_reserved_size(&addr, sizep, align))
+ ;
+
+ if (*sizep)
+ return addr;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+
+static __init struct range *find_range_array(int count)
+{
+ u64 end, size, mem;
+ struct range *range;
+
+ size = sizeof(struct range) * count;
+ end = memblock.current_limit;
+
+ mem = memblock_find_in_range(0, end, size, sizeof(struct range));
+ if (mem == MEMBLOCK_ERROR)
+ panic("can not find more space for range array");
+
+ /*
+ * This range is tempoaray, so don't reserve it, it will not be
+ * overlapped because We will not alloccate new buffer before
+ * We discard this one
+ */
+ range = __va(mem);
+ memset(range, 0, size);
+
+ return range;
+}
+
+static void __init memblock_x86_subtract_reserved(struct range *range, int az)
+{
+ u64 final_start, final_end;
+ struct memblock_region *r;
+
+ /* Take out region array itself at first*/
+ memblock_free_reserved_regions();
+
+ memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
+
+ for_each_memblock(reserved, r) {
+ memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
+ final_start = PFN_DOWN(r->base);
+ final_end = PFN_UP(r->base + r->size);
+ if (final_start >= final_end)
+ continue;
+ subtract_range(range, az, final_start, final_end);
+ }
+
+ /* Put region array back ? */
+ memblock_reserve_reserved_regions();
+}
+
+struct count_data {
+ int nr;
+};
+
+static int __init count_work_fn(unsigned long start_pfn,
+ unsigned long end_pfn, void *datax)
+{
+ struct count_data *data = datax;
+
+ data->nr++;
+
+ return 0;
+}
+
+static int __init count_early_node_map(int nodeid)
+{
+ struct count_data data;
+
+ data.nr = 0;
+ work_with_active_regions(nodeid, count_work_fn, &data);
+
+ return data.nr;
+}
+
+int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int count;
+ struct range *range;
+ int nr_range;
+
+ count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
+
+ range = find_range_array(count);
+ nr_range = 0;
+
+ /*
+ * Use early_node_map[] and memblock.reserved.region to get range array
+ * at first
+ */
+ nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
+ subtract_range(range, count, 0, start_pfn);
+ subtract_range(range, count, end_pfn, -1ULL);
+
+ memblock_x86_subtract_reserved(range, count);
+ nr_range = clean_sort_range(range, count);
+
+ *rangep = range;
+ return nr_range;
+}
+
+int __init get_free_all_memory_range(struct range **rangep, int nodeid)
+{
+ unsigned long end_pfn = -1UL;
+
+#ifdef CONFIG_X86_32
+ end_pfn = max_low_pfn;
+#endif
+ return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
+}
+
+static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
+{
+ int i, count;
+ struct range *range;
+ int nr_range;
+ u64 final_start, final_end;
+ u64 free_size;
+ struct memblock_region *r;
+
+ count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
+
+ range = find_range_array(count);
+ nr_range = 0;
+
+ addr = PFN_UP(addr);
+ limit = PFN_DOWN(limit);
+
+ for_each_memblock(memory, r) {
+ final_start = PFN_UP(r->base);
+ final_end = PFN_DOWN(r->base + r->size);
+ if (final_start >= final_end)
+ continue;
+ if (final_start >= limit || final_end <= addr)
+ continue;
+
+ nr_range = add_range(range, count, nr_range, final_start, final_end);
+ }
+ subtract_range(range, count, 0, addr);
+ subtract_range(range, count, limit, -1ULL);
+
+ /* Subtract memblock.reserved.region in range ? */
+ if (!get_free)
+ goto sort_and_count_them;
+ for_each_memblock(reserved, r) {
+ final_start = PFN_DOWN(r->base);
+ final_end = PFN_UP(r->base + r->size);
+ if (final_start >= final_end)
+ continue;
+ if (final_start >= limit || final_end <= addr)
+ continue;
+
+ subtract_range(range, count, final_start, final_end);
+ }
+
+sort_and_count_them:
+ nr_range = clean_sort_range(range, count);
+
+ free_size = 0;
+ for (i = 0; i < nr_range; i++)
+ free_size += range[i].end - range[i].start;
+
+ return free_size << PAGE_SHIFT;
+}
+
+u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
+{
+ return __memblock_x86_memory_in_range(addr, limit, true);
+}
+
+u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
+{
+ return __memblock_x86_memory_in_range(addr, limit, false);
+}
+
+void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
+{
+ if (start == end)
+ return;
+
+ if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
+ return;
+
+ memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
+
+ memblock_reserve(start, end - start);
+}
+
+void __init memblock_x86_free_range(u64 start, u64 end)
+{
+ if (start == end)
+ return;
+
+ if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
+ return;
+
+ memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
+
+ memblock_free(start, end - start);
+}
+
+/*
+ * Need to call this function after memblock_x86_register_active_regions,
+ * so early_node_map[] is filled already.
+ */
+u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
+{
+ u64 addr;
+ addr = find_memory_core_early(nid, size, align, start, end);
+ if (addr != MEMBLOCK_ERROR)
+ return addr;
+
+ /* Fallback, should already have start end within node range */
+ return memblock_find_in_range(start, end, size, align);
+}
+
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
+ */
+static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ u64 align = PAGE_SIZE;
+
+ *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Skip if map is outside the node */
+ if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > last_pfn)
+ *ei_endpfn = last_pfn;
+
+ return 1;
+}
+
+/* Walk the memblock.memory map and register active regions within a node */
+void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r)
+ if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init memblock_x86_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r)
+ if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+
+ return end - start - ((u64)ram << PAGE_SHIFT);
+}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 18d244f70205..92faf3a1c53e 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -6,8 +6,7 @@
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pfn.h>
-
-#include <asm/e820.h>
+#include <linux/memblock.h>
static u64 patterns[] __initdata = {
0,
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
(unsigned long long) pattern,
(unsigned long long) start_bad,
(unsigned long long) end_bad);
- reserve_early(start_bad, end_bad, "BAD RAM");
+ memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
}
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
u64 size = 0;
while (start < end) {
- start = find_e820_area_size(start, &size, 1);
+ start = memblock_x86_find_in_range_size(start, &size, 1);
/* done ? */
if (start >= end)
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 809baaaf48b1..84a3e4c9f277 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
@@ -120,7 +121,7 @@ int __init get_memcfg_numa_flat(void)
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
- e820_register_active_regions(0, 0, max_pfn);
+ memblock_x86_register_active_regions(0, 0, max_pfn);
memory_present(0, 0, max_pfn);
node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
@@ -161,14 +162,14 @@ static void __init allocate_pgdat(int nid)
NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
else {
unsigned long pgdat_phys;
- pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
max_pfn_mapped<<PAGE_SHIFT,
sizeof(pg_data_t),
PAGE_SIZE);
NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
memset(buf, 0, sizeof(buf));
sprintf(buf, "NODE_DATA %d", nid);
- reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
+ memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
}
printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
nid, (unsigned long)NODE_DATA(nid));
@@ -291,15 +292,15 @@ static __init unsigned long calculate_numa_remap_pages(void)
PTRS_PER_PTE);
node_kva_target <<= PAGE_SHIFT;
do {
- node_kva_final = find_e820_area(node_kva_target,
+ node_kva_final = memblock_find_in_range(node_kva_target,
((u64)node_end_pfn[nid])<<PAGE_SHIFT,
((u64)size)<<PAGE_SHIFT,
LARGE_PAGE_BYTES);
node_kva_target -= LARGE_PAGE_BYTES;
- } while (node_kva_final == -1ULL &&
+ } while (node_kva_final == MEMBLOCK_ERROR &&
(node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
- if (node_kva_final == -1ULL)
+ if (node_kva_final == MEMBLOCK_ERROR)
panic("Can not get kva ram\n");
node_remap_size[nid] = size;
@@ -318,15 +319,13 @@ static __init unsigned long calculate_numa_remap_pages(void)
* but we could have some hole in high memory, and it will only
* check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
* to use it as free.
- * So reserve_early here, hope we don't run out of that array
+ * So memblock_x86_reserve_range here, hope we don't run out of that array
*/
- reserve_early(node_kva_final,
+ memblock_x86_reserve_range(node_kva_final,
node_kva_final+(((u64)size)<<PAGE_SHIFT),
"KVA RAM");
node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
- remove_active_range(nid, node_remap_start_pfn[nid],
- node_remap_start_pfn[nid] + size);
}
printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
reserve_pages);
@@ -367,14 +366,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
do {
- kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+ kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
max_low_pfn<<PAGE_SHIFT,
kva_pages<<PAGE_SHIFT,
PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
kva_target_pfn -= PTRS_PER_PTE;
- } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
+ } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
- if (kva_start_pfn == -1UL)
+ if (kva_start_pfn == MEMBLOCK_ERROR)
panic("Can not get kva space\n");
printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
@@ -382,7 +381,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
/* avoid clash with initrd */
- reserve_early(kva_start_pfn<<PAGE_SHIFT,
+ memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
(kva_start_pfn + kva_pages)<<PAGE_SHIFT,
"KVA PG");
#ifdef CONFIG_HIGHMEM
@@ -419,9 +418,6 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
for_each_online_node(nid) {
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
NODE_DATA(nid)->node_id = nid;
-#ifndef CONFIG_NO_BOOTMEM
- NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
-#endif
}
setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..7ffc9b727efd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -7,6 +7,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
@@ -18,7 +19,7 @@
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -86,16 +87,16 @@ static int __init allocate_cachealigned_memnodemap(void)
addr = 0x8000;
nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
+ nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
nodemap_size, L1_CACHE_BYTES);
- if (nodemap_addr == -1UL) {
+ if (nodemap_addr == MEMBLOCK_ERROR) {
printk(KERN_ERR
"NUMA: Unable to allocate Memory to Node hash map\n");
nodemap_addr = nodemap_size = 0;
return -1;
}
memnodemap = phys_to_virt(nodemap_addr);
- reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
+ memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
nodemap_addr, nodemap_addr + nodemap_size);
@@ -171,18 +172,15 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
end > (MAX_DMA32_PFN<<PAGE_SHIFT))
start = MAX_DMA32_PFN<<PAGE_SHIFT;
- mem = find_e820_area(start, end, size, align);
- if (mem != -1L)
+ mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
+ if (mem != MEMBLOCK_ERROR)
return __va(mem);
/* extend the search scope */
end = max_pfn_mapped << PAGE_SHIFT;
- if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
- start = MAX_DMA32_PFN<<PAGE_SHIFT;
- else
- start = MAX_DMA_PFN<<PAGE_SHIFT;
- mem = find_e820_area(start, end, size, align);
- if (mem != -1L)
+ start = MAX_DMA_PFN << PAGE_SHIFT;
+ mem = memblock_find_in_range(start, end, size, align);
+ if (mem != MEMBLOCK_ERROR)
return __va(mem);
printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
@@ -198,10 +196,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
-#ifndef CONFIG_NO_BOOTMEM
- unsigned long bootmap_start, bootmap_pages, bootmap_size;
- void *bootmap;
-#endif
if (!end)
return;
@@ -226,7 +220,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
- reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
+ memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
nid = phys_to_nid(nodedata_phys);
@@ -238,47 +232,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
-#ifndef CONFIG_NO_BOOTMEM
- NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
-
- /*
- * Find a place for the bootmem map
- * nodedata_phys could be on other nodes by alloc_bootmem,
- * so need to sure bootmap_start not to be small, otherwise
- * early_node_mem will get that with find_e820_area instead
- * of alloc_bootmem, that could clash with reserved range
- */
- bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
- bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
- /*
- * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
- * to use that to align to PAGE_SIZE
- */
- bootmap = early_node_mem(nodeid, bootmap_start, end,
- bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
- if (bootmap == NULL) {
- free_early(nodedata_phys, nodedata_phys + pgdat_size);
- node_data[nodeid] = NULL;
- return;
- }
- bootmap_start = __pa(bootmap);
- reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
- "BOOTMAP");
-
- bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
- bootmap_start >> PAGE_SHIFT,
- start_pfn, last_pfn);
-
- printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
- bootmap_start, bootmap_start + bootmap_size - 1,
- bootmap_pages);
- nid = phys_to_nid(bootmap_start);
- if (nid != nodeid)
- printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
-
- free_bootmem_with_active_regions(nodeid, end);
-#endif
-
node_set_online(nodeid);
}
@@ -416,7 +369,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
nr_nodes = MAX_NUMNODES;
}
- size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+ size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
@@ -452,7 +405,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
* non-reserved memory is less than the per-node size.
*/
while (end - physnodes[i].start -
- e820_hole_size(physnodes[i].start, end) < size) {
+ memblock_x86_hole_size(physnodes[i].start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > physnodes[i].end) {
end = physnodes[i].end;
@@ -466,7 +419,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
- e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
@@ -475,7 +428,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
* physical node.
*/
if (physnodes[i].end - end -
- e820_hole_size(end, physnodes[i].end) < size)
+ memblock_x86_hole_size(end, physnodes[i].end) < size)
end = physnodes[i].end;
/*
@@ -503,7 +456,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
u64 end = start + size;
- while (end - start - e820_hole_size(start, end) < size) {
+ while (end - start - memblock_x86_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
end = max_addr;
@@ -532,7 +485,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
* creates a uniform distribution of node sizes across the entire
* machine (but not necessarily over physical nodes).
*/
- min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+ min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
MAX_NUMNODES;
min_size = max(min_size, FAKE_NODE_MIN_SIZE);
if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
@@ -565,7 +518,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
- e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
@@ -574,7 +527,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
* physical node.
*/
if (physnodes[i].end - end -
- e820_hole_size(end, physnodes[i].end) < size)
+ memblock_x86_hole_size(end, physnodes[i].end) < size)
end = physnodes[i].end;
/*
@@ -638,7 +591,7 @@ static int __init numa_emulation(unsigned long start_pfn,
*/
remove_all_active_ranges();
for_each_node_mask(i, node_possible_map) {
- e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+ memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
@@ -691,7 +644,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
node_set(0, node_possible_map);
for (i = 0; i < nr_cpu_ids; i++)
numa_set_node(i, 0);
- e820_register_active_regions(0, start_pfn, last_pfn);
+ memblock_x86_register_active_regions(0, start_pfn, last_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}
@@ -703,9 +656,7 @@ unsigned long __init numa_free_all_bootmem(void)
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
-#ifdef CONFIG_NO_BOOTMEM
pages += free_all_memory_core_early(MAX_NUMNODES);
-#endif
return pages;
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 64121a18b8cb..f6ff57b7efa5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
return req_type;
}
-static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
int ram_page = 0, not_rampage = 0;
unsigned long page_nr;
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 308e32570d84..38e6d174c497 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -40,16 +40,16 @@ static unsigned char prefix_codes[] = {
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
-static unsigned int reg_wop[] = { 0x88, 0x89 };
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
/* IA32 Manual 3, 3-432*/
-static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
+static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
static unsigned int rw32[] = {
- 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+ 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
-static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
+static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
-static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
+static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
static unsigned int mw64[] = {};
#else /* not __i386__ */
static unsigned char prefix_codes[] = {
@@ -63,20 +63,20 @@ static unsigned char prefix_codes[] = {
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
-static unsigned int reg_wop[] = { 0x88, 0x89 };
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
-static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
+static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
static unsigned int rw32[] = {
- 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+ 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
/* 8 bit only */
-static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
+static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
/* 16 bit only */
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
/* 16 or 32 bit */
static unsigned int mw32[] = { 0xC7 };
/* 16, 32 or 64 bit */
-static unsigned int mw64[] = { 0x89, 0x8B };
+static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
#endif /* not __i386__ */
struct prefix_bits {
@@ -410,7 +410,6 @@ static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
{
unsigned int opcode;
- unsigned char mod_rm;
int reg;
unsigned char *p;
struct prefix_bits prf;
@@ -437,8 +436,13 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
goto err;
do_work:
- mod_rm = *p;
- reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ /* for STOS, source register is fixed */
+ if (opcode == 0xAA || opcode == 0xAB) {
+ reg = arg_AX;
+ } else {
+ unsigned char mod_rm = *p;
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ }
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *get_reg_w8(reg, prf.rex, regs);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+ virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+ return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
- paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
+ if (!SHARED_KERNEL_PMD) {
+ pgd_set_mm(pgd, mm);
pgd_list_add(pgd);
+ }
}
static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
*/
spin_lock_irqsave(&pgd_lock, flags);
- pgd_ctor(pgd);
+ pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 9324f13492d5..a17dffd136c1 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -25,6 +25,7 @@
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/acpi.h>
#include <linux/nodemask.h>
@@ -264,7 +265,7 @@ int __init get_memcfg_from_srat(void)
if (node_read_chunk(chunk->nid, chunk))
continue;
- e820_register_active_regions(chunk->nid, chunk->start_pfn,
+ memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
min(chunk->end_pfn, max_pfn));
}
/* for out of order entries in SRAT */
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index f9897f7a9ef1..a35cb9d8b060 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
@@ -98,15 +99,15 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
unsigned long phys;
length = slit->header.length;
- phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
+ phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
PAGE_SIZE);
- if (phys == -1L)
+ if (phys == MEMBLOCK_ERROR)
panic(" Can not save slit!\n");
acpi_slit = __va(phys);
memcpy(acpi_slit, slit, length);
- reserve_early(phys, phys + length, "ACPI SLIT");
+ memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
}
/* Callback for Proximity Domain -> x2APIC mapping */
@@ -324,7 +325,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
pxmram = 0;
}
- e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
+ e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
printk(KERN_ERR
@@ -420,9 +421,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
- for_each_node_mask(i, nodes_parsed)
- e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
- nodes[i].end >> PAGE_SHIFT);
+ for (i = 0; i < num_node_memblks; i++)
+ memblock_x86_register_active_regions(memblk_nodeid[i],
+ node_memblk_range[i].start >> PAGE_SHIFT,
+ node_memblk_range[i].end >> PAGE_SHIFT);
+
/* for out of order entries in SRAT */
sort_node_map();
if (!nodes_cover_memory(nodes)) {
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 8565d944f7cf..38868adf07ea 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -90,6 +90,27 @@ static void do_test(unsigned long size)
iounmap(p);
}
+/*
+ * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
+ * a short time. We had a bug in deferred freeing procedure which tried
+ * to free this region multiple times (ioremap can reuse the same address
+ * for many mappings).
+ */
+static void do_test_bulk_ioremapping(void)
+{
+ void __iomem *p;
+ int i;
+
+ for (i = 0; i < 10; ++i) {
+ p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ if (p)
+ iounmap(p);
+ }
+
+ /* Force freeing. If it will crash we will know why. */
+ synchronize_rcu();
+}
+
static int __init init(void)
{
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
@@ -104,6 +125,7 @@ static int __init init(void)
"and writing 16 kB of rubbish in there.\n",
size >> 10, mmio_address);
do_test(size);
+ do_test_bulk_ioremapping();
pr_info("All done.\n");
return 0;
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 426f3a1a64d3..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>
+#include <linux/cpu.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
want false sharing in the per cpu data segment. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
union smp_flush_state *f;
/* Caller has disabled preemption */
- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+ sender = this_cpu_read(tlb_vector_offset);
f = &flush_state[sender];
/*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
flush_tlb_others_ipi(cpumask, mm, va);
}
+static void __cpuinit calculate_tlb_offset(void)
+{
+ int cpu, node, nr_node_vecs;
+ /*
+ * we are changing tlb_vector_offset for each CPU in runtime, but this
+ * will not cause inconsistency, as the write is atomic under X86. we
+ * might see more lock contentions in a short time, but after all CPU's
+ * tlb_vector_offset are changed, everything should go normal
+ *
+ * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+ * waste some vectors.
+ **/
+ if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+ nr_node_vecs = 1;
+ else
+ nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+ for_each_online_node(node) {
+ int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+ nr_node_vecs;
+ int cpu_offset = 0;
+ for_each_cpu(cpu, cpumask_of_node(node)) {
+ per_cpu(tlb_vector_offset, cpu) = node_offset +
+ cpu_offset;
+ cpu_offset++;
+ cpu_offset = cpu_offset % nr_node_vecs;
+ }
+ }
+}
+
+static int tlb_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ switch (action & 0xf) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ calculate_tlb_offset();
+ }
+ return NOTIFY_OK;
+}
+
static int __cpuinit init_smp_flush(void)
{
int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
for (i = 0; i < ARRAY_SIZE(flush_state); i++)
raw_spin_lock_init(&flush_state[i].tlbstate_lock);
+ calculate_tlb_offset();
+ hotcpu_notifier(tlb_cpuhp_notify, 0);
return 0;
}
core_initcall(init_smp_flush);
@@ -278,11 +324,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
static void do_flush_tlb_all(void *info)
{
- unsigned long cpu = smp_processor_id();
-
__flush_tlb_all();
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- leave_mm(cpu);
+ leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 3855096c59b8..2d49d4e19a36 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -14,6 +14,7 @@
#include <asm/ptrace.h>
#include <asm/uaccess.h>
#include <asm/stacktrace.h>
+#include <linux/compat.h>
static void backtrace_warning_symbol(void *data, char *msg,
unsigned long symbol)
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack,
};
-struct frame_head {
- struct frame_head *bp;
- unsigned long ret;
-} __attribute__((packed));
-
-static struct frame_head *dump_user_backtrace(struct frame_head *head)
+#ifdef CONFIG_COMPAT
+static struct stack_frame_ia32 *
+dump_user_backtrace_32(struct stack_frame_ia32 *head)
{
- struct frame_head bufhead[2];
+ struct stack_frame_ia32 bufhead[2];
+ struct stack_frame_ia32 *fp;
/* Also check accessibility of one struct frame_head beyond */
if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
return NULL;
- oprofile_add_trace(bufhead[0].ret);
+ fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
+
+ oprofile_add_trace(bufhead[0].return_address);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (head >= fp)
+ return NULL;
+
+ return fp;
+}
+
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ struct stack_frame_ia32 *head;
+
+ /* User process is 32-bit */
+ if (!current || !test_thread_flag(TIF_IA32))
+ return 0;
+
+ head = (struct stack_frame_ia32 *) regs->bp;
+ while (depth-- && head)
+ head = dump_user_backtrace_32(head);
+
+ return 1;
+}
+
+#else
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
+{
+ struct stack_frame bufhead[2];
+
+ /* Also check accessibility of one struct stack_frame beyond */
+ if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+ return NULL;
+ if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ return NULL;
+
+ oprofile_add_trace(bufhead[0].return_address);
/* frame pointers should strictly progress back up the stack
* (towards higher addresses) */
- if (head >= bufhead[0].bp)
+ if (head >= bufhead[0].next_frame)
return NULL;
- return bufhead[0].bp;
+ return bufhead[0].next_frame;
}
void
x86_backtrace(struct pt_regs * const regs, unsigned int depth)
{
- struct frame_head *head = (struct frame_head *)frame_pointer(regs);
+ struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
if (!user_mode_vm(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
return;
}
+ if (x86_backtrace_32(regs, depth))
+ return;
+
while (depth-- && head)
head = dump_user_backtrace(head);
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b28d2f1253bb..4e8baad36d37 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -568,8 +568,13 @@ static int __init init_sysfs(void)
int error;
error = sysdev_class_register(&oprofile_sysclass);
- if (!error)
- error = sysdev_register(&device_oprofile);
+ if (error)
+ return error;
+
+ error = sysdev_register(&device_oprofile);
+ if (error)
+ sysdev_class_unregister(&oprofile_sysclass);
+
return error;
}
@@ -580,8 +585,10 @@ static void exit_sysfs(void)
}
#else
-#define init_sysfs() do { } while (0)
-#define exit_sysfs() do { } while (0)
+
+static inline int init_sysfs(void) { return 0; }
+static inline void exit_sysfs(void) { }
+
#endif /* CONFIG_PM */
static int __init p4_init(char **cpu_type)
@@ -634,6 +641,18 @@ static int __init ppro_init(char **cpu_type)
if (force_arch_perfmon && cpu_has_arch_perfmon)
return 0;
+ /*
+ * Documentation on identifying Intel processors by CPU family
+ * and model can be found in the Intel Software Developer's
+ * Manuals (SDM):
+ *
+ * http://www.intel.com/products/processor/manuals/
+ *
+ * As of May 2010 the documentation for this was in the:
+ * "Intel 64 and IA-32 Architectures Software Developer's
+ * Manual Volume 3B: System Programming Guide", "Table B-1
+ * CPUID Signature Values of DisplayFamily_DisplayModel".
+ */
switch (cpu_model) {
case 0 ... 2:
*cpu_type = "i386/ppro";
@@ -652,15 +671,19 @@ static int __init ppro_init(char **cpu_type)
case 14:
*cpu_type = "i386/core";
break;
- case 15: case 23:
+ case 0x0f:
+ case 0x16:
+ case 0x17:
+ case 0x1d:
*cpu_type = "i386/core_2";
break;
+ case 0x1a:
+ case 0x1e:
case 0x2e:
- case 26:
spec = &op_arch_perfmon_spec;
*cpu_type = "i386/core_i7";
break;
- case 28:
+ case 0x1c:
*cpu_type = "i386/atom";
break;
default:
@@ -672,9 +695,6 @@ static int __init ppro_init(char **cpu_type)
return 1;
}
-/* in order to get sysfs right */
-static int using_nmi;
-
int __init op_nmi_init(struct oprofile_operations *ops)
{
__u8 vendor = boot_cpu_data.x86_vendor;
@@ -706,6 +726,12 @@ int __init op_nmi_init(struct oprofile_operations *ops)
case 0x11:
cpu_type = "x86-64/family11h";
break;
+ case 0x12:
+ cpu_type = "x86-64/family12h";
+ break;
+ case 0x14:
+ cpu_type = "x86-64/family14h";
+ break;
default:
return -ENODEV;
}
@@ -761,14 +787,15 @@ int __init op_nmi_init(struct oprofile_operations *ops)
mux_init(ops);
- init_sysfs();
- using_nmi = 1;
+ ret = init_sysfs();
+ if (ret)
+ return ret;
+
printk(KERN_INFO "oprofile: using NMI interrupt.\n");
return 0;
}
void op_nmi_exit(void)
{
- if (using_nmi)
- exit_sysfs();
+ exit_sysfs();
}
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b67a6b5aa8d4..a011bcc0f943 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -48,31 +48,53 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS];
static u32 ibs_caps;
-struct op_ibs_config {
+struct ibs_config {
unsigned long op_enabled;
unsigned long fetch_enabled;
unsigned long max_cnt_fetch;
unsigned long max_cnt_op;
unsigned long rand_en;
unsigned long dispatched_ops;
+ unsigned long branch_target;
};
-static struct op_ibs_config ibs_config;
-static u64 ibs_op_ctl;
+struct ibs_state {
+ u64 ibs_op_ctl;
+ int branch_target;
+ unsigned long sample_size;
+};
+
+static struct ibs_config ibs_config;
+static struct ibs_state ibs_state;
/*
* IBS cpuid feature detection
*/
-#define IBS_CPUID_FEATURES 0x8000001b
+#define IBS_CPUID_FEATURES 0x8000001b
/*
* Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
* bit 0 is used to indicate the existence of IBS.
*/
-#define IBS_CAPS_AVAIL (1LL<<0)
-#define IBS_CAPS_RDWROPCNT (1LL<<3)
-#define IBS_CAPS_OPCNT (1LL<<4)
+#define IBS_CAPS_AVAIL (1U<<0)
+#define IBS_CAPS_FETCHSAM (1U<<1)
+#define IBS_CAPS_OPSAM (1U<<2)
+#define IBS_CAPS_RDWROPCNT (1U<<3)
+#define IBS_CAPS_OPCNT (1U<<4)
+#define IBS_CAPS_BRNTRGT (1U<<5)
+#define IBS_CAPS_OPCNTEXT (1U<<6)
+
+#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
+ | IBS_CAPS_FETCHSAM \
+ | IBS_CAPS_OPSAM)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL 0x1cc
+#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK 0x0F
/*
* IBS randomization macros
@@ -92,12 +114,12 @@ static u32 get_ibs_caps(void)
/* check IBS cpuid feature flags */
max_level = cpuid_eax(0x80000000);
if (max_level < IBS_CPUID_FEATURES)
- return IBS_CAPS_AVAIL;
+ return IBS_CAPS_DEFAULT;
ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
if (!(ibs_caps & IBS_CAPS_AVAIL))
/* cpuid flags not valid */
- return IBS_CAPS_AVAIL;
+ return IBS_CAPS_DEFAULT;
return ibs_caps;
}
@@ -190,8 +212,8 @@ op_amd_handle_ibs(struct pt_regs * const regs,
rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
if (ctl & IBS_OP_VAL) {
rdmsrl(MSR_AMD64_IBSOPRIP, val);
- oprofile_write_reserve(&entry, regs, val,
- IBS_OP_CODE, IBS_OP_SIZE);
+ oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
+ ibs_state.sample_size);
oprofile_add_data64(&entry, val);
rdmsrl(MSR_AMD64_IBSOPDATA, val);
oprofile_add_data64(&entry, val);
@@ -203,10 +225,14 @@ op_amd_handle_ibs(struct pt_regs * const regs,
oprofile_add_data64(&entry, val);
rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
oprofile_add_data64(&entry, val);
+ if (ibs_state.branch_target) {
+ rdmsrl(MSR_AMD64_IBSBRTARGET, val);
+ oprofile_add_data(&entry, (unsigned long)val);
+ }
oprofile_write_commit(&entry);
/* reenable the IRQ */
- ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
+ ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
}
}
@@ -219,21 +245,32 @@ static inline void op_amd_start_ibs(void)
if (!ibs_caps)
return;
+ memset(&ibs_state, 0, sizeof(ibs_state));
+
+ /*
+ * Note: Since the max count settings may out of range we
+ * write back the actual used values so that userland can read
+ * it.
+ */
+
if (ibs_config.fetch_enabled) {
- val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;
+ val = ibs_config.max_cnt_fetch >> 4;
+ val = min(val, IBS_FETCH_MAX_CNT);
+ ibs_config.max_cnt_fetch = val << 4;
val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
val |= IBS_FETCH_ENABLE;
wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
}
if (ibs_config.op_enabled) {
- ibs_op_ctl = ibs_config.max_cnt_op >> 4;
+ val = ibs_config.max_cnt_op >> 4;
if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
/*
* IbsOpCurCnt not supported. See
* op_amd_randomize_ibs_op() for details.
*/
- ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
+ val = clamp(val, 0x0081ULL, 0xFF80ULL);
+ ibs_config.max_cnt_op = val << 4;
} else {
/*
* The start value is randomized with a
@@ -241,13 +278,24 @@ static inline void op_amd_start_ibs(void)
* with the half of the randomized range. Also
* avoid underflows.
*/
- ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
- IBS_OP_MAX_CNT);
+ val += IBS_RANDOM_MAXCNT_OFFSET;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ val = min(val, IBS_OP_MAX_CNT_EXT);
+ else
+ val = min(val, IBS_OP_MAX_CNT);
+ ibs_config.max_cnt_op =
+ (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
+ }
+ val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
+ val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
+ val |= IBS_OP_ENABLE;
+ ibs_state.ibs_op_ctl = val;
+ ibs_state.sample_size = IBS_OP_SIZE;
+ if (ibs_config.branch_target) {
+ ibs_state.branch_target = 1;
+ ibs_state.sample_size++;
}
- if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
- ibs_op_ctl |= IBS_OP_CNT_CTL;
- ibs_op_ctl |= IBS_OP_ENABLE;
- val = op_amd_randomize_ibs_op(ibs_op_ctl);
+ val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
wrmsrl(MSR_AMD64_IBSOPCTL, val);
}
}
@@ -266,6 +314,70 @@ static void op_amd_stop_ibs(void)
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
+static inline int eilvt_is_available(int offset)
+{
+ /* check if we may assign a vector */
+ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int ibs_eilvt_valid(void)
+{
+ int offset;
+ u64 val;
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+ if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+ pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ return 0;
+ }
+
+ if (!eilvt_is_available(offset)) {
+ pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int get_ibs_offset(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ if (!(val & IBSCTL_LVT_OFFSET_VALID))
+ return -EINVAL;
+
+ return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_offset();
+ if (offset < 0)
+ goto failed;
+
+ if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+ return;
+failed:
+ pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
+ smp_processor_id());
+}
+
+static void clear_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_offset();
+ if (offset >= 0)
+ setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -376,13 +488,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
}
if (ibs_caps)
- setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+ setup_APIC_ibs();
}
static void op_amd_cpu_shutdown(void)
{
if (ibs_caps)
- setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+ clear_APIC_ibs();
}
static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -445,16 +557,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
op_amd_stop_ibs();
}
-static int __init_ibs_nmi(void)
+static int setup_ibs_ctl(int ibs_eilvt_off)
{
-#define IBSCTL_LVTOFFSETVAL (1 << 8)
-#define IBSCTL 0x1cc
struct pci_dev *cpu_cfg;
int nodes;
u32 value = 0;
- u8 ibs_eilvt_off;
-
- ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
nodes = 0;
cpu_cfg = NULL;
@@ -466,24 +573,63 @@ static int __init_ibs_nmi(void)
break;
++nodes;
pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
- | IBSCTL_LVTOFFSETVAL);
+ | IBSCTL_LVT_OFFSET_VALID);
pci_read_config_dword(cpu_cfg, IBSCTL, &value);
- if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+ if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
pci_dev_put(cpu_cfg);
printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
- "IBSCTL = 0x%08x", value);
- return 1;
+ "IBSCTL = 0x%08x\n", value);
+ return -EINVAL;
}
} while (1);
if (!nodes) {
- printk(KERN_DEBUG "No CPU node configured for IBS");
- return 1;
+ printk(KERN_DEBUG "No CPU node configured for IBS\n");
+ return -ENODEV;
}
return 0;
}
+static int force_ibs_eilvt_setup(void)
+{
+ int i;
+ int ret;
+
+ /* find the next free available EILVT entry */
+ for (i = 1; i < 4; i++) {
+ if (!eilvt_is_available(i))
+ continue;
+ ret = setup_ibs_ctl(i);
+ if (ret)
+ return ret;
+ return 0;
+ }
+
+ printk(KERN_DEBUG "No EILVT entry available\n");
+
+ return -EBUSY;
+}
+
+static int __init_ibs_nmi(void)
+{
+ int ret;
+
+ if (ibs_eilvt_valid())
+ return 0;
+
+ ret = force_ibs_eilvt_setup();
+ if (ret)
+ return ret;
+
+ if (!ibs_eilvt_valid())
+ return -EFAULT;
+
+ pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
+
+ return 0;
+}
+
/* initialize the APIC for the IBS interrupts if available */
static void init_ibs(void)
{
@@ -521,28 +667,33 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
/* model specific files */
/* setup some reasonable defaults */
+ memset(&ibs_config, 0, sizeof(ibs_config));
ibs_config.max_cnt_fetch = 250000;
- ibs_config.fetch_enabled = 0;
ibs_config.max_cnt_op = 250000;
- ibs_config.op_enabled = 0;
- ibs_config.dispatched_ops = 0;
-
- dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
- oprofilefs_create_ulong(sb, dir, "enable",
- &ibs_config.fetch_enabled);
- oprofilefs_create_ulong(sb, dir, "max_count",
- &ibs_config.max_cnt_fetch);
- oprofilefs_create_ulong(sb, dir, "rand_enable",
- &ibs_config.rand_en);
-
- dir = oprofilefs_mkdir(sb, root, "ibs_op");
- oprofilefs_create_ulong(sb, dir, "enable",
- &ibs_config.op_enabled);
- oprofilefs_create_ulong(sb, dir, "max_count",
- &ibs_config.max_cnt_op);
- if (ibs_caps & IBS_CAPS_OPCNT)
- oprofilefs_create_ulong(sb, dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
+
+ if (ibs_caps & IBS_CAPS_FETCHSAM) {
+ dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
+ oprofilefs_create_ulong(sb, dir, "enable",
+ &ibs_config.fetch_enabled);
+ oprofilefs_create_ulong(sb, dir, "max_count",
+ &ibs_config.max_cnt_fetch);
+ oprofilefs_create_ulong(sb, dir, "rand_enable",
+ &ibs_config.rand_en);
+ }
+
+ if (ibs_caps & IBS_CAPS_OPSAM) {
+ dir = oprofilefs_mkdir(sb, root, "ibs_op");
+ oprofilefs_create_ulong(sb, dir, "enable",
+ &ibs_config.op_enabled);
+ oprofilefs_create_ulong(sb, dir, "max_count",
+ &ibs_config.max_cnt_op);
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ oprofilefs_create_ulong(sb, dir, "dispatched_ops",
+ &ibs_config.dispatched_ops);
+ if (ibs_caps & IBS_CAPS_BRNTRGT)
+ oprofilefs_create_ulong(sb, dir, "branch_target",
+ &ibs_config.branch_target);
+ }
return 0;
}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index a0207a7fdf39..effd96e33f16 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
obj-$(CONFIG_PCI_DIRECT) += direct.o
obj-$(CONFIG_PCI_OLPC) += olpc.o
+obj-$(CONFIG_PCI_XEN) += xen.o
obj-y += fixup.o
obj-$(CONFIG_ACPI) += acpi.o
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2ec04c424a62..15466c096ba5 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
},
},
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
+ /* 2006 AMD HT/VIA system with two host bridges */
+ {
+ .callback = set_use_crs,
+ .ident = "ASRock ALiveSATA2-GLAN",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
+ },
+ },
{}
};
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 215a27ae050d..f7c8a399978c 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void)
static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
{
struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
+ struct resource *bar_r;
+ int bar;
+
+ if (pci_probe & PCI_NOASSIGN_BARS) {
+ /*
+ * If the BIOS did not assign the BAR, zero out the
+ * resource so the kernel doesn't attmept to assign
+ * it later on in pci_assign_unassigned_resources
+ */
+ for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
+ bar_r = &dev->resource[bar];
+ if (bar_r->start == 0 && bar_r->end != 0) {
+ bar_r->flags = 0;
+ bar_r->end = 0;
+ }
+ }
+ }
if (pci_probe & PCI_NOASSIGN_ROMS) {
if (rom_r->parent)
@@ -404,16 +421,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
return bus;
}
-
-int __init pcibios_init(void)
+void __init pcibios_set_cache_line_size(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (!raw_pci_ops) {
- printk(KERN_WARNING "PCI: System does not support PCI\n");
- return 0;
- }
-
/*
* Set PCI cacheline size to that of the CPU if the CPU has reported it.
* (For older CPUs that don't support cpuid, we se it to 32 bytes
@@ -428,7 +439,16 @@ int __init pcibios_init(void)
pci_dfl_cache_line_size = 32 >> 2;
printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
}
+}
+int __init pcibios_init(void)
+{
+ if (!raw_pci_ops) {
+ printk(KERN_WARNING "PCI: System does not support PCI\n");
+ return 0;
+ }
+
+ pcibios_set_cache_line_size();
pcibios_resource_survey();
if (pci_bf_sort >= pci_force_bf)
@@ -509,6 +529,9 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "norom")) {
pci_probe |= PCI_NOASSIGN_ROMS;
return NULL;
+ } else if (!strcmp(str, "nobar")) {
+ pci_probe |= PCI_NOASSIGN_BARS;
+ return NULL;
} else if (!strcmp(str, "assign-busses")) {
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
return NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 55253095be84..c4bb261c106e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,16 +65,21 @@ pcibios_align_resource(void *data, const struct resource *res,
resource_size_t size, resource_size_t align)
{
struct pci_dev *dev = data;
- resource_size_t start = res->start;
+ resource_size_t start = round_down(res->end - size + 1, align);
if (res->flags & IORESOURCE_IO) {
- if (skip_isa_ioresource_align(dev))
- return start;
- if (start & 0x300)
- start = (start + 0x3ff) & ~0x3ff;
+
+ /*
+ * If we're avoiding ISA aliases, the largest contiguous I/O
+ * port space is 256 bytes. Clearing bits 9 and 10 preserves
+ * all 256-byte and smaller alignments, so the result will
+ * still be correctly aligned.
+ */
+ if (!skip_isa_ioresource_align(dev))
+ start &= ~0x300;
} else if (res->flags & IORESOURCE_MEM) {
if (start < BIOS_END)
- start = BIOS_END;
+ start = res->end; /* fail; no space */
}
return start;
}
@@ -311,6 +316,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
*/
prot |= _PAGE_CACHE_UC_MINUS;
+ prot |= _PAGE_IOMAP; /* creating a mapping for IO */
+
vma->vm_page_prot = __pgprot(prot);
if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9810a0f76c91..9f9bfb705cf9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -584,27 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH9_3:
case PCI_DEVICE_ID_INTEL_ICH9_4:
case PCI_DEVICE_ID_INTEL_ICH9_5:
- case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+ case PCI_DEVICE_ID_INTEL_EP80579_0:
case PCI_DEVICE_ID_INTEL_ICH10_0:
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
return 1;
}
- if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
- (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
+ if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) &&
+ (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) {
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
return 1;
}
- if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) &&
- (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
+ if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) &&
+ (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) {
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
@@ -989,7 +990,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
- while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
+ for_each_pci_dev(dev2) {
pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
@@ -1028,7 +1029,7 @@ void __init pcibios_fixup_irqs(void)
u8 pin;
DBG(KERN_DEBUG "PCI: IRQ fixup\n");
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
/*
* If the BIOS has set an out of range IRQ number, just
* ignore it. Also keep track of which IRQ's are
@@ -1052,7 +1053,7 @@ void __init pcibios_fixup_irqs(void)
return;
dev = NULL;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 8d460eaf524f..c89266be6048 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -36,7 +36,7 @@ int __init pci_legacy_init(void)
return 0;
}
-void pcibios_scan_specific_bus(int busn)
+void __devinit pcibios_scan_specific_bus(int busn)
{
int devfn;
long node;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index a918553ebc75..e282886616a0 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
int end, u64 addr)
{
struct pci_mmcfg_region *new;
- int num_buses;
struct resource *res;
if (addr == 0)
@@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
list_add_sorted(new);
- num_buses = end - start + 1;
res = &new->res;
res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
- res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
+ res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
"PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f58..13700ec8e2e4 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
int __init pci_olpc_init(void)
{
- printk(KERN_INFO "PCI: Using configuration type OLPC\n");
+ printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
raw_pci_ops = &pci_olpc_conf;
is_lx = is_geode_lx();
return 0;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
index 000000000000..117f5b8daf75
--- /dev/null
+++ b/arch/x86/pci/xen.c
@@ -0,0 +1,414 @@
+/*
+ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
+ * x86 PCI core to support the Xen PCI Frontend
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <linux/io.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+
+#include <xen/features.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+
+#ifdef CONFIG_ACPI
+static int xen_hvm_register_pirq(u32 gsi, int triggering)
+{
+ int rc, irq;
+ struct physdev_map_pirq map_irq;
+ int shareable = 0;
+ char *name;
+
+ if (!xen_hvm_domain())
+ return -1;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = -1;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+ return -1;
+ }
+
+ if (triggering == ACPI_EDGE_SENSITIVE) {
+ shareable = 0;
+ name = "ioapic-edge";
+ } else {
+ shareable = 1;
+ name = "ioapic-level";
+ }
+
+ irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name);
+
+ printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+
+ return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_hvm_register_pirq(gsi, trigger);
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#include <linux/msi.h>
+#include <asm/msidef.h>
+
+struct xen_pci_frontend_ops *xen_pci_frontend;
+EXPORT_SYMBOL_GPL(xen_pci_frontend);
+
+static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
+ struct msi_msg *msg)
+{
+ /* We set vector == 0 to tell the hypervisor we don't care about it,
+ * but we want a pirq setup instead.
+ * We use the dest_id field to pass the pirq that we want. */
+ msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq);
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ MSI_ADDR_DEST_MODE_PHYSICAL |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DEST_ID(pirq);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ /* delivery mode reserved */
+ (3 << 8) |
+ MSI_DATA_VECTOR(0);
+}
+
+static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, pirq, ret = 0;
+ struct msi_desc *msidesc;
+ struct msi_msg msg;
+
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi", &irq, &pirq);
+ if (irq < 0 || pirq < 0)
+ goto error;
+ printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
+ xen_msi_compose_msg(dev, pirq, &msg);
+ ret = set_irq_msi(irq, msidesc);
+ if (ret < 0)
+ goto error_while;
+ write_msi_msg(irq, &msg);
+ }
+ return 0;
+
+error_while:
+ unbind_from_irqhandler(irq, NULL);
+error:
+ if (ret == -ENODEV)
+ dev_err(&dev->dev, "Xen PCI frontend has not registered" \
+ " MSI/MSI-X support!\n");
+
+ return ret;
+}
+
+/*
+ * For MSI interrupts we have to use drivers/xen/event.s functions to
+ * allocate an irq_desc and setup the right */
+
+
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, &v);
+ if (ret)
+ goto error;
+ i = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_allocate_pirq(v[i], 0, /* not sharable */
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" : "pcifront-msi");
+ if (irq < 0)
+ return -1;
+
+ ret = set_irq_msi(irq, msidesc);
+ if (ret)
+ goto error_while;
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error_while:
+ unbind_from_irqhandler(irq, NULL);
+error:
+ if (ret == -ENODEV)
+ dev_err(&dev->dev, "Xen PCI frontend has not registered" \
+ " MSI/MSI-X support!\n");
+
+ kfree(v);
+ return ret;
+}
+
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
+{
+ struct msi_desc *msidesc;
+
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
+}
+
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+ xen_destroy_irq(irq);
+}
+
+static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret;
+ struct msi_desc *msidesc;
+
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_create_msi_irq(dev, msidesc, type);
+ if (irq < 0)
+ return -1;
+
+ ret = set_irq_msi(irq, msidesc);
+ if (ret)
+ goto error;
+ }
+ return 0;
+
+error:
+ xen_destroy_irq(irq);
+ return ret;
+}
+#endif
+
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+
+ dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
+
+ if (dev->irq < 0)
+ return -EINVAL;
+
+ if (dev->irq < NR_IRQS_LEGACY)
+ share = 0;
+
+ rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
+ dev->irq, rc);
+ return rc;
+ }
+ return 0;
+}
+
+int __init pci_xen_init(void)
+{
+ if (!xen_pv_domain() || xen_initial_domain())
+ return -ENODEV;
+
+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
+
+ pcibios_set_cache_line_size();
+
+ pcibios_enable_irq = xen_pcifront_enable_irq;
+ pcibios_disable_irq = NULL;
+
+#ifdef CONFIG_ACPI
+ /* Keep ACPI out of the picture */
+ acpi_noirq = 1;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+ x86_msi.setup_msi_irqs = xen_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs;
+#endif
+ return 0;
+}
+
+int __init pci_xen_hvm_init(void)
+{
+ if (!xen_feature(XENFEAT_hvm_pirqs))
+ return 0;
+
+#ifdef CONFIG_ACPI
+ /*
+ * We don't want to change the actual ACPI delivery model,
+ * just how GSIs get registered.
+ */
+ __acpi_register_gsi = acpi_register_gsi_xen_hvm;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+ x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_pirq(u32 gsi, int triggering)
+{
+ int rc, irq;
+ struct physdev_map_pirq map_irq;
+ int shareable = 0;
+ char *name;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ if (triggering == ACPI_EDGE_SENSITIVE) {
+ shareable = 0;
+ name = "ioapic-edge";
+ } else {
+ shareable = 1;
+ name = "ioapic-level";
+ }
+
+ irq = xen_allocate_pirq(gsi, shareable, name);
+
+ printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
+
+ if (irq < 0)
+ goto out;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = irq;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+ return -1;
+ }
+
+out:
+ return irq;
+}
+
+static int xen_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, triggering);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
+
+ return irq;
+}
+
+static __init void xen_setup_acpi_sci(void)
+{
+ int rc;
+ int trigger, polarity;
+ int gsi = acpi_sci_override_gsi;
+
+ if (!gsi)
+ return;
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc) {
+ printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi"
+ " sci, rc=%d\n", rc);
+ return;
+ }
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+
+ printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
+ "polarity=%d\n", gsi, trigger, polarity);
+
+ gsi = xen_register_gsi(gsi, trigger, polarity);
+ printk(KERN_INFO "xen: acpi sci %d\n", gsi);
+
+ return;
+}
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_register_gsi(gsi, trigger, polarity);
+}
+
+static int __init pci_xen_initial_domain(void)
+{
+#ifdef CONFIG_PCI_MSI
+ x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+#endif
+ xen_setup_acpi_sci();
+ __acpi_register_gsi = acpi_register_gsi_xen;
+
+ return 0;
+}
+
+void __init xen_setup_pirqs(void)
+{
+ int irq;
+
+ pci_xen_initial_domain();
+
+ if (0 == nr_ioapics) {
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ xen_allocate_pirq(irq, 0, "xt-pic");
+ return;
+ }
+
+ /* Pre-allocate legacy irqs */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ int trigger, polarity;
+
+ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+ continue;
+
+ xen_register_pirq(irq,
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+ }
+}
+#endif
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
new file mode 100644
index 000000000000..7bf70b812fa2
--- /dev/null
+++ b/arch/x86/platform/Makefile
@@ -0,0 +1,8 @@
+# Platform specific code goes here
+obj-y += efi/
+obj-y += mrst/
+obj-y += olpc/
+obj-y += scx200/
+obj-y += sfi/
+obj-y += visws/
+obj-y += uv/
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
new file mode 100644
index 000000000000..73b8be0f3675
--- /dev/null
+++ b/arch/x86/platform/efi/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
diff --git a/arch/x86/kernel/efi.c b/arch/x86/platform/efi/efi.c
index c2fa9b8b497e..0fe27d7c6258 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -30,6 +30,7 @@
#include <linux/init.h>
#include <linux/efi.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/time.h>
@@ -275,7 +276,7 @@ static void __init do_add_efi_memmap(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
-void __init efi_reserve_early(void)
+void __init efi_memblock_x86_reserve_range(void)
{
unsigned long pmap;
@@ -290,7 +291,7 @@ void __init efi_reserve_early(void)
boot_params.efi_info.efi_memdesc_size;
memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
- reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
+ memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
"EFI memmap");
}
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 5cab48ee61a4..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3d..ac0621a7ac3d 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index fbe66e626c09..fbe66e626c09 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..4c07ccab8146 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
new file mode 100644
index 000000000000..efbbc552fa95
--- /dev/null
+++ b/arch/x86/platform/mrst/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_MRST) += mrst.o
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/platform/mrst/mrst.c
index 5915e0b33303..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -25,8 +25,34 @@
#include <asm/i8259.h>
#include <asm/apb_timer.h>
+/*
+ * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
+ * cmdline option x86_mrst_timer can be used to override the configuration
+ * to prefer one or the other.
+ * at runtime, there are basically three timer configurations:
+ * 1. per cpu apbt clock only
+ * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
+ * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
+ *
+ * by default (without cmdline option), platform code first detects cpu type
+ * to see if we are on lincroft or penwell, then set up both lapic or apbt
+ * clocks accordingly.
+ * i.e. by default, medfield uses configuration #2, moorestown uses #1.
+ * config #3 is supported but not recommended on medfield.
+ *
+ * rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on,ARAT) ------ 150
+ */
+
+__cpuinitdata enum mrst_timer_options mrst_timer_options;
+
static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+enum mrst_cpu_type __mrst_cpu_chip;
+EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
+
int sfi_mtimer_num;
struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
return 0;
}
-/*
- * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
- * APBT but cmdline option can also override it.
- */
-static void __cpuinit mrst_setup_secondary_clock(void)
-{
- /* restore default lapic clock if disabled by cmdline */
- if (disable_apbt_percpu)
- return setup_secondary_APIC_clock();
- apbt_setup_secondary_clock();
-}
-
static unsigned long __init mrst_calibrate_tsc(void)
{
unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
void __init mrst_time_init(void)
{
+ switch (mrst_timer_options) {
+ case MRST_TIMER_APBT_ONLY:
+ break;
+ case MRST_TIMER_LAPIC_APBT:
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ break;
+ default:
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ break;
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ return;
+ }
+ /* we need at least one APB timer */
sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
pre_init_apic_IRQ0();
apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
}
-/*
- * if we use per cpu apb timer, the bootclock already setup. if we use lapic
- * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
- */
-static void __init mrst_setup_boot_clock(void)
+void __cpuinit mrst_arch_setup(void)
{
- pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
- if (disable_apbt_percpu)
- setup_boot_APIC_clock();
-};
+ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
+ __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
+ else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ else {
+ pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ }
+ pr_debug("Moorestown CPU %s identified\n",
+ (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
+ "Lincroft" : "Penwell");
+}
/* MID systems don't have i8042 controller */
static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
x86_init.resources.reserve_resources = x86_init_noop;
x86_init.timers.timer_init = mrst_time_init;
- x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+ x86_init.oem.arch_setup = mrst_arch_setup;
+
+ x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
x86_platform.calibrate_tsc = mrst_calibrate_tsc;
x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
}
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ mrst_timer_options = MRST_TIMER_APBT_ONLY;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ mrst_timer_options = MRST_TIMER_LAPIC_APBT;
+ else {
+ pr_warning("X86 MRST timer option %s not recognised"
+ " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
new file mode 100644
index 000000000000..c31b8fcb5a86
--- /dev/null
+++ b/arch/x86/platform/olpc/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
new file mode 100644
index 000000000000..f5442c03abc3
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -0,0 +1,140 @@
+/*
+ * Support for features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1"
+
+#define PMS_BAR 4
+#define ACPI_BAR 5
+
+/* PMC registers (PMS block) */
+#define PM_SCLK 0x10
+#define PM_IN_SLPCTL 0x20
+#define PM_WKXD 0x34
+#define PM_WKD 0x30
+#define PM_SSC 0x54
+
+/* PM registers (ACPI block) */
+#define PM1_CNT 0x08
+#define PM_GPE0_STS 0x18
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static void xo1_power_off(void)
+{
+ printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+ /* Enable all of these controls with 0 delay */
+ outl(0x40000000, pms_base + PM_SCLK);
+ outl(0x40000000, pms_base + PM_IN_SLPCTL);
+ outl(0x40000000, pms_base + PM_WKXD);
+ outl(0x40000000, pms_base + PM_WKD);
+
+ /* Clear status bits (possibly unnecessary) */
+ outl(0x0002ffff, pms_base + PM_SSC);
+ outl(0xffffffff, acpi_base + PM_GPE0_STS);
+
+ /* Write SLP_EN bit to start the machinery */
+ outl(0x00002000, acpi_base + PM1_CNT);
+}
+
+/* Read the base addresses from the PCI BAR info */
+static int __devinit setup_bases(struct pci_dev *pdev)
+{
+ int r;
+
+ r = pci_enable_device_io(pdev);
+ if (r) {
+ dev_err(&pdev->dev, "can't enable device IO\n");
+ return r;
+ }
+
+ r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
+ if (r) {
+ dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
+ return r;
+ }
+
+ r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
+ if (r) {
+ dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
+ pci_release_region(pdev, ACPI_BAR);
+ return r;
+ }
+
+ acpi_base = pci_resource_start(pdev, ACPI_BAR);
+ pms_base = pci_resource_start(pdev, PMS_BAR);
+
+ return 0;
+}
+
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+{
+ struct pci_dev *pcidev;
+ int r;
+
+ pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
+ NULL);
+ if (!pdev)
+ return -ENODEV;
+
+ r = setup_bases(pcidev);
+ if (r)
+ return r;
+
+ pm_power_off = xo1_power_off;
+
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ return 0;
+}
+
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
+{
+ pm_power_off = NULL;
+ return 0;
+}
+
+static struct platform_driver olpc_xo1_driver = {
+ .driver = {
+ .name = DRV_NAME,
+ .owner = THIS_MODULE,
+ },
+ .probe = olpc_xo1_probe,
+ .remove = __devexit_p(olpc_xo1_remove),
+};
+
+static int __init olpc_xo1_init(void)
+{
+ return platform_driver_register(&olpc_xo1_driver);
+}
+
+static void __exit olpc_xo1_exit(void)
+{
+ platform_driver_unregister(&olpc_xo1_driver);
+}
+
+MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:olpc-xo1");
+
+module_init(olpc_xo1_init);
+module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/platform/olpc/olpc.c
index 8297160c41b3..edaf3fe8dc5e 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -17,14 +17,12 @@
#include <linux/spinlock.h>
#include <linux/io.h>
#include <linux/string.h>
+#include <linux/platform_device.h>
#include <asm/geode.h>
#include <asm/setup.h>
#include <asm/olpc.h>
-
-#ifdef CONFIG_OPEN_FIRMWARE
-#include <asm/ofw.h>
-#endif
+#include <asm/olpc_ofw.h>
struct olpc_platform_t olpc_platform_info;
EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -117,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
unsigned long flags;
int ret = -EIO;
int i;
+ int restarts = 0;
spin_lock_irqsave(&ec_lock, flags);
@@ -145,7 +144,7 @@ restart:
* The OBF flag will sometimes misbehave due to what we believe
* is a hardware quirk..
*/
- printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd);
+ pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
outb(cmd, 0x6c);
if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +161,7 @@ restart:
" EC accept data!\n");
goto err;
}
- printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n",
- inbuf[i]);
+ pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
outb(inbuf[i], 0x68);
}
}
@@ -173,11 +171,12 @@ restart:
if (wait_on_obf(0x6c, 1)) {
printk(KERN_ERR "olpc-ec: timeout waiting for"
" EC to provide data!\n");
- goto restart;
+ if (restarts++ < 10)
+ goto restart;
+ goto err;
}
outbuf[i] = inb(0x68);
- printk(KERN_DEBUG "olpc-ec: received 0x%x\n",
- outbuf[i]);
+ pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
}
}
@@ -188,54 +187,66 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-#ifdef CONFIG_OPEN_FIRMWARE
-static void __init platform_detect(void)
+static bool __init check_ofw_architecture(void)
+{
+ size_t propsize;
+ char olpc_arch[5];
+ const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
+ void *res[] = { &propsize };
+
+ if (olpc_ofw("getprop", args, res)) {
+ printk(KERN_ERR "ofw: getprop call failed!\n");
+ return false;
+ }
+ return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(void)
{
size_t propsize;
__be32 rev;
+ const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
+ void *res[] = { &propsize };
- if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
- &propsize) || propsize != 4) {
+ if (olpc_ofw("getprop", args, res) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = cpu_to_be32(0);
+ return cpu_to_be32(0);
}
- olpc_platform_info.boardrev = be32_to_cpu(rev);
+ return be32_to_cpu(rev);
}
-#else
-static void __init platform_detect(void)
+
+static bool __init platform_detect(void)
{
- /* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = olpc_board(0xc2);
+ if (!check_ofw_architecture())
+ return false;
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ olpc_platform_info.boardrev = get_board_revision();
+ return true;
}
-#endif
-static int __init olpc_init(void)
+static int __init add_xo1_platform_devices(void)
{
- unsigned char *romsig;
+ struct platform_device *pdev;
- /* The ioremap check is dangerous; limit what we run it on */
- if (!is_geode() || cs5535_has_vsa2())
- return 0;
+ pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
- spin_lock_init(&ec_lock);
+ pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
- romsig = ioremap(0xffffffc0, 16);
- if (!romsig)
- return 0;
+ return 0;
+}
- if (strncmp(romsig, "CL1 Q", 7))
- goto unmap;
- if (strncmp(romsig+6, romsig+13, 3)) {
- printk(KERN_INFO "OLPC BIOS signature looks invalid. "
- "Assuming not OLPC\n");
- goto unmap;
- }
+static int __init olpc_init(void)
+{
+ int r = 0;
- printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
- olpc_platform_info.flags |= OLPC_F_PRESENT;
+ if (!olpc_ofw_present() || !platform_detect())
+ return 0;
- /* get the platform revision */
- platform_detect();
+ spin_lock_init(&ec_lock);
/* assume B1 and above models always have a DCON */
if (olpc_board_at_least(olpc_board(0xb1)))
@@ -246,8 +257,10 @@ static int __init olpc_init(void)
(unsigned char *) &olpc_platform_info.ecver, 1);
#ifdef CONFIG_PCI_OLPC
- /* If the VSA exists let it emulate PCI, if not emulate in kernel */
- if (!cs5535_has_vsa2())
+ /* If the VSA exists let it emulate PCI, if not emulate in kernel.
+ * XO-1 only. */
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
+ !cs5535_has_vsa2())
x86_init.pci.arch_init = pci_olpc_init;
#endif
@@ -256,8 +269,12 @@ static int __init olpc_init(void)
olpc_platform_info.boardrev >> 4,
olpc_platform_info.ecver);
-unmap:
- iounmap(romsig);
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
+ r = add_xo1_platform_devices();
+ if (r)
+ return r;
+ }
+
return 0;
}
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
new file mode 100644
index 000000000000..787320464379
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -0,0 +1,112 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/olpc_ofw.h>
+
+/* address of OFW callback interface; will be NULL if OFW isn't found */
+static int (*olpc_ofw_cif)(int *);
+
+/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
+u32 olpc_ofw_pgd __initdata;
+
+static DEFINE_SPINLOCK(ofw_lock);
+
+#define MAXARGS 10
+
+void __init setup_olpc_ofw_pgd(void)
+{
+ pgd_t *base, *ofw_pde;
+
+ if (!olpc_ofw_cif)
+ return;
+
+ /* fetch OFW's PDE */
+ base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+ if (!base) {
+ printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
+ olpc_ofw_cif = NULL;
+ return;
+ }
+ ofw_pde = &base[OLPC_OFW_PDE_NR];
+
+ /* install OFW's PDE permanently into the kernel's pgtable */
+ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+ /* implicit optimization barrier here due to uninline function return */
+
+ early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+}
+
+int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res)
+{
+ int ofw_args[MAXARGS + 3];
+ unsigned long flags;
+ int ret, i, *p;
+
+ BUG_ON(nr_args + nr_res > MAXARGS);
+
+ if (!olpc_ofw_cif)
+ return -EIO;
+
+ ofw_args[0] = (int)name;
+ ofw_args[1] = nr_args;
+ ofw_args[2] = nr_res;
+
+ p = &ofw_args[3];
+ for (i = 0; i < nr_args; i++, p++)
+ *p = (int)args[i];
+
+ /* call into ofw */
+ spin_lock_irqsave(&ofw_lock, flags);
+ ret = olpc_ofw_cif(ofw_args);
+ spin_unlock_irqrestore(&ofw_lock, flags);
+
+ if (!ret) {
+ for (i = 0; i < nr_res; i++, p++)
+ *((int *)res[i]) = *p;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__olpc_ofw);
+
+bool olpc_ofw_present(void)
+{
+ return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
+/* OFW cif _should_ be above this address */
+#define OFW_MIN 0xff000000
+
+/* OFW starts on a 1MB boundary */
+#define OFW_BOUND (1<<20)
+
+void __init olpc_ofw_detect(void)
+{
+ struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
+ unsigned long start;
+
+ /* ensure OFW booted us by checking for "OFW " string */
+ if (hdr->ofw_magic != OLPC_OFW_SIG)
+ return;
+
+ olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
+
+ if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
+ printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
+ (unsigned long)olpc_ofw_cif);
+ olpc_ofw_cif = NULL;
+ return;
+ }
+
+ /* determine where OFW starts in memory */
+ start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
+ printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
+ (unsigned long)olpc_ofw_cif, (-start) >> 20);
+ reserve_top_address(-start);
+}
diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile
new file mode 100644
index 000000000000..762b4c7f4314
--- /dev/null
+++ b/arch/x86/platform/scx200/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SCx200) += scx200.o
+scx200-y += scx200_32.o
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
index 7e004acbe526..7e004acbe526 100644
--- a/arch/x86/kernel/scx200_32.c
+++ b/arch/x86/platform/scx200/scx200_32.c
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
new file mode 100644
index 000000000000..cc5db1168a5e
--- /dev/null
+++ b/arch/x86/platform/sfi/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SFI) += sfi.o
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/platform/sfi/sfi.c
index cb22acf3ed09..dd4c281ffe57 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -34,7 +34,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-void __init mp_sfi_register_lapic_address(unsigned long address)
+static void __init mp_sfi_register_lapic_address(unsigned long address)
{
mp_lapic_addr = address;
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
}
/* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __cpuinit mp_sfi_register_lapic(u8 id)
{
if (MAX_APICS - id <= 0) {
pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
new file mode 100644
index 000000000000..6c40995fefb8
--- /dev/null
+++ b/arch/x86/platform/uv/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 8bc57baaa9ad..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 7fea555929e2..20ea20a39e2a 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -8,6 +8,7 @@
*/
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -22,19 +23,37 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
-struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
+/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
+static int timeout_base_ns[] = {
+ 20,
+ 160,
+ 1280,
+ 10240,
+ 81920,
+ 655360,
+ 5242880,
+ 167772160
};
-
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
-
+static int timeout_us;
static int nobau;
+static int baudisabled;
+static spinlock_t disable_lock;
+static cycles_t congested_cycles;
+
+/* tunables: */
+static int max_bau_concurrent = MAX_BAU_CONCURRENT;
+static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_response_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
+
static int __init setup_nobau(char *arg)
{
nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-struct reset_args {
- int sender;
-};
-
/*
* Determine the first node on a uvhub. 'Nodes' are used for kernel
* memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
struct ptc_stats *stat;
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_retries++;
/*
* cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
slot2 = msg2 - mdp->va_queue_first;
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg2->sw_ack_vector << 8) |
- msg2->sw_ack_vector);
+ msg_res = msg2->sw_ack_vector;
/*
* This is a message retry; clear the resources held
* by the previous message only if they timed out.
* If it has not timed out we have an unexpected
* situation to report.
*/
- if (mmr & (msg_res << 8)) {
+ if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
/*
* is the resource timed out?
* make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
cancel_count++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << 8) | msg_res);
- } else
- printk(KERN_INFO "note bau retry: no effect\n");
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
+ }
}
}
if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
* This must be a normal message, or retry of a normal message
*/
msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
bcp = &per_cpu(bau_control, smp_processor_id());
rap = (struct reset_args *)ptr;
- stat = &per_cpu(ptcstats, bcp->cpu);
+ stat = bcp->statp;
stat->d_resets++;
/*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
*/
mmr = uv_read_local_mmr
(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg->sw_ack_vector << 8) |
- msg->sw_ack_vector);
+ msg_res = msg->sw_ack_vector;
if (mmr & msg_res) {
stat->d_rcanceled++;
uv_write_local_mmr(
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- msg_res);
+ (msg_res << UV_SW_ACK_NPENDING) |
+ msg_res);
}
}
}
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift, int this_cpu,
struct bau_control *bcp, struct bau_control *smaster, long try)
{
- int relaxes = 0;
unsigned long descriptor_status;
- unsigned long mmr;
- unsigned long mask;
cycles_t ttime;
- cycles_t timeout_time;
- struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster;
hmaster = bcp->uvhub_master;
- timeout_time = get_cycles() + bcp->timeout_interval;
/* spin on the status MMR, waiting for it to go idle */
while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* pending. In that case hardware returns the
* ERROR that looks like a destination timeout.
*/
- if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+ if (cycles_2_us(ttime - bcp->send_message) <
+ timeout_us) {
bcp->conseccompletes = 0;
return FLUSH_RETRY_PLUGGED;
}
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* descriptor_status is still BUSY
*/
cpu_relax();
- relaxes++;
- if (relaxes >= 10000) {
- relaxes = 0;
- if (get_cycles() > timeout_time) {
- quiesce_local_uvhub(hmaster);
-
- /* single-thread the register change */
- spin_lock(&hmaster->masks_lock);
- mmr = uv_read_local_mmr(mmr_offset);
- mask = 0UL;
- mask |= (3UL < right_shift);
- mask = ~mask;
- mmr &= mask;
- uv_write_local_mmr(mmr_offset, mmr);
- spin_unlock(&hmaster->masks_lock);
- end_uvhub_quiesce(hmaster);
- stat->s_busy++;
- return FLUSH_GIVEUP;
- }
- }
}
}
bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
return 1;
}
+/*
+ * Our retries are blocked by all destination swack resources being
+ * in use, and a timeout is pending. In that case hardware immediately
+ * returns the ERROR that looks like a destination timeout.
+ */
+static void
+destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ udelay(bcp->plugged_delay);
+ bcp->plugged_tries++;
+ if (bcp->plugged_tries >= bcp->plugsb4reset) {
+ bcp->plugged_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+}
+
+static void
+destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ hmaster->max_bau_concurrent = 1;
+ bcp->timeout_tries++;
+ if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
+ bcp->timeout_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+}
+
+/*
+ * Completions are taking a very long time due to a congested numalink
+ * network.
+ */
+static void
+disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+
+ /* let only one cpu do this disabling */
+ spin_lock(&disable_lock);
+ if (!baudisabled && bcp->period_requests &&
+ ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+ /* it becomes this cpu's job to turn on the use of the
+ BAU again */
+ baudisabled = 1;
+ bcp->set_bau_off = 1;
+ bcp->set_bau_on_time = get_cycles() +
+ sec_2_cycles(bcp->congested_period);
+ stat->s_bau_disabled++;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 1;
+ }
+ }
+ spin_unlock(&disable_lock);
+}
+
/**
* uv_flush_send_and_wait
*
* Send a broadcast and wait for it to complete.
*
- * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * The flush_mask contains the cpus the broadcast is to be sent to including
* cpus that are on the local uvhub.
*
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
*/
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask,
- struct bau_control *bcp)
+int uv_flush_send_and_wait(struct bau_desc *bau_desc,
+ struct cpumask *flush_mask, struct bau_control *bcp)
{
int right_shift;
- int uvhub;
- int bit;
int completion_status = 0;
int seq_number = 0;
long try = 0;
int cpu = bcp->uvhub_cpu;
int this_cpu = bcp->cpu;
- int this_uvhub = bcp->uvhub;
unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
- struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+ cycles_t elapsed;
+ struct ptc_stats *stat = bcp->statp;
struct bau_control *smaster = bcp->socket_master;
struct bau_control *hmaster = bcp->uvhub_master;
- /*
- * Spin here while there are hmaster->max_concurrent or more active
- * descriptors. This is the per-uvhub 'throttle'.
- */
if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent)) {
+ hmaster->max_bau_concurrent)) {
stat->s_throttles++;
do {
cpu_relax();
} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
&hmaster->active_descriptor_count,
- hmaster->max_concurrent));
+ hmaster->max_bau_concurrent));
}
-
while (hmaster->uvhub_quiesce)
cpu_relax();
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
}
time1 = get_cycles();
do {
- /*
- * Every message from any given cpu gets a unique message
- * sequence number. But retries use that same number.
- * Our message may have timed out at the destination because
- * all sw-ack resources are in use and there is a timeout
- * pending there. In that case, our last send never got
- * placed into the queue and we need to persist until it
- * does.
- *
- * Make any retry a type MSG_RETRY so that the destination will
- * free any resource held by a previous message from this cpu.
- */
if (try == 0) {
- /* use message type set by the caller the first time */
+ bau_desc->header.msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
- /* use RETRY type on all the rest; same sequence */
bau_desc->header.msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
bcp->uvhub_cpu;
bcp->send_message = get_cycles();
-
uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
try++;
completion_status = uv_wait_completion(bau_desc, mmr_offset,
right_shift, this_cpu, bcp, smaster, try);
if (completion_status == FLUSH_RETRY_PLUGGED) {
- /*
- * Our retries may be blocked by all destination swack
- * resources being consumed, and a timeout pending. In
- * that case hardware immediately returns the ERROR
- * that looks like a destination timeout.
- */
- udelay(TIMEOUT_DELAY);
- bcp->plugged_tries++;
- if (bcp->plugged_tries >= PLUGSB4RESET) {
- bcp->plugged_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_plug++;
- }
+ destination_plugged(bau_desc, bcp, hmaster, stat);
} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- hmaster->max_concurrent = 1;
- bcp->timeout_tries++;
- udelay(TIMEOUT_DELAY);
- if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
- bcp->timeout_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_timeout++;
- }
+ destination_timeout(bau_desc, bcp, hmaster, stat);
}
- if (bcp->ipi_attempts >= 3) {
+ if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
bcp->ipi_attempts = 0;
completion_status = FLUSH_GIVEUP;
break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
(completion_status == FLUSH_RETRY_TIMEOUT));
time2 = get_cycles();
-
- if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
- && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
- hmaster->max_concurrent++;
-
- /*
- * hold any cpu not timing out here; no other cpu currently held by
- * the 'throttle' should enter the activation code
- */
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if ((completion_status == FLUSH_COMPLETE) &&
+ (bcp->conseccompletes > bcp->complete_threshold) &&
+ (hmaster->max_bau_concurrent <
+ hmaster->max_bau_concurrent_constant))
+ hmaster->max_bau_concurrent++;
while (hmaster->uvhub_quiesce)
cpu_relax();
atomic_dec(&hmaster->active_descriptor_count);
-
- /* guard against cycles wrap */
- if (time2 > time1)
- stat->s_time += (time2 - time1);
- else
- stat->s_requestor--; /* don't count this one */
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > congested_cycles) &&
+ (bcp->period_requests > bcp->congested_reps)) {
+ disable_for_congestion(bcp, stat);
+ }
+ }
+ } else
+ stat->s_requestor--;
if (completion_status == FLUSH_COMPLETE && try > 1)
stat->s_retriesok++;
else if (completion_status == FLUSH_GIVEUP) {
- /*
- * Cause the caller to do an IPI-style TLB shootdown on
- * the target cpu's, all of which are still in the mask.
- */
stat->s_giveup++;
- return flush_mask;
- }
-
- /*
- * Success, so clear the remote cpu's from the mask so we don't
- * use the IPI method of shootdown on them.
- */
- for_each_cpu(bit, flush_mask) {
- uvhub = uv_cpu_to_blade_id(bit);
- if (uvhub == this_uvhub)
- continue;
- cpumask_clear_cpu(bit, flush_mask);
+ return 1;
}
- if (!cpumask_empty(flush_mask))
- return flush_mask;
-
- return NULL;
+ return 0;
}
/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- int remotes;
int tcpu;
int uvhub;
int locals = 0;
+ int remotes = 0;
+ int hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
+ struct bau_control *tbcp;
+ /* kernel was booted 'nobau' */
if (nobau)
return cpumask;
bcp = &per_cpu(bau_control, cpu);
+ stat = bcp->statp;
+
+ /* bau was disabled due to slow response */
+ if (bcp->baudisabled) {
+ /* the cpu that disabled it must re-enable it */
+ if (bcp->set_bau_off) {
+ if (get_cycles() >= bcp->set_bau_on_time) {
+ stat->s_bau_reenabled++;
+ baudisabled = 0;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
+ }
+ }
+ }
+ return cpumask;
+ }
+
/*
* Each sending cpu has a per-cpu mask which it fills from the caller's
- * cpu mask. Only remote cpus are converted to uvhubs and copied.
+ * cpu mask. All cpus are converted to uvhubs and copied to the
+ * activation descriptor.
*/
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
- /*
- * copy cpumask to flush_mask, removing current cpu
- * (current cpu should already have been flushed by the caller and
- * should never be returned if we return flush_mask)
- */
+ /* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
if (cpu_isset(cpu, *cpumask))
- locals++; /* current cpu was targeted */
+ stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
- remotes = 0;
+
+ /* cpu statistics */
for_each_cpu(tcpu, flush_mask) {
uvhub = uv_cpu_to_blade_id(tcpu);
- if (uvhub == bcp->uvhub) {
- locals++;
- continue;
- }
bau_uvhub_set(uvhub, &bau_desc->distribution);
- remotes++;
- }
- if (remotes == 0) {
- /*
- * No off_hub flushing; return status for local hub.
- * Return the caller's mask if all were local (the current
- * cpu may be in that mask).
- */
- if (locals)
- return cpumask;
+ if (uvhub == bcp->uvhub)
+ locals++;
else
- return NULL;
+ remotes++;
}
- stat = &per_cpu(ptcstats, cpu);
+ if ((locals + remotes) == 0)
+ return NULL;
stat->s_requestor++;
- stat->s_ntargcpu += remotes;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
remotes = bau_uvhub_weight(&bau_desc->distribution);
- stat->s_ntarguvhub += remotes;
- if (remotes >= 16)
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
+ } else
+ stat->s_ntargremoteuvhub += hubs;
+ stat->s_ntarguvhub += hubs;
+ if (hubs >= 16)
stat->s_ntarguvhub16++;
- else if (remotes >= 8)
+ else if (hubs >= 8)
stat->s_ntarguvhub8++;
- else if (remotes >= 4)
+ else if (hubs >= 4)
stat->s_ntarguvhub4++;
- else if (remotes >= 2)
+ else if (hubs >= 2)
stat->s_ntarguvhub2++;
else
stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
bau_desc->payload.sending_cpu = cpu;
/*
- * uv_flush_send_and_wait returns null if all cpu's were messaged, or
- * the adjusted flush_mask if any cpu's were not messaged.
+ * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+ * or 1 if it gave up and the original cpumask should be returned.
*/
- return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
+ if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+ return NULL;
+ else
+ return cpumask;
}
/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());
- stat = &per_cpu(ptcstats, smp_processor_id());
+ stat = bcp->statp;
msgdesc.va_queue_first = bcp->va_queue_first;
msgdesc.va_queue_last = bcp->va_queue_last;
msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
}
static inline unsigned long long
-millisec_2_cycles(unsigned long millisec)
+microsec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
- ns = millisec * 1000;
+ ns = microsec * 1000;
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
return cyc;
}
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
if (!cpu) {
seq_printf(file,
- "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
+ "# cpu sent stime self locals remotes ncpus localhub ");
+ seq_printf(file,
+ "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
seq_printf(file,
"retries rok resetp resett giveup sto bz throt ");
seq_printf(file,
"sw_ack recv rtime all ");
seq_printf(file,
- "one mult none retry canc nocan reset rcan\n");
+ "one mult none retry canc nocan reset rcan ");
+ seq_printf(file,
+ "disable enable\n");
}
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file,
"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
cpu, stat->s_requestor, cycles_2_us(stat->s_time),
- stat->s_ntarguvhub, stat->s_ntarguvhub16,
+ stat->s_ntargself, stat->s_ntarglocals,
+ stat->s_ntargremotes, stat->s_ntargcpu,
+ stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+ stat->s_ntarguvhub, stat->s_ntarguvhub16);
+ seq_printf(file, "%ld %ld %ld %ld %ld ",
stat->s_ntarguvhub8, stat->s_ntarguvhub4,
stat->s_ntarguvhub2, stat->s_ntarguvhub1,
- stat->s_ntargcpu, stat->s_dtimeout);
+ stat->s_dtimeout);
seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
stat->s_retry_messages, stat->s_retriesok,
stat->s_resets_plug, stat->s_resets_timeout,
stat->s_giveup, stat->s_stimeout,
stat->s_busy, stat->s_throttles);
+
/* destination side statistics */
seq_printf(file,
- "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+ "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,41 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
stat->d_nomsg, stat->d_retries, stat->d_canceled,
stat->d_nocanceled, stat->d_resets,
stat->d_rcanceled);
+ seq_printf(file, "%ld %ld\n",
+ stat->s_bau_disabled, stat->s_bau_reenabled);
}
return 0;
}
/*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char *buf;
+ int ret;
+
+ buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
+ "max_bau_concurrent plugged_delay plugsb4reset",
+ "timeoutsb4reset ipi_reset_limit complete_threshold",
+ "congested_response_us congested_reps congested_period",
+ max_bau_concurrent, plugged_delay, plugsb4reset,
+ timeoutsb4reset, ipi_reset_limit, complete_threshold,
+ congested_response_us, congested_reps, congested_period);
+
+ if (!buf)
+ return -ENOMEM;
+
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
+ kfree(buf);
+ return ret;
+}
+
+/*
* -1: resetf the statistics
* 0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
*/
static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
size_t count, loff_t *data)
@@ -983,7 +1031,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
long input_arg;
char optstr[64];
struct ptc_stats *stat;
- struct bau_control *bcp;
if (count == 0 || count > sizeof(optstr))
return -EINVAL;
@@ -1059,29 +1106,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
"reset: number of ipi-style reset requests processed\n");
printk(KERN_DEBUG
"rcan: number messages canceled by reset requests\n");
+ printk(KERN_DEBUG
+ "disable: number times use of the BAU was disabled\n");
+ printk(KERN_DEBUG
+ "enable: number times use of the BAU was re-enabled\n");
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
memset(stat, 0, sizeof(struct ptc_stats));
}
- } else {
- uv_bau_max_concurrent = input_arg;
- bcp = &per_cpu(bau_control, smp_processor_id());
- if (uv_bau_max_concurrent < 1 ||
- uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
- "Error: BAU max concurrent %d; %d is invalid\n",
- bcp->max_concurrent, uv_bau_max_concurrent);
- return -EINVAL;
- }
- printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
- uv_bau_max_concurrent);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurrent = uv_bau_max_concurrent;
+ }
+
+ return count;
+}
+
+static int local_atoi(const char *name)
+{
+ int val = 0;
+
+ for (;; name++) {
+ switch (*name) {
+ case '0' ... '9':
+ val = 10*val+(*name-'0');
+ break;
+ default:
+ return val;
}
}
+}
+
+/*
+ * set the tunables
+ * 0 values reset them to defaults
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int cnt = 0;
+ int val;
+ char *p;
+ char *q;
+ char instr[64];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+ instr[count] = '\0';
+ /* count the fields */
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (; *p; p = q + strspn(q, WHITESPACE)) {
+ q = p + strcspn(p, WHITESPACE);
+ cnt++;
+ if (q == p)
+ break;
+ }
+ if (cnt != 9) {
+ printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+ return -EINVAL;
+ }
+
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+ q = p + strcspn(p, WHITESPACE);
+ val = local_atoi(p);
+ switch (cnt) {
+ case 0:
+ if (val == 0) {
+ max_bau_concurrent = MAX_BAU_CONCURRENT;
+ max_bau_concurrent_constant =
+ MAX_BAU_CONCURRENT;
+ continue;
+ }
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (val < 1 || val > bcp->cpus_in_uvhub) {
+ printk(KERN_DEBUG
+ "Error: BAU max concurrent %d is invalid\n",
+ val);
+ return -EINVAL;
+ }
+ max_bau_concurrent = val;
+ max_bau_concurrent_constant = val;
+ continue;
+ case 1:
+ if (val == 0)
+ plugged_delay = PLUGGED_DELAY;
+ else
+ plugged_delay = val;
+ continue;
+ case 2:
+ if (val == 0)
+ plugsb4reset = PLUGSB4RESET;
+ else
+ plugsb4reset = val;
+ continue;
+ case 3:
+ if (val == 0)
+ timeoutsb4reset = TIMEOUTSB4RESET;
+ else
+ timeoutsb4reset = val;
+ continue;
+ case 4:
+ if (val == 0)
+ ipi_reset_limit = IPI_RESET_LIMIT;
+ else
+ ipi_reset_limit = val;
+ continue;
+ case 5:
+ if (val == 0)
+ complete_threshold = COMPLETE_THRESHOLD;
+ else
+ complete_threshold = val;
+ continue;
+ case 6:
+ if (val == 0)
+ congested_response_us = CONGESTED_RESPONSE_US;
+ else
+ congested_response_us = val;
+ continue;
+ case 7:
+ if (val == 0)
+ congested_reps = CONGESTED_REPS;
+ else
+ congested_reps = val;
+ continue;
+ case 8:
+ if (val == 0)
+ congested_period = CONGESTED_PERIOD;
+ else
+ congested_period = val;
+ continue;
+ }
+ if (q == p)
+ break;
+ }
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
return count;
}
@@ -1097,6 +1273,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
return seq_open(file, &uv_ptc_seq_ops);
}
+static int tunables_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
static const struct file_operations proc_uv_ptc_operations = {
.open = uv_ptc_proc_open,
.read = seq_read,
@@ -1105,6 +1286,13 @@ static const struct file_operations proc_uv_ptc_operations = {
.release = seq_release,
};
+static const struct file_operations tunables_fops = {
+ .open = tunables_open,
+ .read = tunables_read,
+ .write = tunables_write,
+ .llseek = default_llseek,
+};
+
static int __init uv_ptc_init(void)
{
struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1307,20 @@ static int __init uv_ptc_init(void)
UV_PTC_BASENAME);
return -EINVAL;
}
+
+ tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+ if (!tunables_dir) {
+ printk(KERN_ERR "unable to create debugfs directory %s\n",
+ UV_BAU_TUNABLES_DIR);
+ return -EINVAL;
+ }
+ tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+ tunables_dir, NULL, &tunables_fops);
+ if (!tunables_file) {
+ printk(KERN_ERR "unable to create debugfs file %s\n",
+ UV_BAU_TUNABLES_FILE);
+ return -EINVAL;
+ }
return 0;
}
@@ -1259,15 +1461,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
}
/*
+ * We will set BAU_MISC_CONTROL with a timeout period.
+ * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
+ * So the destination timeout period has be be calculated from them.
+ */
+static int
+calculate_destination_timeout(void)
+{
+ unsigned long mmr_image;
+ int mult1;
+ int mult2;
+ int index;
+ int base;
+ int ret;
+ unsigned long ts_ns;
+
+ mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ base = timeout_base_ns[index];
+ ts_ns = base * mult1 * mult2;
+ ret = ts_ns / 1000;
+ return ret;
+}
+
+/*
* initialize the bau_control structure for each cpu
*/
-static void uv_init_per_cpu(int nuvhubs)
+static void __init uv_init_per_cpu(int nuvhubs)
{
- int i, j, k;
+ int i;
int cpu;
int pnode;
int uvhub;
+ int have_hmaster;
short socket = 0;
+ unsigned short socket_mask;
+ unsigned char *uvhub_mask;
struct bau_control *bcp;
struct uvhub_desc *bdp;
struct socket_desc *sdp;
@@ -1278,7 +1510,7 @@ static void uv_init_per_cpu(int nuvhubs)
short cpu_number[16];
};
struct uvhub_desc {
- short num_sockets;
+ unsigned short socket_mask;
short num_cpus;
short uvhub;
short pnode;
@@ -1286,57 +1518,84 @@ static void uv_init_per_cpu(int nuvhubs)
};
struct uvhub_desc *uvhub_descs;
+ timeout_us = calculate_destination_timeout();
+
uvhub_descs = (struct uvhub_desc *)
kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
memset(bcp, 0, sizeof(struct bau_control));
- spin_lock_init(&bcp->masks_lock);
- bcp->max_concurrent = uv_bau_max_concurrent;
pnode = uv_cpu_hub_info(cpu)->pnode;
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
bdp->num_cpus++;
bdp->uvhub = uvhub;
bdp->pnode = pnode;
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = millisec_2_cycles(3);
- /* kludge: assume uv_hub.h is constant */
- socket = (cpu_physical_id(cpu)>>5)&1;
- if (socket >= bdp->num_sockets)
- bdp->num_sockets = socket+1;
+ /* kludge: 'assuming' one node per socket, and assuming that
+ disabling a socket just leaves a gap in node numbers */
+ socket = (cpu_to_node(cpu) & 1);
+ bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
}
- socket = 0;
- for_each_possible_blade(uvhub) {
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
+ continue;
+ have_hmaster = 0;
bdp = &uvhub_descs[uvhub];
- for (i = 0; i < bdp->num_sockets; i++) {
- sdp = &bdp->socket[i];
- for (j = 0; j < sdp->num_cpus; j++) {
- cpu = sdp->cpu_number[j];
+ socket_mask = bdp->socket_mask;
+ socket = 0;
+ while (socket_mask) {
+ if (!(socket_mask & 1))
+ goto nextsocket;
+ sdp = &bdp->socket[socket];
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
bcp = &per_cpu(bau_control, cpu);
bcp->cpu = cpu;
- if (j == 0) {
+ if (i == 0) {
smaster = bcp;
- if (i == 0)
+ if (!have_hmaster) {
+ have_hmaster++;
hmaster = bcp;
+ }
}
bcp->cpus_in_uvhub = bdp->num_cpus;
bcp->cpus_in_socket = sdp->num_cpus;
bcp->socket_master = smaster;
+ bcp->uvhub = bdp->uvhub;
bcp->uvhub_master = hmaster;
- for (k = 0; k < DEST_Q_SIZE; k++)
- bcp->socket_acknowledge_count[k] = 0;
- bcp->uvhub_cpu =
- uv_cpu_hub_info(cpu)->blade_processor_id;
+ bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
+ blade_processor_id;
}
+nextsocket:
socket++;
+ socket_mask = (socket_mask >> 1);
}
}
kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
+ bcp->max_bau_concurrent = max_bau_concurrent;
+ bcp->max_bau_concurrent_constant = max_bau_concurrent;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->congested_response_us = congested_response_us;
+ bcp->congested_reps = congested_reps;
+ bcp->congested_period = congested_period;
+ }
}
/*
@@ -1361,10 +1620,11 @@ static int __init uv_bau_init(void)
zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
GFP_KERNEL, cpu_to_node(cur_cpu));
- uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nuvhubs = uv_num_possible_blades();
+ spin_lock_init(&disable_lock);
+ congested_cycles = microsec_2_cycles(congested_response_us);
uv_init_per_cpu(nuvhubs);
@@ -1383,15 +1643,19 @@ static int __init uv_bau_init(void)
alloc_intr_gate(vector, uv_bau_message_intr1);
for_each_possible_blade(uvhub) {
- pnode = uv_blade_to_pnode(uvhub);
- /* INIT the bau */
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
- mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+ if (uv_blade_nr_possible_cpus(uvhub)) {
+ pnode = uv_blade_to_pnode(uvhub);
+ /* INIT the bau */
+ uv_write_global_mmr64(pnode,
+ UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+ ((unsigned long)1 << 63));
+ mmr = 1; /* should be 1 to broadcast to both sockets */
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
+ mmr);
+ }
}
return 0;
}
core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
+fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 1132129db792..7b24460917d5 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{
static spinlock_t uv_irq_lock;
static struct rb_root uv_irq_root;
-static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
+static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
-static void uv_noop(unsigned int irq)
-{
-}
-
-static unsigned int uv_noop_ret(unsigned int irq)
-{
- return 0;
-}
+static void uv_noop(struct irq_data *data) { }
-static void uv_ack_apic(unsigned int irq)
+static void uv_ack_apic(struct irq_data *data)
{
ack_APIC_irq();
}
static struct irq_chip uv_irq_chip = {
- .name = "UV-CORE",
- .startup = uv_noop_ret,
- .shutdown = uv_noop,
- .enable = uv_noop,
- .disable = uv_noop,
- .ack = uv_noop,
- .mask = uv_noop,
- .unmask = uv_noop,
- .eoi = uv_ack_apic,
- .end = uv_noop,
- .set_affinity = uv_set_irq_affinity,
+ .name = "UV-CORE",
+ .irq_mask = uv_noop,
+ .irq_unmask = uv_noop,
+ .irq_eoi = uv_ack_apic,
+ .irq_set_affinity = uv_set_irq_affinity,
};
/*
@@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset, int limit)
{
const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- int mmr_pnode;
+ struct irq_cfg *cfg = get_irq_chip_data(irq);
unsigned long mmr_value;
struct uv_IO_APIC_route_entry *entry;
- int err;
+ int mmr_pnode, err;
BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
sizeof(unsigned long));
- cfg = irq_cfg(irq);
-
err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
if (limit == UV_AFFINITY_CPU)
- desc->status |= IRQ_NO_BALANCING;
+ irq_set_status_flags(irq, IRQ_NO_BALANCING);
else
- desc->status |= IRQ_MOVE_PCNTXT;
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
irq_name);
@@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
}
-static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
+ struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
- unsigned long mmr_value;
+ unsigned long mmr_value, mmr_offset;
struct uv_IO_APIC_route_entry *entry;
- unsigned long mmr_offset;
int mmr_pnode;
- if (set_desc_affinity(desc, mask, &dest))
+ if (__ioapic_set_affinity(data, mask, &dest))
return -1;
mmr_value = 0;
@@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
entry->dest = dest;
/* Get previously stored MMR and pnode of hub sourcing interrupts */
- if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
+ if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
return -1;
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile
new file mode 100644
index 000000000000..91bc17ab2fd5
--- /dev/null
+++ b/arch/x86/platform/visws/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_VISWS) += visws_quirks.o
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index e680ea52db9b..3371bd053b89 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -66,10 +66,7 @@ static void __init visws_time_init(void)
}
/* Replaces the default init_ISA_irqs in the generic setup */
-static void __init visws_pre_intr_init(void)
-{
- init_VISWS_APIC_irqs();
-}
+static void __init visws_pre_intr_init(void);
/* Quirk for machine specific memory setup. */
@@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq)
/*
* This is the SGI Cobalt (IO-)APIC:
*/
-
-static void enable_cobalt_irq(unsigned int irq)
+static void enable_cobalt_irq(struct irq_data *data)
{
- co_apic_set(is_co_apic(irq), irq);
+ co_apic_set(is_co_apic(data->irq), data->irq);
}
-static void disable_cobalt_irq(unsigned int irq)
+static void disable_cobalt_irq(struct irq_data *data)
{
- int entry = is_co_apic(irq);
+ int entry = is_co_apic(data->irq);
co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
co_apic_read(CO_APIC_LO(entry));
}
-/*
- * "irq" really just serves to identify the device. Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
+static void ack_cobalt_irq(struct irq_data *data)
{
unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
- return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- disable_cobalt_irq(irq);
+ disable_cobalt_irq(data);
apic_write(APIC_EOI, APIC_EIO_ACK);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
-static void end_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
static struct irq_chip cobalt_irq_type = {
- .name = "Cobalt-APIC",
- .startup = startup_cobalt_irq,
- .shutdown = disable_cobalt_irq,
- .enable = enable_cobalt_irq,
- .disable = disable_cobalt_irq,
- .ack = ack_cobalt_irq,
- .end = end_cobalt_irq,
+ .name = "Cobalt-APIC",
+ .irq_enable = enable_cobalt_irq,
+ .irq_disable = disable_cobalt_irq,
+ .irq_ack = ack_cobalt_irq,
};
@@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = {
* interrupt controller type, and through a special virtual interrupt-
* controller. Device drivers only see the virtual interrupt sources.
*/
-static unsigned int startup_piix4_master_irq(unsigned int irq)
+static unsigned int startup_piix4_master_irq(struct irq_data *data)
{
legacy_pic->init(0);
-
- return startup_cobalt_irq(irq);
+ enable_cobalt_irq(data);
}
-static void end_piix4_master_irq(unsigned int irq)
+static void end_piix4_master_irq(struct irq_data *data)
{
unsigned long flags;
spin_lock_irqsave(&cobalt_lock, flags);
- enable_cobalt_irq(irq);
+ enable_cobalt_irq(data);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
static struct irq_chip piix4_master_irq_type = {
- .name = "PIIX4-master",
- .startup = startup_piix4_master_irq,
- .ack = ack_cobalt_irq,
- .end = end_piix4_master_irq,
+ .name = "PIIX4-master",
+ .irq_startup = startup_piix4_master_irq,
+ .irq_ack = ack_cobalt_irq,
};
+static void pii4_mask(struct irq_data *data) { }
static struct irq_chip piix4_virtual_irq_type = {
- .name = "PIIX4-virtual",
+ .name = "PIIX4-virtual",
+ .mask = pii4_mask,
};
-
/*
* PIIX4-8259 master/virtual functions to handle interrupt requests
* from legacy devices: floppy, parallel, serial, rtc.
@@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = {
*/
static irqreturn_t piix4_master_intr(int irq, void *dev_id)
{
- int realirq;
- struct irq_desc *desc;
unsigned long flags;
+ int realirq;
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
- desc = irq_to_desc(realirq);
-
/*
* handle this 'virtual interrupt' as a Cobalt one now.
*/
- kstat_incr_irqs_this_cpu(realirq, desc);
-
- if (likely(desc->action != NULL))
- handle_IRQ_event(realirq, desc->action);
-
- if (!(desc->status & IRQ_DISABLED))
- legacy_pic->chip->unmask(realirq);
+ generic_handle_irq(realirq);
return IRQ_HANDLED;
@@ -624,41 +578,35 @@ static struct irqaction cascade_action = {
static inline void set_piix4_virtual_irq_type(void)
{
- piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
piix4_virtual_irq_type.enable = i8259A_chip.unmask;
piix4_virtual_irq_type.disable = i8259A_chip.mask;
+ piix4_virtual_irq_type.unmask = i8259A_chip.unmask;
}
-void init_VISWS_APIC_irqs(void)
+static void __init visws_pre_intr_init(void)
{
int i;
- for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = 0;
- desc->depth = 1;
+ set_piix4_virtual_irq_type();
- if (i == 0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE1) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_8259) {
- desc->chip = &piix4_master_irq_type;
- }
- else if (i < CO_IRQ_APIC0) {
- set_piix4_virtual_irq_type();
- desc->chip = &piix4_virtual_irq_type;
- }
- else if (IS_CO_APIC(i)) {
- desc->chip = &cobalt_irq_type;
- }
+ for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+ struct irq_chip *chip = NULL;
+
+ if (i == 0)
+ chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_IDE0)
+ chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_IDE1)
+ >chip = &cobalt_irq_type;
+ else if (i == CO_IRQ_8259)
+ chip = &piix4_master_irq_type;
+ else if (i < CO_IRQ_APIC0)
+ chip = &piix4_virtual_irq_type;
+ else if (IS_CO_APIC(i))
+ chip = &cobalt_irq_type;
+
+ if (chip)
+ set_irq_chip(i, chip);
}
setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 1290ba54b350..87bb35e34ef1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -4,7 +4,7 @@
* Distribute under GPLv2
*
* Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
@@ -113,6 +113,7 @@ static void __save_processor_state(struct saved_context *ctxt)
void save_processor_state(void)
{
__save_processor_state(&saved_context);
+ save_sched_clock_state();
}
#ifdef CONFIG_X86_32
EXPORT_SYMBOL(save_processor_state);
@@ -229,6 +230,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
void restore_processor_state(void)
{
__restore_processor_state(&saved_context);
+ restore_sched_clock_state();
}
#ifdef CONFIG_X86_32
EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index d24f983ba1e5..460f314d13e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -4,7 +4,7 @@
* Distribute under GPLv2
*
* Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 6b4ffedb93c9..4a2afa1bac51 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
+ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
GCOV_PROFILE := n
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh
new file mode 100755
index 000000000000..7ee90a9b549d
--- /dev/null
+++ b/arch/x86/vdso/checkundef.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 02b442e92007..36df991985b2 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
#ifdef CONFIG_X86_64
-__initcall(sysenter_setup);
+subsys_initcall(sysenter_setup);
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ac74869b8140..4b5d26f108bb 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -67,6 +67,7 @@ static int __init init_vdso_vars(void)
*(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
#include "vextern.h"
#undef VEXTERN
+ vunmap(vbase);
return 0;
oom:
@@ -74,7 +75,7 @@ static int __init init_vdso_vars(void)
vdso_enabled = 0;
return -ENOMEM;
}
-__initcall(init_vdso_vars);
+subsys_initcall(init_vdso_vars);
struct linux_binprm;
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index b83e119fbeb0..5b54892e4bc3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,16 +13,28 @@ config XEN
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
+config XEN_DOM0
+ def_bool y
+ depends on XEN && PCI_XEN && SWIOTLB_XEN
+ depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+
+# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
+# name in tools.
+config XEN_PRIVILEGED_GUEST
+ def_bool XEN_DOM0
+
+config XEN_PVHVM
+ def_bool y
+ depends on XEN
+ depends on X86_LOCAL_APIC
+
config XEN_MAX_DOMAIN_MEMORY
- int "Maximum allowed size of a domain in gigabytes"
- default 8 if X86_32
- default 32 if X86_64
+ int
+ default 128
depends on XEN
help
- The pseudo-physical to machine address array is sized
- according to the maximum possible memory size of a Xen
- domain. This array uses 1 page per gigabyte, so there's no
- need to be too stingy here.
+ This only affects the sizing of some bss arrays, the unused
+ portions of which are freed.
config XEN_SAVE_RESTORE
bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3bb4fc21f4f2..779385158915 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,9 +12,10 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o
+ grant-table.o suspend.o platform-pci-unplug.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee5..7c0fedd98ea0 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
.open = u32_array_open,
.release= xen_array_release,
.read = u32_array_read,
+ .llseek = no_llseek,
};
struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 65d8d79b46a8..235c0f4d3861 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -29,19 +30,23 @@
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
+#include <linux/memblock.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
+#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
@@ -56,6 +61,7 @@
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -76,6 +82,10 @@ struct shared_info xen_dummy_shared_info;
void *xen_initial_gdt;
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
@@ -97,6 +107,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
+static void clamp_max_cpus(void)
+{
+#ifdef CONFIG_SMP
+ if (setup_max_cpus > MAX_VIRT_CPUS)
+ setup_max_cpus = MAX_VIRT_CPUS;
+#endif
+}
+
static void xen_vcpu_setup(int cpu)
{
struct vcpu_register_vcpu_info info;
@@ -104,19 +122,20 @@ static void xen_vcpu_setup(int cpu)
struct vcpu_info *vcpup;
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- if (!have_vcpu_info_placement)
- return; /* already tested, not available */
+ if (cpu < MAX_VIRT_CPUS)
+ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
- vcpup = &per_cpu(xen_vcpu_info, cpu);
+ if (!have_vcpu_info_placement) {
+ if (cpu >= MAX_VIRT_CPUS)
+ clamp_max_cpus();
+ return;
+ }
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
info.mfn = arbitrary_virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
- cpu, vcpup, info.mfn, info.offset);
-
/* Check to see if the hypervisor will put the vcpu_info
structure where we want it, which allows direct access via
a percpu-variable. */
@@ -125,13 +144,11 @@ static void xen_vcpu_setup(int cpu)
if (err) {
printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
have_vcpu_info_placement = 0;
+ clamp_max_cpus();
} else {
/* This cpu is using the registered vcpu info, even if
later ones fail to. */
per_cpu(xen_vcpu, cpu) = vcpup;
-
- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
- cpu, vcpup);
}
}
@@ -220,6 +237,7 @@ static __init void xen_init_cpuid_mask(void)
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
(1 << X86_FEATURE_MCA) | /* disable MCA */
+ (1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
@@ -731,7 +749,6 @@ static void set_xen_basic_apic_ops(void)
#endif
-
static void xen_clts(void)
{
struct multicall_space mcs;
@@ -814,6 +831,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
break;
+ case MSR_IA32_CR_PAT:
+ if (smp_processor_id() == 0)
+ xen_set_pat(((u64)high << 32) | low);
+ break;
+
default:
ret = native_write_msr_safe(msr, low, high);
}
@@ -852,8 +874,6 @@ void xen_setup_vcpu_info_placement(void)
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
if (have_vcpu_info_placement) {
- printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -926,10 +946,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch,
};
-static const struct pv_time_ops xen_time_ops __initdata = {
- .sched_clock = xen_sched_clock,
-};
-
static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid,
@@ -1001,7 +1017,7 @@ static void xen_reboot(int reason)
struct sched_shutdown r = { .reason = reason };
#ifdef CONFIG_SMP
- smp_send_stop();
+ stop_other_cpus();
#endif
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
@@ -1028,6 +1044,23 @@ static void xen_crash_shutdown(struct pt_regs *regs)
xen_reboot(SHUTDOWN_crash);
}
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ xen_reboot(SHUTDOWN_crash);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call= xen_panic_event,
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
static const struct machine_ops __initdata xen_machine_ops = {
.restart = xen_restart,
.halt = xen_machine_halt,
@@ -1067,7 +1100,6 @@ asmlinkage void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
pv_apic_ops = xen_apic_ops;
@@ -1075,13 +1107,7 @@ asmlinkage void __init xen_start_kernel(void)
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
- x86_init.timers.timer_init = xen_time_init;
- x86_init.timers.setup_percpu_clockev = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = x86_init_noop;
-
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
- x86_platform.set_wallclock = xen_set_wallclock;
+ xen_init_time_ops();
/*
* Set up some pagetable state before starting to set any ptes.
@@ -1145,6 +1171,10 @@ asmlinkage void __init xen_start_kernel(void)
pgd = (pgd_t *)xen_start_info->pt_base;
+ if (!xen_initial_domain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
+ __supported_pte_mask |= _PAGE_IOMAP;
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
@@ -1152,8 +1182,14 @@ asmlinkage void __init xen_start_kernel(void)
local_irq_disable();
early_boot_irqs_off();
+ memblock_init();
+
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+ xen_ident_map_ISA();
+
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
init_mm.pgd = pgd;
@@ -1189,6 +1225,8 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
+ if (pci_xen)
+ x86_init.pci.arch_init = pci_xen_init;
} else {
/* Make sure ACS will be enabled */
pci_request_acs();
@@ -1206,3 +1244,139 @@ asmlinkage void __init xen_start_kernel(void)
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
+
+static uint32_t xen_cpuid_base(void)
+{
+ uint32_t base, eax, ebx, ecx, edx;
+ char signature[13];
+
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &ebx, &ecx, &edx);
+ *(uint32_t *)(signature + 0) = ebx;
+ *(uint32_t *)(signature + 4) = ecx;
+ *(uint32_t *)(signature + 8) = edx;
+ signature[12] = 0;
+
+ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+ return base;
+ }
+
+ return 0;
+}
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ *major = eax >> 16;
+ *minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info = xen_info;
+ pv_info.kernel_rpl = 0;
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+
+ return 0;
+}
+
+void xen_hvm_init_shared_info(void)
+{
+ int cpu;
+ struct xen_add_to_physmap xatp;
+ static struct shared_info *shared_info_page = 0;
+
+ if (!shared_info_page)
+ shared_info_page = (struct shared_info *)
+ extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+
+ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions. We don't need the vcpu_info placement
+ * optimizations because we don't use any pv_mmu or pv_irq op on
+ * HVM.
+ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_init_shared_info is run at resume time too and
+ * in that case multiple vcpus might be online. */
+ for_each_online_cpu(cpu) {
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+ .notifier_call = xen_hvm_cpu_notify,
+};
+
+static void __init xen_hvm_guest_init(void)
+{
+ int r;
+ int major, minor;
+
+ r = init_hvm_pv_info(&major, &minor);
+ if (r < 0)
+ return;
+
+ xen_hvm_init_shared_info();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+ register_cpu_notifier(&xen_hvm_cpu_notifier);
+ xen_unplug_emulated_devices();
+ have_vcpu_info_placement = 0;
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+}
+
+static bool __init xen_hvm_platform(void)
+{
+ if (xen_pv_domain())
+ return false;
+
+ if (!xen_cpuid_base())
+ return false;
+
+ return true;
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+ .name = "Xen HVM",
+ .detect = xen_hvm_platform,
+ .init_platform = xen_hvm_guest_init,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 914f04695ce5..c237b810b03f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,8 +42,10 @@
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/gfp.h>
+#include <linux/memblock.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -51,14 +53,21 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
+#include <asm/e820.h>
#include <asm/linkage.h>
+#include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
#include <xen/hvc-console.h>
#include "multicalls.h"
@@ -67,6 +76,13 @@
#define MMU_UPDATE_HISTO 30
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
#ifdef CONFIG_XEN_DEBUG_FS
static struct {
@@ -125,7 +141,8 @@ static inline void check_zero(void)
* large enough to allocate page table pages to allocate the rest.
* Each page can map 2MB.
*/
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
@@ -156,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+unsigned long xen_max_p2m_pfn __read_mostly;
-#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
- /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
- __page_aligned_bss;
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
static inline unsigned p2m_top_index(unsigned long pfn)
{
- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
- return pfn / P2M_ENTRIES_PER_PAGE;
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
static inline unsigned p2m_index(unsigned long pfn)
{
- return pfn % P2M_ENTRIES_PER_PAGE;
+ return pfn % P2M_PER_PAGE;
}
-/* Build the parallel p2m_top_mfn structures */
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should alreay be completely allocated.
+ */
void xen_build_mfn_list_list(void)
{
- unsigned pfn, idx;
+ unsigned long pfn;
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
}
- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
}
}
@@ -207,8 +357,8 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn_list);
- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -216,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void)
{
unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned pfn;
+ unsigned long pfn;
+
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
- p2m_top[topidx] = &mfn_list[pfn];
- }
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
- xen_build_mfn_list_list();
+ p2m_top[topidx] = mid;
+ }
+
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
}
unsigned long get_phys_to_machine(unsigned long pfn)
{
- unsigned topidx, idx;
+ unsigned topidx, mididx, idx;
- if (unlikely(pfn >= MAX_DOMAIN_PAGES))
+ if (unlikely(pfn >= MAX_P2M_PFN))
return INVALID_P2M_ENTRY;
topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
- return p2m_top[topidx][idx];
+
+ return p2m_top[topidx][mididx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-/* install a new p2m_top page */
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
+static void *alloc_p2m_page(void)
{
- unsigned topidx = p2m_top_index(pfn);
- unsigned long **pfnp, *mfnp;
- unsigned i;
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
- pfnp = &p2m_top[topidx];
- mfnp = &p2m_top_mfn[topidx];
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
- p[i] = INVALID_P2M_ENTRY;
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
- *mfnp = virt_to_mfn(p);
- return true;
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
}
- return false;
-}
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
-static void alloc_p2m(unsigned long pfn)
-{
- unsigned long *p;
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
- BUG_ON(p == NULL);
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
- if (!install_p2mtop_page(pfn, p))
- free_page((unsigned long)p);
+ return true;
}
/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- unsigned topidx, idx;
+ unsigned topidx, mididx, idx;
- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
topidx = p2m_top_index(pfn);
- if (p2m_top[topidx] == p2m_missing) {
- if (mfn == INVALID_P2M_ENTRY)
- return true;
- return false;
- }
-
+ mididx = p2m_mid_index(pfn);
idx = p2m_index(pfn);
- p2m_top[topidx][idx] = mfn;
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
return true;
}
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return;
+ return true;
}
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- alloc_p2m(pfn);
+ if (!alloc_p2m(pfn))
+ return false;
if (!__set_phys_to_machine(pfn, mfn))
- BUG();
+ return false;
}
+
+ return true;
}
unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -346,7 +574,8 @@ void make_lowmem_page_readonly(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_wrprotect(*pte);
@@ -361,7 +590,8 @@ void make_lowmem_page_readwrite(void *vaddr)
unsigned int level;
pte = lookup_address(address, &level);
- BUG_ON(pte == NULL);
+ if (pte == NULL)
+ return; /* vaddr missing */
ptev = pte_mkwrite(*pte);
@@ -377,6 +607,34 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
+static bool xen_iomap_pte(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_IOMAP;
+}
+
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ mcs = xen_mc_entry(sizeof(*u));
+ u = mcs.args;
+
+ /* ptep might be kmapped when using 32-bit HIGHPTE */
+ u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ u->val = pte_val_ma(pteval);
+
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+EXPORT_SYMBOL_GPL(xen_set_domain_pte);
+
+static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
+{
+ xen_set_domain_pte(ptep, pteval, DOMID_IO);
+}
+
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -453,6 +711,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
+ if (xen_iomap_pte(pteval)) {
+ xen_set_iomap_pte(ptep, pteval);
+ goto out;
+ }
+
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -517,7 +780,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /*
+ * If there's no mfn for the pfn, then just create an
+ * empty non-present pte. Unfortunately this loses
+ * information about the original pfn, so
+ * pte_mfn_to_pfn is asymmetric.
+ */
+ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+ mfn = 0;
+ flags = 0;
+ }
+
+ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t iomap_pte(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+
+ /* We assume the pte frame number is a MFN, so
+ just use it as-is. */
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
@@ -525,7 +815,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
pteval_t xen_pte_val(pte_t pte)
{
- return pte_mfn_to_pfn(pte.pte);
+ pteval_t pteval = pte.pte;
+
+ /* If this is a WC pte, convert back from Xen WC to Linux WC */
+ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
+ WARN_ON(!pat_enabled);
+ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
+ }
+
+ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
+ return pteval;
+
+ return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -535,9 +836,62 @@ pgdval_t xen_pgd_val(pgd_t pgd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+/*
+ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
+ * are reserved for now, to correspond to the Intel-reserved PAT
+ * types.
+ *
+ * We expect Linux's PAT set as follows:
+ *
+ * Idx PTE flags Linux Xen Default
+ * 0 WB WB WB
+ * 1 PWT WC WT WT
+ * 2 PCD UC- UC- UC-
+ * 3 PCD PWT UC UC UC
+ * 4 PAT WB WC WB
+ * 5 PAT PWT WC WP WT
+ * 6 PAT PCD UC- UC UC-
+ * 7 PAT PCD PWT UC UC UC
+ */
+
+void xen_set_pat(u64 pat)
+{
+ /* We expect Linux to use a PAT setting of
+ * UC UC- WC WB (ignoring the PAT flag) */
+ WARN_ON(pat != 0x0007010600070106ull);
+}
+
pte_t xen_make_pte(pteval_t pte)
{
- pte = pte_pfn_to_mfn(pte);
+ phys_addr_t addr = (pte & PTE_PFN_MASK);
+
+ /* If Linux is trying to set a WC pte, then map to the Xen WC.
+ * If _PAGE_PAT is set, then it probably means it is really
+ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
+ * things work out OK...
+ *
+ * (We should never see kernel mappings with _PAGE_PSE set,
+ * but we could see hugetlbfs mappings, I think.).
+ */
+ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
+ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
+ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
+ }
+
+ /*
+ * Unprivileged domains are allowed to do IOMAPpings for
+ * PCI passthrough, but not map ISA space. The ISA
+ * mappings are just dummy local mappings to keep other
+ * parts of the kernel happy.
+ */
+ if (unlikely(pte & _PAGE_IOMAP) &&
+ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+ pte = iomap_pte(pte);
+ } else {
+ pte &= ~_PAGE_IOMAP;
+ pte = pte_pfn_to_mfn(pte);
+ }
+
return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
@@ -593,6 +947,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
+ if (xen_iomap_pte(pte)) {
+ xen_set_iomap_pte(ptep, pte);
+ return;
+ }
+
ADD_STATS(pte_update, 1);
// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
@@ -609,6 +968,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
+ if (xen_iomap_pte(pte)) {
+ xen_set_iomap_pte(ptep, pte);
+ return;
+ }
+
set_64bit((u64 *)ptep, native_pte_val(pte));
}
@@ -935,8 +1299,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
- vm_unmap_aliases();
-
xen_mc_batch();
if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1428,13 +1790,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
-#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
+ unsigned long pfn = pte_pfn(pte);
+
+#ifdef CONFIG_X86_32
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
+#endif
+
+ /*
+ * If the new pfn is within the range of the newly allocated
+ * kernel pagetable, and it isn't being mapped into an
+ * early_ioremap fixmap slot, make sure it is RO.
+ */
+ if (!is_early_ioremap_ptep(ptep) &&
+ pfn >= e820_table_start && pfn < e820_table_end)
+ pte = pte_wrprotect(pte);
return pte;
}
@@ -1447,7 +1821,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
xen_set_pte(ptep, pte);
}
-#endif
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
@@ -1500,7 +1873,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
- vm_unmap_aliases();
if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
@@ -1603,6 +1975,7 @@ static void *m2v(phys_addr_t maddr)
return __ka(m2p(maddr));
}
+/* Set the page permissions on an identity-mapped pages */
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1618,6 +1991,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
unsigned ident_pte;
unsigned long pfn;
+ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+ PAGE_SIZE);
+
ident_pte = 0;
pfn = 0;
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1628,7 +2004,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
pte_page = m2v(pmd[pmdidx].pmd);
else {
/* Check for free pte pages */
- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+ if (ident_pte == LEVEL1_IDENT_ENTRIES)
break;
pte_page = &level1_ident_pgt[ident_pte];
@@ -1735,7 +2111,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
- reserve_early(__pa(xen_start_info->pt_base),
+ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
@@ -1743,13 +2119,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
return pgd;
}
#else /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
+ level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE);
+
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
@@ -1773,7 +2151,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
- reserve_early(__pa(xen_start_info->pt_base),
+ memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
@@ -1782,6 +2160,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
}
#endif /* CONFIG_X86_64 */
+static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
+
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
pte_t pte;
@@ -1802,18 +2182,38 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- case FIX_APIC_BASE: /* maps dummy local APIC */
-#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
- default:
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
pte = mfn_pte(phys, prot);
break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ break;
}
__native_set_fixmap(idx, pte);
@@ -1828,6 +2228,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
+__init void xen_ident_map_ISA(void)
+{
+ unsigned long pa;
+
+ /*
+ * If we're dom0, then linear map the ISA machine addresses into
+ * the kernel's address space.
+ */
+ if (!xen_initial_domain())
+ return;
+
+ xen_raw_printk("Xen: setup ISA identity maps\n");
+
+ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
+ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
+
+ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
+ BUG();
+ }
+
+ xen_flush_tlb();
+}
+
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
@@ -1883,14 +2306,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pmd_init,
- .alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pmd_init,
-#ifdef CONFIG_X86_64
- .set_pte = xen_set_pte,
-#else
.set_pte = xen_set_pte_init,
-#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
@@ -1939,7 +2357,307 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
+
+ vmap_lazy_unmap = false;
+
+ memset(dummy_mapping, 0xff, PAGE_SIZE);
+}
+
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out,
+ unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
+}
+
+int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
+ unsigned int address_bits)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+}
+#endif
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ unsigned long mfn;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
+
+ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long mfn, int nr,
+ pgprot_t prot, unsigned domid)
+{
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ int batch;
+ unsigned long range;
+ int err = 0;
+
+ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+ rmd.mfn = mfn;
+ rmd.prot = prot;
+
+ while (nr) {
+ batch = min(REMAP_BATCH_SIZE, nr);
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_mfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ err = -EFAULT;
+ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
+ goto out;
+
+ nr -= batch;
+ addr += range;
+ }
+
+ err = 0;
+out:
+
+ flush_tlb_all();
+
+ return err;
}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
#ifdef CONFIG_XEN_DEBUG_FS
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 5fe6bc7f5ecf..537bb9aab777 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,7 +12,6 @@ enum pt_level {
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
@@ -60,4 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
unsigned long xen_read_cr2_direct(void);
extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 000000000000..bfd0632fe65e
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,67 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <asm/iommu_table.h>
+
+int xen_swiotlb __read_mostly;
+
+static struct dma_map_ops xen_swiotlb_dma_ops = {
+ .mapping_error = xen_swiotlb_dma_mapping_error,
+ .alloc_coherent = xen_swiotlb_alloc_coherent,
+ .free_coherent = xen_swiotlb_free_coherent,
+ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
+ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
+ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
+ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
+ .map_sg = xen_swiotlb_map_sg_attrs,
+ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
+ .map_page = xen_swiotlb_map_page,
+ .unmap_page = xen_swiotlb_unmap_page,
+ .dma_supported = xen_swiotlb_dma_supported,
+};
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use xen_swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+ /* If running as PV guest, either iommu=soft, or swiotlb=force will
+ * activate this IOMMU. If running as PV privileged, activate it
+ * irregardlesss.
+ */
+ if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
+ (xen_pv_domain()))
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB.
+ * Don't worry about swiotlb_force flag activating the native, as
+ * the 'swiotlb' flag is the only one turning it on. */
+ if (xen_pv_domain())
+ swiotlb = 0;
+
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1);
+ dma_ops = &xen_swiotlb_dma_ops;
+
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+ }
+}
+IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
+ 0,
+ pci_xen_swiotlb_init,
+ 0);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 000000000000..0f456386cce5
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <xen/platform_pci.h>
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+/* store the value of xen_emul_unplug after the unplug is done */
+int xen_platform_pci_unplug;
+EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
+#ifdef CONFIG_XEN_PVHVM
+static int xen_emul_unplug;
+
+static int __init check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+void __init xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* user explicitly requested no unplug */
+ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+ return;
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the host does
+ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+ * but the user told us that unplugging is unnecessary. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "unnecessary", l))
+ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+ else if (!strncmp(p, "never", l))
+ xen_emul_unplug |= XEN_UNPLUG_NEVER;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index ad0047f47cd4..b1dbdaa23ecc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
+#include <linux/memblock.h>
#include <asm/elf.h>
#include <asm/vdso.h>
@@ -17,9 +18,12 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
+#include <xen/interface/memory.h>
#include <xen/features.h>
#include "xen-ops.h"
@@ -32,25 +36,177 @@ extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
+/* Amount of extra memory space we add to the e820 ranges */
+phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+static __init void xen_add_extra_mem(unsigned long pages)
+{
+ u64 size = (u64)pages * PAGE_SIZE;
+ u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
+
+ if (!pages)
+ return;
+
+ e820_add_region(extra_start, size, E820_RAM);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+ memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
+
+ xen_extra_mem_size += size;
+
+ xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+}
+
+static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
+ phys_addr_t end_addr)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long start, end;
+ unsigned long len = 0;
+ unsigned long pfn;
+ int ret;
+
+ start = PFN_UP(start_addr);
+ end = PFN_DOWN(end_addr);
+
+ if (end <= start)
+ return 0;
+
+ printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
+ start, end);
+ for(pfn = start; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+ &reservation);
+ WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
+ start, end, ret);
+ if (ret == 1) {
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ len++;
+ }
+ }
+ printk(KERN_CONT "%ld pages freed\n", len);
+
+ return len;
+}
+
+static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
+ const struct e820map *e820)
+{
+ phys_addr_t max_addr = PFN_PHYS(max_pfn);
+ phys_addr_t last_end = 0;
+ unsigned long released = 0;
+ int i;
+
+ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
+ phys_addr_t end = e820->map[i].addr;
+ end = min(max_addr, end);
+
+ released += xen_release_chunk(last_end, end);
+ last_end = e820->map[i].addr + e820->map[i].size;
+ }
+
+ if (last_end < max_addr)
+ released += xen_release_chunk(last_end, max_addr);
+
+ printk(KERN_INFO "released %ld pages of unused memory\n", released);
+ return released;
+}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
-
char * __init xen_memory_setup(void)
{
+ static struct e820entry map[E820MAX] __initdata;
+
unsigned long max_pfn = xen_start_info->nr_pages;
+ unsigned long long mem_end;
+ int rc;
+ struct xen_memory_map memmap;
+ unsigned long extra_pages = 0;
+ unsigned long extra_limit;
+ int i;
+ int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ mem_end = PFN_PHYS(max_pfn);
+
+ memmap.nr_entries = E820MAX;
+ set_xen_guest_handle(memmap.buffer, map);
+
+ op = xen_initial_domain() ?
+ XENMEM_machine_memory_map :
+ XENMEM_memory_map;
+ rc = HYPERVISOR_memory_op(op, &memmap);
+ if (rc == -ENOSYS) {
+ memmap.nr_entries = 1;
+ map[0].addr = 0ULL;
+ map[0].size = mem_end;
+ /* 8MB slack (to balance backend allocations). */
+ map[0].size += 8ULL << 20;
+ map[0].type = E820_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
e820.nr_map = 0;
+ xen_extra_mem_start = mem_end;
+ for (i = 0; i < memmap.nr_entries; i++) {
+ unsigned long long end = map[i].addr + map[i].size;
+
+ if (map[i].type == E820_RAM) {
+ if (map[i].addr < mem_end && end > mem_end) {
+ /* Truncate region to max_mem. */
+ u64 delta = end - mem_end;
- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
+ map[i].size -= delta;
+ extra_pages += PFN_DOWN(delta);
+
+ end = mem_end;
+ }
+ }
+
+ if (end > xen_extra_mem_start)
+ xen_extra_mem_start = end;
+
+ /* If region is non-RAM or below mem_end, add what remains */
+ if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
+ map[i].size > 0)
+ e820_add_region(map[i].addr, map[i].size, map[i].type);
+ }
/*
* Even though this is normal, usable memory under Xen, reserve
* ISA memory anyway because too many things think they can poke
* about in there.
+ *
+ * In a dom0 kernel, this region is identity mapped with the
+ * hardware ISA area, so it really is out of bounds.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
@@ -61,12 +217,36 @@ char * __init xen_memory_setup(void)
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
- reserve_early(__pa(xen_start_info->mfn_list),
+ memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
__pa(xen_start_info->pt_base),
"XEN START INFO");
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
+
+ /*
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+ * factor the base size. On non-highmem systems, the base
+ * size is the full initial memory allocation; on highmem it
+ * is limited to the max size of lowmem, so that it doesn't
+ * get completely filled.
+ *
+ * In principle there could be a problem in lowmem systems if
+ * the initial memory is also very large with respect to
+ * lowmem, but we won't try to deal with that here.
+ */
+ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ max_pfn + extra_pages);
+
+ if (extra_limit >= max_pfn)
+ extra_pages = extra_limit - max_pfn;
+ else
+ extra_pages = 0;
+
+ if (!xen_initial_domain())
+ xen_add_extra_mem(extra_pages);
+
return "Xen";
}
@@ -156,6 +336,8 @@ void __init xen_arch_setup(void)
struct physdev_set_iopl set_iopl;
int rc;
+ xen_panic_handler_init();
+
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
@@ -188,7 +370,5 @@ void __init xen_arch_setup(void)
pm_idle = xen_idle;
- paravirt_disable_iospace();
-
fiddle_vdso();
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index a29693fd3138..72a4c7959045 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -28,6 +28,7 @@
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
@@ -156,6 +157,9 @@ static void __init xen_fill_possible_map(void)
{
int i, rc;
+ if (xen_initial_domain())
+ return;
+
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
@@ -165,6 +169,27 @@ static void __init xen_fill_possible_map(void)
}
}
+static void __init xen_filter_cpu_maps(void)
+{
+ int i, rc;
+
+ if (!xen_initial_domain())
+ return;
+
+ num_processors = 0;
+ disabled_cpus = 0;
+ for (i = 0; i < nr_cpu_ids; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ } else {
+ set_cpu_possible(i, false);
+ set_cpu_present(i, false);
+ }
+ }
+}
+
static void __init xen_smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != 0);
@@ -174,6 +199,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
+ xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
}
@@ -394,13 +420,15 @@ static void stop_self(void *v)
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
+ set_cpu_online(cpu, false);
+
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
-static void xen_smp_send_stop(void)
+static void xen_stop_other_cpus(int wait)
{
- smp_call_function(stop_self, NULL, 0);
+ smp_call_function(stop_self, NULL, wait);
}
static void xen_smp_send_reschedule(int cpu)
@@ -468,7 +496,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
.cpu_disable = xen_cpu_disable,
.play_dead = xen_play_dead,
- .smp_send_stop = xen_smp_send_stop,
+ .stop_other_cpus = xen_stop_other_cpus,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e0500646585d..23e061b9327b 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab
goto out;
}
- flags = __raw_local_save_flags();
+ flags = arch_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index a9c661108034..1d789d56877c 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
BUG();
}
+void xen_hvm_post_suspend(int suspend_cancelled)
+{
+ int cpu;
+ xen_hvm_init_shared_info();
+ xen_callback_vector();
+ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ for_each_online_cpu(cpu) {
+ xen_setup_runstate_info(cpu);
+ }
+ }
+}
+
void xen_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b3c6c59ed302..b2bb5aa3b054 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/xen/hypercall.h>
#include <xen/events.h>
+#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
@@ -155,47 +156,8 @@ static void do_stolen_accounting(void)
account_idle_ticks(ticks);
}
-/*
- * Xen sched_clock implementation. Returns the number of unstolen
- * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
- * states.
- */
-unsigned long long xen_sched_clock(void)
-{
- struct vcpu_runstate_info state;
- cycle_t now;
- u64 ret;
- s64 offset;
-
- /*
- * Ideally sched_clock should be called on a per-cpu basis
- * anyway, so preempt should already be disabled, but that's
- * not current practice at the moment.
- */
- preempt_disable();
-
- now = xen_clocksource_read();
-
- get_runstate_snapshot(&state);
-
- WARN_ON(state.state != RUNSTATE_running);
-
- offset = now - state.state_entry_time;
- if (offset < 0)
- offset = 0;
-
- ret = state.time[RUNSTATE_blocked] +
- state.time[RUNSTATE_running] +
- offset;
-
- preempt_enable();
-
- return ret;
-}
-
-
/* Get the TSC speed from Xen */
-unsigned long xen_tsc_khz(void)
+static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
@@ -230,7 +192,7 @@ static void xen_read_wallclock(struct timespec *ts)
put_cpu_var(xen_vcpu);
}
-unsigned long xen_get_wallclock(void)
+static unsigned long xen_get_wallclock(void)
{
struct timespec ts;
@@ -238,7 +200,7 @@ unsigned long xen_get_wallclock(void)
return ts.tv_sec;
}
-int xen_set_wallclock(unsigned long now)
+static int xen_set_wallclock(unsigned long now)
{
/* do nothing for domU */
return -1;
@@ -473,7 +435,11 @@ void xen_timer_resume(void)
}
}
-__init void xen_time_init(void)
+static const struct pv_time_ops xen_time_ops __initdata = {
+ .sched_clock = xen_clocksource_read,
+};
+
+static __init void xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
@@ -497,3 +463,48 @@ __init void xen_time_init(void)
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
+
+__init void xen_init_time_ops(void)
+{
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+}
+
+__init void xen_hvm_init_time_ops(void)
+{
+ /* vector callback is needed otherwise we cannot receive interrupts
+ * on cpu > 0 and at this point we don't know how many cpus are
+ * available */
+ if (!xen_have_vector_callback)
+ return;
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
+ "disable pv timer\n");
+ return;
+ }
+
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f9153a300bce..64044747348e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
+
+void xen_set_pat(u64);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
@@ -38,6 +41,10 @@ void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void __init xen_unplug_emulated_devices(void);
+
void __init xen_build_dynamic_phys_to_machine(void);
void xen_init_irq_ops(void);
@@ -46,11 +53,8 @@ void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
-unsigned long xen_tsc_khz(void);
-void __init xen_time_init(void);
-unsigned long xen_get_wallclock(void);
-int xen_set_wallclock(unsigned long time);
-unsigned long long xen_sched_clock(void);
+void __init xen_init_time_ops(void);
+void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@@ -101,4 +105,6 @@ void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
+extern int xen_panic_handler_init(void);
+
#endif /* XEN_OPS_H */