summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar2008-12-31 08:31:57 +0100
committerIngo Molnar2008-12-31 08:31:57 +0100
commita9de18eb761f7c1c860964b2e5addc1a35c7e861 (patch)
tree886e75fdfd09690cd262ca69cb7f5d1d42b48602 /arch/x86
parentMerge branch 'linus' into stackprotector (diff)
parentMerge branch 'for-linus' of git://oss.sgi.com/xfs/xfs (diff)
downloadkernel-qcow2-linux-a9de18eb761f7c1c860964b2e5addc1a35c7e861.tar.gz
kernel-qcow2-linux-a9de18eb761f7c1c860964b2e5addc1a35c7e861.tar.xz
kernel-qcow2-linux-a9de18eb761f7c1c860964b2e5addc1a35c7e861.zip
Merge branch 'linus' into stackprotector
Conflicts: arch/x86/include/asm/pda.h kernel/fork.c
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig174
-rw-r--r--arch/x86/Kconfig.cpu25
-rw-r--r--arch/x86/Kconfig.debug24
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/boot/compressed/.gitignore2
-rw-r--r--arch/x86/boot/compressed/misc.c2
-rw-r--r--arch/x86/boot/tty.c2
-rw-r--r--arch/x86/boot/video-bios.c4
-rw-r--r--arch/x86/boot/video-vesa.c13
-rw-r--r--arch/x86/boot/video-vga.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/configs/i386_defconfig5
-rw-r--r--arch/x86/configs/x86_64_defconfig4
-rw-r--r--arch/x86/crypto/crc32c-intel.c121
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c109
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/ia32/sys_ia32.c99
-rw-r--r--arch/x86/include/asm/Kbuild24
-rw-r--r--arch/x86/include/asm/a.out-core.h73
-rw-r--r--arch/x86/include/asm/a.out.h20
-rw-r--r--arch/x86/include/asm/acpi.h177
-rw-r--r--arch/x86/include/asm/agp.h35
-rw-r--r--arch/x86/include/asm/alternative-asm.h22
-rw-r--r--arch/x86/include/asm/alternative.h183
-rw-r--r--arch/x86/include/asm/amd_iommu.h35
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h404
-rw-r--r--arch/x86/include/asm/apic.h200
-rw-r--r--arch/x86/include/asm/apicdef.h417
-rw-r--r--arch/x86/include/asm/arch_hooks.h26
-rw-r--r--arch/x86/include/asm/asm.h47
-rw-r--r--arch/x86/include/asm/atomic.h5
-rw-r--r--arch/x86/include/asm/atomic_32.h259
-rw-r--r--arch/x86/include/asm/atomic_64.h473
-rw-r--r--arch/x86/include/asm/auxvec.h12
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h137
-rw-r--r--arch/x86/include/asm/bigsmp/apicdef.h13
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h25
-rw-r--r--arch/x86/include/asm/bios_ebda.h36
-rw-r--r--arch/x86/include/asm/bitops.h459
-rw-r--r--arch/x86/include/asm/boot.h26
-rw-r--r--arch/x86/include/asm/bootparam.h111
-rw-r--r--arch/x86/include/asm/bug.h39
-rw-r--r--arch/x86/include/asm/bugs.h12
-rw-r--r--arch/x86/include/asm/byteorder.h65
-rw-r--r--arch/x86/include/asm/cache.h20
-rw-r--r--arch/x86/include/asm/cacheflush.h118
-rw-r--r--arch/x86/include/asm/calgary.h72
-rw-r--r--arch/x86/include/asm/calling.h170
-rw-r--r--arch/x86/include/asm/checksum.h5
-rw-r--r--arch/x86/include/asm/checksum_32.h189
-rw-r--r--arch/x86/include/asm/checksum_64.h191
-rw-r--r--arch/x86/include/asm/cmpxchg.h5
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h344
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h185
-rw-r--r--arch/x86/include/asm/compat.h218
-rw-r--r--arch/x86/include/asm/cpu.h20
-rw-r--r--arch/x86/include/asm/cpufeature.h274
-rw-r--r--arch/x86/include/asm/cputime.h1
-rw-r--r--arch/x86/include/asm/current.h39
-rw-r--r--arch/x86/include/asm/debugreg.h70
-rw-r--r--arch/x86/include/asm/delay.h31
-rw-r--r--arch/x86/include/asm/desc.h409
-rw-r--r--arch/x86/include/asm/desc_defs.h95
-rw-r--r--arch/x86/include/asm/device.h16
-rw-r--r--arch/x86/include/asm/div64.h60
-rw-r--r--arch/x86/include/asm/dma-mapping.h306
-rw-r--r--arch/x86/include/asm/dma.h318
-rw-r--r--arch/x86/include/asm/dmi.h26
-rw-r--r--arch/x86/include/asm/ds.h272
-rw-r--r--arch/x86/include/asm/dwarf2.h96
-rw-r--r--arch/x86/include/asm/e820.h146
-rw-r--r--arch/x86/include/asm/edac.h18
-rw-r--r--arch/x86/include/asm/efi.h110
-rw-r--r--arch/x86/include/asm/elf.h336
-rw-r--r--arch/x86/include/asm/emergency-restart.h20
-rw-r--r--arch/x86/include/asm/errno.h1
-rw-r--r--arch/x86/include/asm/es7000/apic.h220
-rw-r--r--arch/x86/include/asm/es7000/apicdef.h13
-rw-r--r--arch/x86/include/asm/es7000/ipi.h24
-rw-r--r--arch/x86/include/asm/es7000/mpparse.h30
-rw-r--r--arch/x86/include/asm/es7000/wakecpu.h37
-rw-r--r--arch/x86/include/asm/fb.h21
-rw-r--r--arch/x86/include/asm/fcntl.h1
-rw-r--r--arch/x86/include/asm/fixmap.h72
-rw-r--r--arch/x86/include/asm/fixmap_32.h119
-rw-r--r--arch/x86/include/asm/fixmap_64.h83
-rw-r--r--arch/x86/include/asm/floppy.h281
-rw-r--r--arch/x86/include/asm/frame.h27
-rw-r--r--arch/x86/include/asm/ftrace.h83
-rw-r--r--arch/x86/include/asm/futex.h140
-rw-r--r--arch/x86/include/asm/gart.h106
-rw-r--r--arch/x86/include/asm/genapic.h5
-rw-r--r--arch/x86/include/asm/genapic_32.h143
-rw-r--r--arch/x86/include/asm/genapic_64.h60
-rw-r--r--arch/x86/include/asm/geode.h253
-rw-r--r--arch/x86/include/asm/gpio.h56
-rw-r--r--arch/x86/include/asm/hardirq.h11
-rw-r--r--arch/x86/include/asm/hardirq_32.h30
-rw-r--r--arch/x86/include/asm/hardirq_64.h25
-rw-r--r--arch/x86/include/asm/highmem.h79
-rw-r--r--arch/x86/include/asm/hpet.h114
-rw-r--r--arch/x86/include/asm/hugetlb.h93
-rw-r--r--arch/x86/include/asm/hw_irq.h129
-rw-r--r--arch/x86/include/asm/hypertransport.h45
-rw-r--r--arch/x86/include/asm/hypervisor.h26
-rw-r--r--arch/x86/include/asm/i387.h400
-rw-r--r--arch/x86/include/asm/i8253.h18
-rw-r--r--arch/x86/include/asm/i8259.h63
-rw-r--r--arch/x86/include/asm/ia32.h152
-rw-r--r--arch/x86/include/asm/ia32_unistd.h18
-rw-r--r--arch/x86/include/asm/idle.h21
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io.h110
-rw-r--r--arch/x86/include/asm/io_32.h284
-rw-r--r--arch/x86/include/asm/io_64.h242
-rw-r--r--arch/x86/include/asm/io_apic.h211
-rw-r--r--arch/x86/include/asm/ioctl.h1
-rw-r--r--arch/x86/include/asm/ioctls.h94
-rw-r--r--arch/x86/include/asm/iomap.h30
-rw-r--r--arch/x86/include/asm/iommu.h13
-rw-r--r--arch/x86/include/asm/ipcbuf.h28
-rw-r--r--arch/x86/include/asm/ipi.h138
-rw-r--r--arch/x86/include/asm/irq.h46
-rw-r--r--arch/x86/include/asm/irq_regs.h5
-rw-r--r--arch/x86/include/asm/irq_regs_32.h31
-rw-r--r--arch/x86/include/asm/irq_regs_64.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h8
-rw-r--r--arch/x86/include/asm/irq_vectors.h167
-rw-r--r--arch/x86/include/asm/irqflags.h211
-rw-r--r--arch/x86/include/asm/ist.h34
-rw-r--r--arch/x86/include/asm/k8.h15
-rw-r--r--arch/x86/include/asm/kdebug.h37
-rw-r--r--arch/x86/include/asm/kexec.h176
-rw-r--r--arch/x86/include/asm/kgdb.h79
-rw-r--r--arch/x86/include/asm/kmap_types.h29
-rw-r--r--arch/x86/include/asm/kprobes.h88
-rw-r--r--arch/x86/include/asm/kvm.h211
-rw-r--r--arch/x86/include/asm/kvm_host.h755
-rw-r--r--arch/x86/include/asm/kvm_para.h147
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h184
-rw-r--r--arch/x86/include/asm/ldt.h40
-rw-r--r--arch/x86/include/asm/lguest.h94
-rw-r--r--arch/x86/include/asm/lguest_hcall.h71
-rw-r--r--arch/x86/include/asm/linkage.h121
-rw-r--r--arch/x86/include/asm/local.h235
-rw-r--r--arch/x86/include/asm/mach-default/apm.h73
-rw-r--r--arch/x86/include/asm/mach-default/do_timer.h16
-rw-r--r--arch/x86/include/asm/mach-default/entry_arch.h36
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h158
-rw-r--r--arch/x86/include/asm/mach-default/mach_apicdef.h24
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h64
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpparse.h17
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-default/mach_timer.h48
-rw-r--r--arch/x86/include/asm/mach-default/mach_traps.h33
-rw-r--r--arch/x86/include/asm/mach-default/mach_wakecpu.h35
-rw-r--r--arch/x86/include/asm/mach-default/pci-functions.h19
-rw-r--r--arch/x86/include/asm/mach-default/setup_arch.h3
-rw-r--r--arch/x86/include/asm/mach-default/smpboot_hooks.h61
-rw-r--r--arch/x86/include/asm/mach-generic/gpio.h15
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h34
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apicdef.h11
-rw-r--r--arch/x86/include/asm/mach-generic/mach_ipi.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpparse.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-generic/mach_wakecpu.h12
-rw-r--r--arch/x86/include/asm/mach-rdc321x/gpio.h60
-rw-r--r--arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h12
-rw-r--r--arch/x86/include/asm/mach-voyager/do_timer.h17
-rw-r--r--arch/x86/include/asm/mach-voyager/entry_arch.h26
-rw-r--r--arch/x86/include/asm/mach-voyager/setup_arch.h12
-rw-r--r--arch/x86/include/asm/math_emu.h31
-rw-r--r--arch/x86/include/asm/mc146818rtc.h104
-rw-r--r--arch/x86/include/asm/mca.h43
-rw-r--r--arch/x86/include/asm/mca_dma.h201
-rw-r--r--arch/x86/include/asm/mce.h130
-rw-r--r--arch/x86/include/asm/microcode.h47
-rw-r--r--arch/x86/include/asm/mman.h20
-rw-r--r--arch/x86/include/asm/mmconfig.h12
-rw-r--r--arch/x86/include/asm/mmu.h26
-rw-r--r--arch/x86/include/asm/mmu_context.h37
-rw-r--r--arch/x86/include/asm/mmu_context_32.h55
-rw-r--r--arch/x86/include/asm/mmu_context_64.h54
-rw-r--r--arch/x86/include/asm/mmx.h14
-rw-r--r--arch/x86/include/asm/mmzone.h5
-rw-r--r--arch/x86/include/asm/mmzone_32.h138
-rw-r--r--arch/x86/include/asm/mmzone_64.h51
-rw-r--r--arch/x86/include/asm/module.h80
-rw-r--r--arch/x86/include/asm/mpspec.h145
-rw-r--r--arch/x86/include/asm/mpspec_def.h180
-rw-r--r--arch/x86/include/asm/msgbuf.h39
-rw-r--r--arch/x86/include/asm/msidef.h55
-rw-r--r--arch/x86/include/asm/msr-index.h334
-rw-r--r--arch/x86/include/asm/msr.h246
-rw-r--r--arch/x86/include/asm/mtrr.h173
-rw-r--r--arch/x86/include/asm/mutex.h5
-rw-r--r--arch/x86/include/asm/mutex_32.h125
-rw-r--r--arch/x86/include/asm/mutex_64.h100
-rw-r--r--arch/x86/include/asm/nmi.h81
-rw-r--r--arch/x86/include/asm/nops.h118
-rw-r--r--arch/x86/include/asm/numa.h5
-rw-r--r--arch/x86/include/asm/numa_32.h11
-rw-r--r--arch/x86/include/asm/numa_64.h43
-rw-r--r--arch/x86/include/asm/numaq.h169
-rw-r--r--arch/x86/include/asm/numaq/apic.h136
-rw-r--r--arch/x86/include/asm/numaq/apicdef.h14
-rw-r--r--arch/x86/include/asm/numaq/ipi.h25
-rw-r--r--arch/x86/include/asm/numaq/mpparse.h7
-rw-r--r--arch/x86/include/asm/numaq/wakecpu.h45
-rw-r--r--arch/x86/include/asm/olpc.h132
-rw-r--r--arch/x86/include/asm/page.h209
-rw-r--r--arch/x86/include/asm/page_32.h136
-rw-r--r--arch/x86/include/asm/page_64.h105
-rw-r--r--arch/x86/include/asm/param.h22
-rw-r--r--arch/x86/include/asm/paravirt.h1650
-rw-r--r--arch/x86/include/asm/parport.h10
-rw-r--r--arch/x86/include/asm/pat.h22
-rw-r--r--arch/x86/include/asm/pci-direct.h21
-rw-r--r--arch/x86/include/asm/pci.h118
-rw-r--r--arch/x86/include/asm/pci_32.h34
-rw-r--r--arch/x86/include/asm/pci_64.h51
-rw-r--r--arch/x86/include/asm/pda.h137
-rw-r--r--arch/x86/include/asm/percpu.h218
-rw-r--r--arch/x86/include/asm/pgalloc.h114
-rw-r--r--arch/x86/include/asm/pgtable-2level-defs.h20
-rw-r--r--arch/x86/include/asm/pgtable-2level.h111
-rw-r--r--arch/x86/include/asm/pgtable-3level-defs.h28
-rw-r--r--arch/x86/include/asm/pgtable-3level.h176
-rw-r--r--arch/x86/include/asm/pgtable.h578
-rw-r--r--arch/x86/include/asm/pgtable_32.h182
-rw-r--r--arch/x86/include/asm/pgtable_64.h291
-rw-r--r--arch/x86/include/asm/poll.h1
-rw-r--r--arch/x86/include/asm/posix_types.h13
-rw-r--r--arch/x86/include/asm/posix_types_32.h85
-rw-r--r--arch/x86/include/asm/posix_types_64.h119
-rw-r--r--arch/x86/include/asm/prctl.h13
-rw-r--r--arch/x86/include/asm/processor-cyrix.h38
-rw-r--r--arch/x86/include/asm/processor-flags.h100
-rw-r--r--arch/x86/include/asm/processor.h953
-rw-r--r--arch/x86/include/asm/proto.h32
-rw-r--r--arch/x86/include/asm/ptrace-abi.h145
-rw-r--r--arch/x86/include/asm/ptrace.h249
-rw-r--r--arch/x86/include/asm/pvclock-abi.h42
-rw-r--r--arch/x86/include/asm/pvclock.h14
-rw-r--r--arch/x86/include/asm/reboot.h26
-rw-r--r--arch/x86/include/asm/reboot_fixups.h6
-rw-r--r--arch/x86/include/asm/required-features.h82
-rw-r--r--arch/x86/include/asm/resource.h1
-rw-r--r--arch/x86/include/asm/resume-trace.h21
-rw-r--r--arch/x86/include/asm/rio.h63
-rw-r--r--arch/x86/include/asm/rtc.h1
-rw-r--r--arch/x86/include/asm/rwlock.h8
-rw-r--r--arch/x86/include/asm/rwsem.h265
-rw-r--r--arch/x86/include/asm/scatterlist.h33
-rw-r--r--arch/x86/include/asm/seccomp.h5
-rw-r--r--arch/x86/include/asm/seccomp_32.h17
-rw-r--r--arch/x86/include/asm/seccomp_64.h25
-rw-r--r--arch/x86/include/asm/sections.h1
-rw-r--r--arch/x86/include/asm/segment.h209
-rw-r--r--arch/x86/include/asm/sembuf.h24
-rw-r--r--arch/x86/include/asm/serial.h29
-rw-r--r--arch/x86/include/asm/setup.h112
-rw-r--r--arch/x86/include/asm/shmbuf.h51
-rw-r--r--arch/x86/include/asm/shmparam.h6
-rw-r--r--arch/x86/include/asm/sigcontext.h284
-rw-r--r--arch/x86/include/asm/sigcontext32.h75
-rw-r--r--arch/x86/include/asm/sigframe.h70
-rw-r--r--arch/x86/include/asm/siginfo.h10
-rw-r--r--arch/x86/include/asm/signal.h264
-rw-r--r--arch/x86/include/asm/smp.h235
-rw-r--r--arch/x86/include/asm/socket.h57
-rw-r--r--arch/x86/include/asm/sockios.h13
-rw-r--r--arch/x86/include/asm/sparsemem.h34
-rw-r--r--arch/x86/include/asm/spinlock.h364
-rw-r--r--arch/x86/include/asm/spinlock_types.h20
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h21
-rw-r--r--arch/x86/include/asm/stat.h114
-rw-r--r--arch/x86/include/asm/statfs.h12
-rw-r--r--arch/x86/include/asm/string.h5
-rw-r--r--arch/x86/include/asm/string_32.h326
-rw-r--r--arch/x86/include/asm/string_64.h60
-rw-r--r--arch/x86/include/asm/summit/apic.h184
-rw-r--r--arch/x86/include/asm/summit/apicdef.h13
-rw-r--r--arch/x86/include/asm/summit/ipi.h25
-rw-r--r--arch/x86/include/asm/summit/mpparse.h109
-rw-r--r--arch/x86/include/asm/suspend.h5
-rw-r--r--arch/x86/include/asm/suspend_32.h51
-rw-r--r--arch/x86/include/asm/suspend_64.h52
-rw-r--r--arch/x86/include/asm/swiotlb.h58
-rw-r--r--arch/x86/include/asm/sync_bitops.h130
-rw-r--r--arch/x86/include/asm/syscall.h213
-rw-r--r--arch/x86/include/asm/syscalls.h93
-rw-r--r--arch/x86/include/asm/system.h431
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/tce.h48
-rw-r--r--arch/x86/include/asm/termbits.h198
-rw-r--r--arch/x86/include/asm/termios.h113
-rw-r--r--arch/x86/include/asm/therm_throt.h9
-rw-r--r--arch/x86/include/asm/thread_info.h263
-rw-r--r--arch/x86/include/asm/time.h63
-rw-r--r--arch/x86/include/asm/timer.h66
-rw-r--r--arch/x86/include/asm/timex.h19
-rw-r--r--arch/x86/include/asm/tlb.h11
-rw-r--r--arch/x86/include/asm/tlbflush.h178
-rw-r--r--arch/x86/include/asm/topology.h259
-rw-r--r--arch/x86/include/asm/trampoline.h28
-rw-r--r--arch/x86/include/asm/traps.h88
-rw-r--r--arch/x86/include/asm/tsc.h62
-rw-r--r--arch/x86/include/asm/types.h36
-rw-r--r--arch/x86/include/asm/uaccess.h456
-rw-r--r--arch/x86/include/asm/uaccess_32.h218
-rw-r--r--arch/x86/include/asm/uaccess_64.h208
-rw-r--r--arch/x86/include/asm/ucontext.h18
-rw-r--r--arch/x86/include/asm/unaligned.h14
-rw-r--r--arch/x86/include/asm/unistd.h13
-rw-r--r--arch/x86/include/asm/unistd_32.h379
-rw-r--r--arch/x86/include/asm/unistd_64.h693
-rw-r--r--arch/x86/include/asm/unwind.h13
-rw-r--r--arch/x86/include/asm/user.h5
-rw-r--r--arch/x86/include/asm/user32.h70
-rw-r--r--arch/x86/include/asm/user_32.h131
-rw-r--r--arch/x86/include/asm/user_64.h137
-rw-r--r--arch/x86/include/asm/uv/bios.h120
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h332
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h404
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h36
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1295
-rw-r--r--arch/x86/include/asm/vdso.h47
-rw-r--r--arch/x86/include/asm/vga.h20
-rw-r--r--arch/x86/include/asm/vgtod.h29
-rw-r--r--arch/x86/include/asm/vic.h61
-rw-r--r--arch/x86/include/asm/visws/cobalt.h125
-rw-r--r--arch/x86/include/asm/visws/lithium.h53
-rw-r--r--arch/x86/include/asm/visws/piix4.h107
-rw-r--r--arch/x86/include/asm/visws/sgivw.h5
-rw-r--r--arch/x86/include/asm/vm86.h208
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/include/asm/voyager.h529
-rw-r--r--arch/x86/include/asm/vsyscall.h44
-rw-r--r--arch/x86/include/asm/xcr.h49
-rw-r--r--arch/x86/include/asm/xen/events.h24
-rw-r--r--arch/x86/include/asm/xen/grant_table.h7
-rw-r--r--arch/x86/include/asm/xen/hypercall.h533
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h59
-rw-r--r--arch/x86/include/asm/xen/interface.h175
-rw-r--r--arch/x86/include/asm/xen/interface_32.h97
-rw-r--r--arch/x86/include/asm/xen/interface_64.h159
-rw-r--r--arch/x86/include/asm/xen/page.h170
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/include/asm/xor_32.h888
-rw-r--r--arch/x86/include/asm/xor_64.h361
-rw-r--r--arch/x86/include/asm/xsave.h118
-rw-r--r--arch/x86/kernel/Makefile24
-rw-r--r--arch/x86/kernel/acpi/boot.c31
-rw-r--r--arch/x86/kernel/acpi/sleep.c7
-rw-r--r--arch/x86/kernel/amd_iommu.c73
-rw-r--r--arch/x86/kernel/amd_iommu_init.c16
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c)690
-rw-r--r--arch/x86/kernel/apic_64.c1848
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c6
-rw-r--r--arch/x86/kernel/bios_uv.c187
-rw-r--r--arch/x86/kernel/check.c161
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/Makefile8
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c10
-rw-r--r--arch/x86/kernel/cpu/amd.c11
-rw-r--r--arch/x86/kernel/cpu/common.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c62
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h17
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c58
-rw-r--r--arch/x86/kernel/cpu/intel.c25
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c17
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c346
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c11
-rw-r--r--arch/x86/kernel/cpu/proc.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c112
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/crash.c70
-rw-r--r--arch/x86/kernel/crash_dump_32.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/ds.c1136
-rw-r--r--arch/x86/kernel/dumpstack.c351
-rw-r--r--arch/x86/kernel/dumpstack.h39
-rw-r--r--arch/x86/kernel/dumpstack_32.c307
-rw-r--r--arch/x86/kernel/dumpstack_64.c289
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/early-quirks.c62
-rw-r--r--arch/x86/kernel/early_printk.c47
-rw-r--r--arch/x86/kernel/efi.c4
-rw-r--r--arch/x86/kernel/entry_32.S548
-rw-r--r--arch/x86/kernel/entry_64.S1486
-rw-r--r--arch/x86/kernel/es7000_32.c71
-rw-r--r--arch/x86/kernel/ftrace.c500
-rw-r--r--arch/x86/kernel/genapic_64.c4
-rw-r--r--arch/x86/kernel/genapic_flat_64.c8
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c2
-rw-r--r--arch/x86/kernel/genx2apic_phys.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c165
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/head32.c3
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/hpet.c464
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c2
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c)1960
-rw-r--r--arch/x86/kernel/io_apic_32.c2908
-rw-r--r--arch/x86/kernel/irq.c192
-rw-r--r--arch/x86/kernel/irq_32.c196
-rw-r--r--arch/x86/kernel/irq_64.c192
-rw-r--r--arch/x86/kernel/irqinit_32.c48
-rw-r--r--arch/x86/kernel/irqinit_64.c91
-rw-r--r--arch/x86/kernel/k8.c1
-rw-r--r--arch/x86/kernel/kvmclock.c32
-rw-r--r--arch/x86/kernel/machine_kexec_32.c109
-rw-r--r--arch/x86/kernel/microcode_amd.c234
-rw-r--r--arch/x86/kernel/microcode_core.c29
-rw-r--r--arch/x86/kernel/microcode_intel.c8
-rw-r--r--arch/x86/kernel/mpparse.c24
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/nmi.c58
-rw-r--r--arch/x86/kernel/numaq_32.c10
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c3
-rw-r--r--arch/x86/kernel/pci-calgary_64.c20
-rw-r--r--arch/x86/kernel/pci-dma.c24
-rw-r--r--arch/x86/kernel/pci-gart_64.c14
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c43
-rw-r--r--arch/x86/kernel/process.c35
-rw-r--r--arch/x86/kernel/process_32.c67
-rw-r--r--arch/x86/kernel/process_64.c65
-rw-r--r--arch/x86/kernel/ptrace.c432
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kernel/quirks.c5
-rw-r--r--arch/x86/kernel/reboot.c141
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S115
-rw-r--r--arch/x86/kernel/rtc.c20
-rw-r--r--arch/x86/kernel/setup.c195
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/sigframe.h42
-rw-r--r--arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c)567
-rw-r--r--arch/x86/kernel/signal_64.c516
-rw-r--r--arch/x86/kernel/smp.c31
-rw-r--r--arch/x86/kernel/smpboot.c55
-rw-r--r--arch/x86/kernel/stacktrace.c64
-rw-r--r--arch/x86/kernel/syscall_64.c4
-rw-r--r--arch/x86/kernel/time_32.c2
-rw-r--r--arch/x86/kernel/time_64.c6
-rw-r--r--arch/x86/kernel/tlb_32.c19
-rw-r--r--arch/x86/kernel/tlb_64.c7
-rw-r--r--arch/x86/kernel/tlb_uv.c6
-rw-r--r--arch/x86/kernel/trampoline.c19
-rw-r--r--arch/x86/kernel/traps.c46
-rw-r--r--arch/x86/kernel/tsc.c54
-rw-r--r--arch/x86/kernel/tsc_sync.c12
-rw-r--r--arch/x86/kernel/uv_irq.c79
-rw-r--r--arch/x86/kernel/uv_sysfs.c72
-rw-r--r--arch/x86/kernel/visws_quirks.c32
-rw-r--r--arch/x86/kernel/vmi_32.c135
-rw-r--r--arch/x86/kernel/vmiclock_32.c3
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S1
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S1
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kernel/vsyscall_64.c12
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/kernel/xsave.c4
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/Makefile5
-rw-r--r--arch/x86/kvm/i8254.c96
-rw-r--r--arch/x86/kvm/i8254.h8
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h32
-rw-r--r--arch/x86/kvm/lapic.c49
-rw-r--r--arch/x86/kvm/mmu.c683
-rw-r--r--arch/x86/kvm/paging_tmpl.h250
-rw-r--r--arch/x86/kvm/svm.c156
-rw-r--r--arch/x86/kvm/vmx.c717
-rw-r--r--arch/x86/kvm/vmx.h4
-rw-r--r--arch/x86/kvm/x86.c558
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/kvm/x86_emulate.c170
-rw-r--r--arch/x86/lguest/boot.c35
-rw-r--r--arch/x86/lguest/i386_head.S15
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/lib/usercopy_64.c4
-rw-r--r--arch/x86/mach-generic/bigsmp.c5
-rw-r--r--arch/x86/mach-generic/default.c1
-rw-r--r--arch/x86/mach-generic/es7000.c28
-rw-r--r--arch/x86/mach-generic/numaq.c14
-rw-r--r--arch/x86/mach-generic/probe.c16
-rw-r--r--arch/x86/mach-generic/summit.c15
-rw-r--r--arch/x86/mach-voyager/setup.c2
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c34
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/fault.c60
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init_32.c36
-rw-r--r--arch/x86/mm/init_64.c77
-rw-r--r--arch/x86/mm/iomap_32.c59
-rw-r--r--arch/x86/mm/ioremap.c29
-rw-r--r--arch/x86/mm/memtest.c7
-rw-r--r--arch/x86/mm/mmio-mod.c87
-rw-r--r--arch/x86/mm/numa_32.c35
-rw-r--r--arch/x86/mm/pageattr.c13
-rw-r--r--arch/x86/mm/pat.c240
-rw-r--r--arch/x86/mm/pf_in.c121
-rw-r--r--arch/x86/mm/testmmiotrace.c4
-rw-r--r--arch/x86/oprofile/backtrace.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c178
-rw-r--r--arch/x86/oprofile/op_counter.h18
-rw-r--r--arch/x86/oprofile/op_model_amd.c144
-rw-r--r--arch/x86/oprofile/op_model_p4.c32
-rw-r--r--arch/x86/oprofile/op_model_ppro.c131
-rw-r--r--arch/x86/oprofile/op_x86_model.h13
-rw-r--r--arch/x86/pci/common.c17
-rw-r--r--arch/x86/pci/direct.c4
-rw-r--r--arch/x86/pci/fixup.c25
-rw-r--r--arch/x86/pci/irq.c19
-rw-r--r--arch/x86/pci/pci.h1
-rw-r--r--arch/x86/power/hibernate_32.c4
-rw-r--r--arch/x86/scripts/strip-symbols1
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c2
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c21
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c66
-rw-r--r--arch/x86/xen/multicalls.c2
-rw-r--r--arch/x86/xen/setup.c9
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/time.c11
-rw-r--r--arch/x86/xen/xen-ops.h2
554 files changed, 50687 insertions, 12097 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e73ddc382a16..615668d87bfd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -19,6 +19,8 @@ config X86_64
config X86
def_bool y
select HAVE_AOUT if X86_32
+ select HAVE_READQ
+ select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
@@ -26,23 +28,23 @@ config X86
select HAVE_KPROBES
select ARCH_WANT_OPTIONAL_GPIOLIB
select HAVE_KRETPROBES
+ select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
- select HAVE_FTRACE
+ select HAVE_FUNCTION_TRACER
+ select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
select HAVE_ARCH_TRACEHOOK
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select USER_STACKTRACE_SUPPORT
config ARCH_DEFCONFIG
string
default "arch/x86/configs/i386_defconfig" if X86_32
default "arch/x86/configs/x86_64_defconfig" if X86_64
-
-config GENERIC_LOCKBREAK
- def_bool n
-
config GENERIC_TIME
def_bool y
@@ -90,12 +92,16 @@ config GENERIC_IOMAP
config GENERIC_BUG
def_bool y
depends on BUG
+ select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+
+config GENERIC_BUG_RELATIVE_POINTERS
+ bool
config GENERIC_HWEIGHT
def_bool y
config GENERIC_GPIO
- def_bool n
+ bool
config ARCH_MAY_HAVE_PC_FDC
def_bool y
@@ -106,12 +112,6 @@ config RWSEM_GENERIC_SPINLOCK
config RWSEM_XCHGADD_ALGORITHM
def_bool X86_XADD
-config ARCH_HAS_ILOG2_U32
- def_bool n
-
-config ARCH_HAS_ILOG2_U64
- def_bool n
-
config ARCH_HAS_CPU_IDLE_WAIT
def_bool y
@@ -125,6 +125,9 @@ config GENERIC_TIME_VSYSCALL
config ARCH_HAS_CPU_RELAX
def_bool y
+config ARCH_HAS_DEFAULT_IDLE
+ def_bool y
+
config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
@@ -173,9 +176,12 @@ config GENERIC_PENDING_IRQ
config X86_SMP
bool
depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
- select USE_GENERIC_SMP_HELPERS
default y
+config USE_GENERIC_SMP_HELPERS
+ def_bool y
+ depends on SMP
+
config X86_32_SMP
def_bool y
depends on X86_32 && SMP
@@ -203,6 +209,7 @@ config X86_TRAMPOLINE
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
+source "kernel/Kconfig.freezer"
menu "Processor type and features"
@@ -236,25 +243,43 @@ config SMP
If you don't know what to do here, say N.
+config X86_HAS_BOOT_CPU_ID
+ def_bool y
+ depends on X86_VOYAGER
+
+config SPARSE_IRQ
+ bool "Support sparse irq numbering"
+ depends on PCI_MSI || HT_IRQ
+ help
+ This enables support for sparse irqs. This is useful for distro
+ kernels that want to define a high CONFIG_NR_CPUS value but still
+ want to have low kernel memory footprint on smaller machines.
+
+ ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+ out the irq_desc[] array in a more NUMA-friendly way. )
+
+ If you don't know what to do here, say N.
+
+config NUMA_MIGRATE_IRQ_DESC
+ bool "Move irq desc when changing irq smp_affinity"
+ depends on SPARSE_IRQ && NUMA
+ default n
+ help
+ This enables moving irq_desc to cpu/node that irq will use handled.
+
+ If you don't know what to do here, say N.
+
config X86_FIND_SMP_CONFIG
def_bool y
depends on X86_MPPARSE || X86_VOYAGER
-if ACPI
config X86_MPPARSE
- def_bool y
- bool "Enable MPS table"
+ bool "Enable MPS table" if ACPI
+ default y
depends on X86_LOCAL_APIC
help
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
-endif
-
-if !ACPI
-config X86_MPPARSE
- def_bool y
- depends on X86_LOCAL_APIC
-endif
choice
prompt "Subarchitecture Type"
@@ -365,10 +390,10 @@ config X86_RDC321X
as R-8610-(G).
If you don't have one of these chips, you should say N here.
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
- depends on X86_32
+ depends on X86
help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
@@ -463,10 +488,6 @@ config X86_CYCLONE_TIMER
def_bool y
depends on X86_GENERICARCH
-config ES7000_CLUSTERED_APIC
- def_bool y
- depends on SMP && X86_ES7000 && MPENTIUMIII
-
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
@@ -480,7 +501,7 @@ config HPET_TIMER
The HPET provides a stable time base on SMP
systems, unlike the TSC, but it is more expensive to access,
as it is off-chip. You can find the HPET spec at
- <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+ <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
You can safely choose Y here. However, HPET will only be
activated if the platform and the BIOS support this feature.
@@ -567,7 +588,7 @@ config AMD_IOMMU
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
- bool
+ def_bool y if X86_64
help
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
@@ -658,6 +679,30 @@ config X86_VISWS_APIC
def_bool y
depends on X86_32 && X86_VISWS
+config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ bool "Reroute for broken boot IRQs"
+ default n
+ depends on X86_IO_APIC
+ help
+ This option enables a workaround that fixes a source of
+ spurious interrupts. This is recommended when threaded
+ interrupt handling is used on systems where the generation of
+ superfluous "boot interrupts" cannot be disabled.
+
+ Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ kernel does during interrupt handling). On chipsets where this
+ boot IRQ generation cannot be disabled, this workaround keeps
+ the original IRQ line masked so that only the equivalent "boot
+ IRQ" is delivered to the CPUs. The workaround also tells the
+ kernel to set up the IRQ handler on the boot IRQ line. In this
+ way only one interrupt is delivered to the kernel. Otherwise
+ the spurious second interrupt may cause the kernel to bring
+ down (vital) interrupt lines.
+
+ Only affects "broken" chipsets. Interrupt sharing may be
+ increased on these systems.
+
config X86_MCE
bool "Machine Check Exception"
depends on !X86_VOYAGER
@@ -758,9 +803,8 @@ config I8K
Say N otherwise.
config X86_REBOOTFIXUPS
- def_bool n
- prompt "Enable X86 board specific fixups for reboot"
- depends on X86_32 && X86
+ bool "Enable X86 board specific fixups for reboot"
+ depends on X86_32
---help---
This enables chipset and/or board specific fixups to be done
in order to get reboot to work correctly. This is only needed on
@@ -944,34 +988,48 @@ config HIGHMEM
depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
config X86_PAE
- def_bool n
- prompt "PAE (Physical Address Extension) Support"
+ bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
- select RESOURCES_64BIT
help
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
has the cost of more pagetable lookup overhead, and also
consumes more pagetable space per process.
+config ARCH_PHYS_ADDR_T_64BIT
+ def_bool X86_64 || X86_PAE
+
+config DIRECT_GBPAGES
+ bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ default y
+ depends on X86_64
+ help
+ Allow the kernel linear mapping to use 1GB pages on CPUs that
+ support it. This can improve the kernel's performance a tiny bit by
+ reducing TLB pressure. If in doubt, say "Y".
+
# Common NUMA Features
config NUMA
- bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+ bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
help
Enable NUMA (Non Uniform Memory Access) support.
+
The kernel will try to allocate memory used by a CPU on the
local memory controller of the CPU and add some more
NUMA awareness to the kernel.
- For 32-bit this is currently highly experimental and should be only
- used for kernel development. It might also cause boot failures.
- For 64-bit this is recommended on all multiprocessor Opteron systems.
- If the system is EM64T, you should say N unless your system is
- EM64T NUMA.
+ For 64-bit this is recommended if the system is Intel Core i7
+ (or later), AMD Opteron, or EM64T NUMA.
+
+ For 32-bit this is only needed on (rare) 32-bit-only platforms
+ that support NUMA topologies, such as NUMAQ / Summit, or if you
+ boot a 32-bit kernel on a 64-bit NUMA platform.
+
+ Otherwise, you should say N.
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@ -1238,8 +1296,7 @@ config X86_PAT
If unsure, say Y.
config EFI
- def_bool n
- prompt "EFI runtime service support"
+ bool "EFI runtime service support"
depends on ACPI
---help---
This enables the kernel to use EFI runtime services that are
@@ -1252,14 +1309,6 @@ config EFI
resultant kernel should continue to boot on existing non-EFI
platforms.
-config IRQBALANCE
- def_bool y
- prompt "Enable kernel irq balancing"
- depends on X86_32 && SMP && X86_IO_APIC
- help
- The default yes will allow the kernel to do irq load balancing.
- Saying no will keep the kernel from doing irq load balancing.
-
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1497,11 +1546,15 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+ depends on MEMORY_HOTPLUG
+
config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool X86_64
depends on NUMA
-menu "Power management options"
+menu "Power management and ACPI options"
depends on !X86_VOYAGER
config ARCH_HIBERNATION_HEADER
@@ -1636,19 +1689,14 @@ config APM_ALLOW_INTS
many of the newer IBM Thinkpads. If you experience hangs when you
suspend, try setting this to Y. Otherwise, say N.
-config APM_REAL_MODE_POWER_OFF
- bool "Use real mode APM BIOS call to power off"
- help
- Use real mode APM BIOS calls to switch off the computer. This is
- a work-around for a number of buggy BIOSes. Switch this option on if
- your computer crashes instead of powering off properly.
-
endif # APM
source "arch/x86/kernel/cpu/cpufreq/Kconfig"
source "drivers/cpuidle/Kconfig"
+source "drivers/idle/Kconfig"
+
endmenu
@@ -1899,6 +1947,10 @@ config SYSVIPC_COMPAT
endmenu
+config HAVE_ATOMIC_IOMAP
+ def_bool y
+ depends on X86_32
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 0b7c4a3f0651..85a78575956c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -513,19 +513,20 @@ config CPU_SUP_UMC_32
If unsure, say N.
config X86_DS
- bool "Debug Store support"
- default y
- help
- Add support for Debug Store.
- This allows the kernel to provide a memory buffer to the hardware
- to store various profiling and tracing events.
+ def_bool X86_PTRACE_BTS
+ depends on X86_DEBUGCTLMSR
+ select HAVE_HW_BRANCH_TRACER
config X86_PTRACE_BTS
- bool "ptrace interface to Branch Trace Store"
+ bool "Branch Trace Store"
default y
- depends on (X86_DS && X86_DEBUGCTLMSR)
+ depends on X86_DEBUGCTLMSR
help
- Add a ptrace interface to allow collecting an execution trace
- of the traced task.
- This collects control flow changes in a (cyclic) buffer and allows
- debuggers to fill in the gaps and show an execution trace of the debuggee.
+ This adds a ptrace interface to the hardware's branch trace store.
+
+ Debuggers may use it to collect an execution trace of the debugged
+ application in order to answer the question 'how did I get here?'.
+ Debuggers may trace user mode as well as kernel mode.
+
+ Say Y unless there is no application development on this machine
+ and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 95fe606cb9a3..28f111461ca8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -114,18 +114,6 @@ config DEBUG_RODATA
data. This is recommended so that we can catch kernel bugs sooner.
If in doubt, say "Y".
-config DIRECT_GBPAGES
- bool "Enable gbpages-mapped kernel pagetables"
- depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
- help
- Enable gigabyte pages support (if the CPU supports it). This can
- improve the kernel's performance a tiny bit by reducing TLB
- pressure.
-
- This is experimental code.
-
- If in doubt, say "N".
-
config DEBUG_RODATA_TEST
bool "Testcase for the DEBUG_RODATA feature"
depends on DEBUG_RODATA
@@ -187,14 +175,10 @@ config IOMMU_LEAK
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
-config MMIOTRACE_HOOKS
- bool
-
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on DEBUG_KERNEL && PCI
select TRACING
- select MMIOTRACE_HOOKS
help
Mmiotrace traces Memory Mapped I/O access and is meant for
debugging and reverse engineering. It is called from the ioremap
@@ -308,10 +292,10 @@ config OPTIMIZE_INLINING
developers have marked 'inline'. Doing so takes away freedom from gcc to
do what it thinks is best, which is desirable for the gcc 3.x series of
compilers. The gcc 4.x series have a rewritten inlining algorithm and
- disabling this option will generate a smaller kernel there. Hopefully
- this algorithm is so good that allowing gcc4 to make the decision can
- become the default in the future, until then this option is there to
- test gcc for this.
+ enabling this option will generate a smaller kernel there. Hopefully
+ this algorithm is so good that allowing gcc 4.x and above to make the
+ decision will become the default in the future. Until then this option
+ is there to test gcc for this.
If unsure, say N.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 58ea55ce2423..cacee981d166 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,16 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
mcore-y := arch/x86/mach-default/
# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
+mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
+mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
# default subarch .h files
-mflags-y += -Iinclude/asm-x86/mach-default
+mflags-y += -Iarch/x86/include/asm/mach-default
# 64 bit does not support subarch support - clear sub arch variables
fcore-$(CONFIG_X86_64) :=
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index be0ed065249b..63eff3b04d01 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1 +1,3 @@
relocs
+vmlinux.bin.all
+vmlinux.relocs
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 5780d361105b..da062216948a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
*/
#undef CONFIG_PARAVIRT
#ifdef CONFIG_X86_32
-#define ASM_X86__DESC_H 1
+#define _ASM_X86_DESC_H 1
#endif
#ifdef CONFIG_X86_64
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 0be77b39328a..7e8e8b25f5f6 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -74,7 +74,7 @@ static int kbd_pending(void)
{
u8 pending;
asm volatile("int $0x16; setnz %0"
- : "=rm" (pending)
+ : "=qm" (pending)
: "a" (0x0100));
return pending;
}
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 49f26aaaebc8..3fa979c9c363 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -17,7 +17,7 @@
#include "boot.h"
#include "video.h"
-__videocard video_bios;
+static __videocard video_bios;
/* Set a conventional BIOS mode */
static int set_bios_mode(u8 mode);
@@ -119,7 +119,7 @@ static int bios_probe(void)
return nmodes;
}
-__videocard video_bios =
+static __videocard video_bios =
{
.card_name = "BIOS",
.probe = bios_probe,
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 1e6fe0214c85..75115849af33 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -20,7 +20,7 @@
static struct vesa_general_info vginfo;
static struct vesa_mode_info vminfo;
-__videocard video_vesa;
+static __videocard video_vesa;
#ifndef _WAKEUP
static void vesa_store_mode_params_graphics(void);
@@ -88,14 +88,11 @@ static int vesa_probe(void)
(vminfo.memory_layout == 4 ||
vminfo.memory_layout == 6) &&
vminfo.memory_planes == 1) {
-#ifdef CONFIG_FB
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
/* Graphics mode, color, linear frame buffer
supported. Only register the mode if
if framebuffer is configured, however,
- otherwise the user will be left without a screen.
- We don't require CONFIG_FB_VESA, however, since
- some of the other framebuffer drivers can use
- this mode-setting, too. */
+ otherwise the user will be left without a screen. */
mi = GET_HEAP(struct mode_info, 1);
mi->mode = mode + VIDEO_FIRST_VESA;
mi->depth = vminfo.bpp;
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode)
if ((vminfo.mode_attr & 0x15) == 0x05) {
/* It's a supported text mode */
is_graphic = 0;
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
} else if ((vminfo.mode_attr & 0x99) == 0x99) {
/* It's a graphics mode with linear frame buffer */
is_graphic = 1;
vesa_mode |= 0x4000; /* Request linear frame buffer */
+#endif
} else {
return -1; /* Invalid mode */
}
@@ -294,7 +293,7 @@ void vesa_store_edid(void)
#endif /* not _WAKEUP */
-__videocard video_vesa =
+static __videocard video_vesa =
{
.card_name = "VESA",
.probe = vesa_probe,
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index b939cb476dec..5d4742ed4aa2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = {
{ VIDEO_80x25, 80, 25, 0 },
};
-__videocard video_vga;
+static __videocard video_vga;
/* Set basic 80x25 mode */
static u8 vga_set_basic_mode(void)
@@ -259,7 +259,7 @@ static int vga_probe(void)
return mode_count[adapter];
}
-__videocard video_vga = {
+static __videocard video_vga = {
.card_name = "VGA",
.probe = vga_probe,
.set_mode = vga_set_mode,
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 83598b23093a..3bef2c1febe9 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -226,7 +226,7 @@ static unsigned int mode_menu(void)
#ifdef CONFIG_VIDEO_RETAIN
/* Save screen content to the heap */
-struct saved_screen {
+static struct saved_screen {
int x, y;
int curx, cury;
u16 *data;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 52d0359719d7..b30a08ed8eb4 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_TREE=y
# CONFIG_IKCONFIG is not set
-CONFIG_LOG_BUF_SHIFT=17
+CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
# CONFIG_CGROUP_DEBUG is not set
CONFIG_CGROUP_NS=y
@@ -287,7 +287,6 @@ CONFIG_MTRR=y
# CONFIG_MTRR_SANITIZER is not set
CONFIG_X86_PAT=y
CONFIG_EFI=y
-# CONFIG_IRQBALANCE is not set
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
@@ -299,7 +298,7 @@ CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
+# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_ALIGN=0x200000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f0a03d7a7d63..0e7dbc0a3e46 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_TREE=y
# CONFIG_IKCONFIG is not set
-CONFIG_LOG_BUF_SHIFT=17
+CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
# CONFIG_CGROUP_DEBUG is not set
CONFIG_CGROUP_NS=y
@@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
CONFIG_PHYSICAL_START=0x1000000
-CONFIG_RELOCATABLE=y
+# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_ALIGN=0x200000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
index 070afc5b6c94..b9d00261703c 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -6,13 +6,22 @@
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2A: Instruction Set Reference, A-M
*
- * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
- * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
*
* This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
#include <linux/init.h>
@@ -75,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
* If your algorithm starts with ~0, then XOR with ~0 before you set
* the seed.
*/
-static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
+static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
- u32 *mctx = crypto_ahash_ctx(hash);
+ u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
- crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
-static int crc32c_intel_init(struct ahash_request *req)
+static int crc32c_intel_init(struct shash_desc *desc)
{
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 *crcp = ahash_request_ctx(req);
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
*crcp = *mctx;
return 0;
}
-static int crc32c_intel_update(struct ahash_request *req)
+static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct crypto_hash_walk walk;
- u32 *crcp = ahash_request_ctx(req);
- u32 crc = *crcp;
- int nbytes;
-
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+ u32 *crcp = shash_desc_ctx(desc);
- *crcp = crc;
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
-static int crc32c_intel_final(struct ahash_request *req)
+static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
{
- u32 *crcp = ahash_request_ctx(req);
-
- *(__le32 *)req->result = ~cpu_to_le32p(crcp);
+ *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
-static int crc32c_intel_digest(struct ahash_request *req)
+static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- struct crypto_hash_walk walk;
- u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
- u32 crc = *mctx;
- int nbytes;
+ return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
- for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
- nbytes = crypto_hash_walk_done(&walk, 0))
- crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
- *(__le32 *)req->result = ~cpu_to_le32(crc);
+ *(__le32 *)out = ~cpu_to_le32p(crcp);
return 0;
}
+static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
- tfm->crt_ahash.reqsize = sizeof(u32);
-
return 0;
}
-static struct crypto_alg alg = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-intel",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_AHASH,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_alignmask = 3,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(alg.cra_list),
- .cra_init = crc32c_intel_cra_init,
- .cra_type = &crypto_ahash_type,
- .cra_u = {
- .ahash = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = crc32c_intel_setkey,
- .init = crc32c_intel_init,
- .update = crc32c_intel_update,
- .final = crc32c_intel_final,
- .digest = crc32c_intel_digest,
- }
+static struct shash_alg alg = {
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .finup = crc32c_intel_finup,
+ .digest = crc32c_intel_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32c_intel_cra_init,
}
};
@@ -175,14 +177,14 @@ static struct crypto_alg alg = {
static int __init crc32c_intel_mod_init(void)
{
if (cpu_has_xmm4_2)
- return crypto_register_alg(&alg);
+ return crypto_register_shash(&alg);
else
return -ENODEV;
}
static void __exit crc32c_intel_mod_fini(void)
{
- crypto_unregister_alg(&alg);
+ crypto_unregister_shash(&alg);
}
module_init(crc32c_intel_mod_init);
@@ -194,4 +196,3 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("crc32c");
MODULE_ALIAS("crc32c-intel");
-
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 127ec3f07214..2a4d073d2cf1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
current->mm->cached_hole_size = 0;
current->mm->mmap = NULL;
- compute_creds(bprm);
+ install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
if (N_MAGIC(ex) == OMAGIC) {
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 4bc02b23674b..b195f85526e3 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -32,6 +32,8 @@
#include <asm/proto.h>
#include <asm/vdso.h>
+#include <asm/sigframe.h>
+
#define DEBUG_SIG 0
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,7 +43,6 @@
X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
X86_EFLAGS_CF)
-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
@@ -173,47 +174,28 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
/*
* Do a signal return; undo the signal stack.
*/
+#define COPY(x) { \
+ err |= __get_user(regs->x, &sc->x); \
+}
-struct sigframe
-{
- u32 pretcode;
- int sig;
- struct sigcontext_ia32 sc;
- struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
- unsigned int extramask[_COMPAT_NSIG_WORDS-1];
- char retcode[8];
- /* fp state follows here */
-};
-
-struct rt_sigframe
-{
- u32 pretcode;
- int sig;
- u32 pinfo;
- u32 puc;
- compat_siginfo_t info;
- struct ucontext_ia32 uc;
- char retcode[8];
- /* fp state follows here */
-};
-
-#define COPY(x) { \
- unsigned int reg; \
- err |= __get_user(reg, &sc->x); \
- regs->x = reg; \
+#define COPY_SEG_CPL3(seg) { \
+ unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->seg = tmp | 3; \
}
-#define RELOAD_SEG(seg,mask) \
- { unsigned int cur; \
- unsigned short pre; \
- err |= __get_user(pre, &sc->seg); \
- savesegment(seg, cur); \
- pre |= mask; \
- if (pre != cur) loadsegment(seg, pre); }
+#define RELOAD_SEG(seg) { \
+ unsigned int cur, pre; \
+ err |= __get_user(pre, &sc->seg); \
+ savesegment(seg, cur); \
+ pre |= 3; \
+ if (pre != cur) \
+ loadsegment(seg, pre); \
+}
static int ia32_restore_sigcontext(struct pt_regs *regs,
struct sigcontext_ia32 __user *sc,
- unsigned int *peax)
+ unsigned int *pax)
{
unsigned int tmpflags, gs, oldgs, err = 0;
void __user *buf;
@@ -240,18 +222,16 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
if (gs != oldgs)
load_gs_index(gs);
- RELOAD_SEG(fs, 3);
- RELOAD_SEG(ds, 3);
- RELOAD_SEG(es, 3);
+ RELOAD_SEG(fs);
+ RELOAD_SEG(ds);
+ RELOAD_SEG(es);
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
/* Don't touch extended registers */
- err |= __get_user(regs->cs, &sc->cs);
- regs->cs |= 3;
- err |= __get_user(regs->ss, &sc->ss);
- regs->ss |= 3;
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
err |= __get_user(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -262,15 +242,13 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
buf = compat_ptr(tmp);
err |= restore_i387_xstate_ia32(buf);
- err |= __get_user(tmp, &sc->ax);
- *peax = tmp;
-
+ err |= __get_user(*pax, &sc->ax);
return err;
}
asmlinkage long sys32_sigreturn(struct pt_regs *regs)
{
- struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8);
+ struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
sigset_t set;
unsigned int ax;
@@ -300,12 +278,12 @@ badframe:
asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
{
- struct rt_sigframe __user *frame;
+ struct rt_sigframe_ia32 __user *frame;
sigset_t set;
unsigned int ax;
struct pt_regs tregs;
- frame = (struct rt_sigframe __user *)(regs->sp - 4);
+ frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -359,20 +337,15 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
err |= __put_user(regs->dx, &sc->dx);
err |= __put_user(regs->cx, &sc->cx);
err |= __put_user(regs->ax, &sc->ax);
- err |= __put_user(regs->cs, &sc->cs);
- err |= __put_user(regs->ss, &sc->ss);
err |= __put_user(current->thread.trap_no, &sc->trapno);
err |= __put_user(current->thread.error_code, &sc->err);
err |= __put_user(regs->ip, &sc->ip);
+ err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
err |= __put_user(regs->flags, &sc->flags);
err |= __put_user(regs->sp, &sc->sp_at_signal);
+ err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
- tmp = save_i387_xstate_ia32(fpstate);
- if (tmp < 0)
- err = -EFAULT;
- else
- err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
- &sc->fpstate);
+ err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate);
/* non-iBCS2 extensions.. */
err |= __put_user(mask, &sc->oldmask);
@@ -400,7 +373,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
}
/* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER_DS &&
+ else if ((regs->ss & 0xffff) != __USER32_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer)
sp = (unsigned long) ka->sa.sa_restorer;
@@ -408,6 +381,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
if (used_math()) {
sp = sp - sig_xstate_ia32_size;
*fpstate = (struct _fpstate_ia32 *) sp;
+ if (save_i387_xstate_ia32(*fpstate) < 0)
+ return (void __user *) -1L;
}
sp -= frame_size;
@@ -420,7 +395,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
int ia32_setup_frame(int sig, struct k_sigaction *ka,
compat_sigset_t *set, struct pt_regs *regs)
{
- struct sigframe __user *frame;
+ struct sigframe_ia32 __user *frame;
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
@@ -430,12 +405,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
u16 poplmovl;
u32 val;
u16 int80;
- u16 pad;
} __attribute__((packed)) code = {
0xb858, /* popl %eax ; movl $...,%eax */
__NR_ia32_sigreturn,
0x80cd, /* int $0x80 */
- 0,
};
frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
@@ -471,7 +444,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
* These are actually not used anymore, but left because some
* gdb versions depend on them as a marker.
*/
- err |= __copy_to_user(frame->retcode, &code, 8);
+ err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
if (err)
return -EFAULT;
@@ -501,7 +474,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
compat_sigset_t *set, struct pt_regs *regs)
{
- struct rt_sigframe __user *frame;
+ struct rt_sigframe_ia32 __user *frame;
void __user *restorer;
int err = 0;
void __user *fpstate = NULL;
@@ -511,8 +484,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
u8 movl;
u32 val;
u16 int80;
- u16 pad;
- u8 pad2;
+ u8 pad;
} __attribute__((packed)) code = {
0xb8,
__NR_ia32_rt_sigreturn,
@@ -559,7 +531,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
* Not actually used anymore, but left because some gdb
* versions need it.
*/
- err |= __copy_to_user(frame->retcode, &code, 8);
+ err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
if (err)
return -EFAULT;
@@ -572,11 +544,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- /* Make -mregparm=3 work */
- regs->ax = sig;
- regs->dx = (unsigned long) &frame->info;
- regs->cx = (unsigned long) &frame->uc;
-
loadsegment(ds, __USER32_DS);
loadsegment(es, __USER32_DS);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index eb4314768bf7..256b00b61892 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -571,8 +571,8 @@ ia32_sys_call_table:
.quad compat_sys_setrlimit /* 75 */
.quad compat_sys_old_getrlimit /* old_getrlimit */
.quad compat_sys_getrusage
- .quad sys32_gettimeofday
- .quad sys32_settimeofday
+ .quad compat_sys_gettimeofday
+ .quad compat_sys_settimeofday
.quad sys_getgroups16 /* 80 */
.quad sys_setgroups16
.quad sys32_old_select
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index beda4232ce69..2e09dcd3c0a6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@
#define AA(__x) ((unsigned long)(__x))
-int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
-{
- compat_ino_t ino;
-
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- SET_UID(uid, kbuf->uid);
- SET_GID(gid, kbuf->gid);
- if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
- return -EOVERFLOW;
- if (kbuf->size >= 0x7fffffff)
- return -EOVERFLOW;
- ino = kbuf->ino;
- if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
- return -EOVERFLOW;
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
- __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
- __put_user(ino, &ubuf->st_ino) ||
- __put_user(kbuf->mode, &ubuf->st_mode) ||
- __put_user(kbuf->nlink, &ubuf->st_nlink) ||
- __put_user(uid, &ubuf->st_uid) ||
- __put_user(gid, &ubuf->st_gid) ||
- __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
- __put_user(kbuf->size, &ubuf->st_size) ||
- __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
- __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
- __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
- __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
- __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
- __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
- __put_user(kbuf->blksize, &ubuf->st_blksize) ||
- __put_user(kbuf->blocks, &ubuf->st_blocks))
- return -EFAULT;
- return 0;
-}
asmlinkage long sys32_truncate64(char __user *filename,
unsigned long offset_low,
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
return 0;
}
-static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
-{
- int err = -EFAULT;
-
- if (access_ok(VERIFY_READ, i, sizeof(*i))) {
- err = __get_user(o->tv_sec, &i->tv_sec);
- err |= __get_user(o->tv_usec, &i->tv_usec);
- }
- return err;
-}
-
-static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
-{
- int err = -EFAULT;
-
- if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
- err = __put_user(i->tv_sec, &o->tv_sec);
- err |= __put_user(i->tv_usec, &o->tv_usec);
- }
- return err;
-}
-
asmlinkage long sys32_alarm(unsigned int seconds)
{
return alarm_setitimer(seconds);
}
-/*
- * Translations due to time_t size differences. Which affects all
- * sorts of things, like timeval and itimerval.
- */
-asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
-{
- if (tv) {
- struct timeval ktv;
-
- do_gettimeofday(&ktv);
- if (put_tv32(tv, &ktv))
- return -EFAULT;
- }
- if (tz) {
- if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
- return -EFAULT;
- }
- return 0;
-}
-
-asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
- struct timezone __user *tz)
-{
- struct timeval ktv;
- struct timespec kts;
- struct timezone ktz;
-
- if (tv) {
- if (get_tv32(&ktv, tv))
- return -EFAULT;
- kts.tv_sec = ktv.tv_sec;
- kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
- }
- if (tz) {
- if (copy_from_user(&ktz, tz, sizeof(ktz)))
- return -EFAULT;
- }
-
- return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
-}
-
struct sel_arg_struct {
unsigned int n;
unsigned int inp;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
new file mode 100644
index 000000000000..4a8e80cdcfa5
--- /dev/null
+++ b/arch/x86/include/asm/Kbuild
@@ -0,0 +1,24 @@
+include include/asm-generic/Kbuild.asm
+
+header-y += boot.h
+header-y += bootparam.h
+header-y += debugreg.h
+header-y += ldt.h
+header-y += msr-index.h
+header-y += prctl.h
+header-y += ptrace-abi.h
+header-y += sigcontext32.h
+header-y += ucontext.h
+header-y += processor-flags.h
+
+unifdef-y += e820.h
+unifdef-y += ist.h
+unifdef-y += mce.h
+unifdef-y += msr.h
+unifdef-y += mtrr.h
+unifdef-y += posix_types_32.h
+unifdef-y += posix_types_64.h
+unifdef-y += unistd_32.h
+unifdef-y += unistd_64.h
+unifdef-y += vm86.h
+unifdef-y += vsyscall.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
new file mode 100644
index 000000000000..37822206083e
--- /dev/null
+++ b/arch/x86/include/asm/a.out-core.h
@@ -0,0 +1,73 @@
+/* a.out coredump register dumper
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_A_OUT_CORE_H
+#define _ASM_X86_A_OUT_CORE_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_X86_32
+
+#include <linux/user.h>
+#include <linux/elfcore.h>
+
+/*
+ * fill in the user structure for an a.out core dump
+ */
+static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
+{
+ u16 gs;
+
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
+ >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_ssize = 0;
+ dump->u_debugreg[0] = current->thread.debugreg0;
+ dump->u_debugreg[1] = current->thread.debugreg1;
+ dump->u_debugreg[2] = current->thread.debugreg2;
+ dump->u_debugreg[3] = current->thread.debugreg3;
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+ dump->u_debugreg[7] = current->thread.debugreg7;
+
+ if (dump->start_stack < TASK_SIZE)
+ dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
+ >> PAGE_SHIFT;
+
+ dump->regs.bx = regs->bx;
+ dump->regs.cx = regs->cx;
+ dump->regs.dx = regs->dx;
+ dump->regs.si = regs->si;
+ dump->regs.di = regs->di;
+ dump->regs.bp = regs->bp;
+ dump->regs.ax = regs->ax;
+ dump->regs.ds = (u16)regs->ds;
+ dump->regs.es = (u16)regs->es;
+ dump->regs.fs = (u16)regs->fs;
+ savesegment(gs, gs);
+ dump->regs.orig_ax = regs->orig_ax;
+ dump->regs.ip = regs->ip;
+ dump->regs.cs = (u16)regs->cs;
+ dump->regs.flags = regs->flags;
+ dump->regs.sp = regs->sp;
+ dump->regs.ss = (u16)regs->ss;
+
+ dump->u_fpvalid = dump_fpu(regs, &dump->i387);
+}
+
+#endif /* CONFIG_X86_32 */
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h
new file mode 100644
index 000000000000..4684f97a5bbd
--- /dev/null
+++ b/arch/x86/include/asm/a.out.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_A_OUT_H
+#define _ASM_X86_A_OUT_H
+
+struct exec
+{
+ unsigned int a_info; /* Use macros N_MAGIC, etc for access */
+ unsigned a_text; /* length of text, in bytes */
+ unsigned a_data; /* length of data, in bytes */
+ unsigned a_bss; /* length of uninitialized data area for file, in bytes */
+ unsigned a_syms; /* length of symbol table data in file, in bytes */
+ unsigned a_entry; /* start address */
+ unsigned a_trsize; /* length of relocation info for text, in bytes */
+ unsigned a_drsize; /* length of relocation info for data, in bytes */
+};
+
+#define N_TRSIZE(a) ((a).a_trsize)
+#define N_DRSIZE(a) ((a).a_drsize)
+#define N_SYMSIZE(a) ((a).a_syms)
+
+#endif /* _ASM_X86_A_OUT_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
new file mode 100644
index 000000000000..9830681446ad
--- /dev/null
+++ b/arch/x86/include/asm/acpi.h
@@ -0,0 +1,177 @@
+#ifndef _ASM_X86_ACPI_H
+#define _ASM_X86_ACPI_H
+
+/*
+ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <acpi/pdc_intel.h>
+
+#include <asm/numa.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mpspec.h>
+
+#define COMPILER_DEPENDENT_INT64 long long
+#define COMPILER_DEPENDENT_UINT64 unsigned long long
+
+/*
+ * Calling conventions:
+ *
+ * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
+ * ACPI_EXTERNAL_XFACE - External ACPI interfaces
+ * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
+ * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
+ */
+#define ACPI_SYSTEM_XFACE
+#define ACPI_EXTERNAL_XFACE
+#define ACPI_INTERNAL_XFACE
+#define ACPI_INTERNAL_VAR_XFACE
+
+/* Asm macros */
+
+#define ACPI_ASM_MACROS
+#define BREAKPOINT3
+#define ACPI_DISABLE_IRQS() local_irq_disable()
+#define ACPI_ENABLE_IRQS() local_irq_enable()
+#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+ asm("divl %2;" \
+ : "=a"(q32), "=d"(r32) \
+ : "r"(d32), \
+ "0"(n_lo), "1"(n_hi))
+
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+ asm("shrl $1,%2 ;" \
+ "rcrl $1,%3;" \
+ : "=r"(n_hi), "=r"(n_lo) \
+ : "0"(n_hi), "1"(n_lo))
+
+#ifdef CONFIG_ACPI
+extern int acpi_lapic;
+extern int acpi_ioapic;
+extern int acpi_noirq;
+extern int acpi_strict;
+extern int acpi_disabled;
+extern int acpi_ht;
+extern int acpi_pci_disabled;
+extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
+
+extern u8 acpi_sci_flags;
+extern int acpi_sci_override_gsi;
+void acpi_pic_sci_set_trigger(unsigned int, u16);
+
+static inline void disable_acpi(void)
+{
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ acpi_pci_disabled = 1;
+ acpi_noirq = 1;
+}
+
+/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
+#define FIX_ACPI_PAGES 4
+
+extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+
+static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
+static inline void acpi_disable_pci(void)
+{
+ acpi_pci_disabled = 1;
+ acpi_noirq_set();
+}
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+extern void acpi_restore_state_mem(void);
+
+extern unsigned long acpi_wakeup_address;
+
+/* early initialization routine */
+extern void acpi_reserve_bootmem(void);
+
+/*
+ * Check if the CPU can handle C2 and deeper
+ */
+static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
+{
+ /*
+ * Early models (<=5) of AMD Opterons are not supposed to go into
+ * C2 state.
+ *
+ * Steppings 0x0A and later are good
+ */
+ if (boot_cpu_data.x86 == 0x0F &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86_model <= 0x05 &&
+ boot_cpu_data.x86_mask < 0x0A)
+ return 1;
+ else if (boot_cpu_has(X86_FEATURE_AMDC1E))
+ return 1;
+ else
+ return max_cstate;
+}
+
+#else /* !CONFIG_ACPI */
+
+#define acpi_lapic 0
+#define acpi_ioapic 0
+static inline void acpi_noirq_set(void) { }
+static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }
+
+#endif /* !CONFIG_ACPI */
+
+#define ARCH_HAS_POWER_INIT 1
+
+struct bootnode;
+
+#ifdef CONFIG_ACPI_NUMA
+extern int acpi_numa;
+extern int acpi_scan_nodes(unsigned long start, unsigned long end);
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
+ int num_nodes);
+#else
+static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
+ int num_nodes)
+{
+}
+#endif
+
+#define acpi_unlazy_tlb(x) leave_mm(x)
+
+#endif /* _ASM_X86_ACPI_H */
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
new file mode 100644
index 000000000000..9825cd64c9b6
--- /dev/null
+++ b/arch/x86/include/asm/agp.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cachability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) set_pages_uc(page, 1)
+#define unmap_page_from_agp(page) set_pages_wb(page, 1)
+
+/*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
+ * need to be called for each cacheline of the whole page so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+/* Convert a physical address to an address suitable for the GART. */
+#define phys_to_gart(x) (x)
+#define gart_to_phys(x) (x)
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order) \
+ ((char *)__get_free_pages(GFP_KERNEL, (order)))
+#define free_gatt_pages(table, order) \
+ free_pages((unsigned long)(table), (order))
+
+#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
new file mode 100644
index 000000000000..e2077d343c33
--- /dev/null
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -0,0 +1,22 @@
+#ifdef __ASSEMBLY__
+
+#ifdef CONFIG_X86_32
+# define X86_ALIGN .long
+#else
+# define X86_ALIGN .quad
+#endif
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+1: lock
+ .section .smp_locks,"a"
+ .align 4
+ X86_ALIGN 1b
+ .previous
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
new file mode 100644
index 000000000000..f6aa18eadf71
--- /dev/null
+++ b/arch/x86/include/asm/alternative.h
@@ -0,0 +1,183 @@
+#ifndef _ASM_X86_ALTERNATIVE_H
+#define _ASM_X86_ALTERNATIVE_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/asm.h>
+
+/*
+ * Alternative inline assembly for SMP.
+ *
+ * The LOCK_PREFIX macro defined here replaces the LOCK and
+ * LOCK_PREFIX macros used everywhere in the source tree.
+ *
+ * SMP alternatives use the same data structures as the other
+ * alternatives and the X86_FEATURE_UP flag to indicate the case of a
+ * UP system running a SMP kernel. The existing apply_alternatives()
+ * works fine for patching a SMP kernel for UP.
+ *
+ * The SMP alternative tables can be kept after boot and contain both
+ * UP and SMP versions of the instructions to allow switching back to
+ * SMP at runtime, when hotplugging in a new CPU, which is especially
+ * useful in virtualized environments.
+ *
+ * The very common lock prefix is handled as special case in a
+ * separate table which is a pure address list without replacement ptr
+ * and size information. That keeps the table sizes small.
+ */
+
+#ifdef CONFIG_SMP
+#define LOCK_PREFIX \
+ ".section .smp_locks,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661f\n" /* address */ \
+ ".previous\n" \
+ "661:\n\tlock; "
+
+#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX ""
+#endif
+
+/* This must be included *after* the definition of LOCK_PREFIX */
+#include <asm/cpufeature.h>
+
+struct alt_instr {
+ u8 *instr; /* original instruction */
+ u8 *replacement;
+ u8 cpuid; /* cpuid bit set for replacement */
+ u8 instrlen; /* length of original instruction */
+ u8 replacementlen; /* length of new instruction, <= instrlen */
+ u8 pad1;
+#ifdef CONFIG_X86_64
+ u32 pad2;
+#endif
+};
+
+extern void alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+struct module;
+
+#ifdef CONFIG_SMP
+extern void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end);
+extern void alternatives_smp_module_del(struct module *mod);
+extern void alternatives_smp_switch(int smp);
+#else
+static inline void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end) {}
+static inline void alternatives_smp_module_del(struct module *mod) {}
+static inline void alternatives_smp_switch(int smp) {}
+#endif /* CONFIG_SMP */
+
+const unsigned char *const *find_nop_table(void);
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * length of oldinstr must be longer or equal the length of newinstr
+ * It can be padded with nops as needed.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c0\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" :: "i" (feature) : "memory")
+
+/*
+ * Alternative inline assembly with input.
+ *
+ * Pecularities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c0\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" :: "i" (feature), ##input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile ("661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte %c[feat]\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement,\"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous" : output : [feat] "i" (feature), ##input)
+
+/*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
+struct paravirt_patch_site;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end);
+#else
+static inline void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{}
+#define __parainstructions NULL
+#define __parainstructions_end NULL
+#endif
+
+extern void add_nops(void *insns, unsigned int len);
+
+/*
+ * Clear and restore the kernel write-protection flag on the local CPU.
+ * Allows the kernel to edit read-only pages.
+ * Side-effect: any interrupt handler running between save and restore will have
+ * the ability to write to read-only pages.
+ *
+ * Warning:
+ * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
+ * no thread can be preempted in the instructions being modified (no iret to an
+ * invalid instruction possible) or if the instructions are changed from a
+ * consistent state to another consistent state atomically.
+ * More care must be taken when modifying code in the SMP case because of
+ * Intel's errata.
+ * On the local CPU you need to be protected again NMI or MCE handlers seeing an
+ * inconsistent instruction while you patch.
+ * The _early version expects the memory to already be RW.
+ */
+
+extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
+#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
new file mode 100644
index 000000000000..f712344329bc
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_H
+#define _ASM_X86_AMD_IOMMU_H
+
+#include <linux/irqreturn.h>
+
+#ifdef CONFIG_AMD_IOMMU
+extern int amd_iommu_init(void);
+extern int amd_iommu_init_dma_ops(void);
+extern void amd_iommu_detect(void);
+extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+#else
+static inline int amd_iommu_init(void) { return -ENODEV; }
+static inline void amd_iommu_detect(void) { }
+#endif
+
+#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
new file mode 100644
index 000000000000..ac302a2fa339
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
+#define _ASM_X86_AMD_IOMMU_TYPES_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/*
+ * some size calculation constants
+ */
+#define DEV_TABLE_ENTRY_SIZE 32
+#define ALIAS_TABLE_ENTRY_SIZE 2
+#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
+
+/* Length of the MMIO region for the AMD IOMMU */
+#define MMIO_REGION_LENGTH 0x4000
+
+/* Capability offsets used by the driver */
+#define MMIO_CAP_HDR_OFFSET 0x00
+#define MMIO_RANGE_OFFSET 0x0c
+#define MMIO_MISC_OFFSET 0x10
+
+/* Masks, shifts and macros to parse the device range capability */
+#define MMIO_RANGE_LD_MASK 0xff000000
+#define MMIO_RANGE_FD_MASK 0x00ff0000
+#define MMIO_RANGE_BUS_MASK 0x0000ff00
+#define MMIO_RANGE_LD_SHIFT 24
+#define MMIO_RANGE_FD_SHIFT 16
+#define MMIO_RANGE_BUS_SHIFT 8
+#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
+#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
+#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
+#define MMIO_MSI_NUM(x) ((x) & 0x1f)
+
+/* Flag masks for the AMD IOMMU exclusion range */
+#define MMIO_EXCL_ENABLE_MASK 0x01ULL
+#define MMIO_EXCL_ALLOW_MASK 0x02ULL
+
+/* Used offsets into the MMIO space */
+#define MMIO_DEV_TABLE_OFFSET 0x0000
+#define MMIO_CMD_BUF_OFFSET 0x0008
+#define MMIO_EVT_BUF_OFFSET 0x0010
+#define MMIO_CONTROL_OFFSET 0x0018
+#define MMIO_EXCL_BASE_OFFSET 0x0020
+#define MMIO_EXCL_LIMIT_OFFSET 0x0028
+#define MMIO_CMD_HEAD_OFFSET 0x2000
+#define MMIO_CMD_TAIL_OFFSET 0x2008
+#define MMIO_EVT_HEAD_OFFSET 0x2010
+#define MMIO_EVT_TAIL_OFFSET 0x2018
+#define MMIO_STATUS_OFFSET 0x2020
+
+/* MMIO status bits */
+#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
+
+/* event logging constants */
+#define EVENT_ENTRY_SIZE 0x10
+#define EVENT_TYPE_SHIFT 28
+#define EVENT_TYPE_MASK 0xf
+#define EVENT_TYPE_ILL_DEV 0x1
+#define EVENT_TYPE_IO_FAULT 0x2
+#define EVENT_TYPE_DEV_TAB_ERR 0x3
+#define EVENT_TYPE_PAGE_TAB_ERR 0x4
+#define EVENT_TYPE_ILL_CMD 0x5
+#define EVENT_TYPE_CMD_HARD_ERR 0x6
+#define EVENT_TYPE_IOTLB_INV_TO 0x7
+#define EVENT_TYPE_INV_DEV_REQ 0x8
+#define EVENT_DEVID_MASK 0xffff
+#define EVENT_DEVID_SHIFT 0
+#define EVENT_DOMID_MASK 0xffff
+#define EVENT_DOMID_SHIFT 0
+#define EVENT_FLAGS_MASK 0xfff
+#define EVENT_FLAGS_SHIFT 0x10
+
+/* feature control bits */
+#define CONTROL_IOMMU_EN 0x00ULL
+#define CONTROL_HT_TUN_EN 0x01ULL
+#define CONTROL_EVT_LOG_EN 0x02ULL
+#define CONTROL_EVT_INT_EN 0x03ULL
+#define CONTROL_COMWAIT_EN 0x04ULL
+#define CONTROL_PASSPW_EN 0x08ULL
+#define CONTROL_RESPASSPW_EN 0x09ULL
+#define CONTROL_COHERENT_EN 0x0aULL
+#define CONTROL_ISOC_EN 0x0bULL
+#define CONTROL_CMDBUF_EN 0x0cULL
+#define CONTROL_PPFLOG_EN 0x0dULL
+#define CONTROL_PPFINT_EN 0x0eULL
+
+/* command specific defines */
+#define CMD_COMPL_WAIT 0x01
+#define CMD_INV_DEV_ENTRY 0x02
+#define CMD_INV_IOMMU_PAGES 0x03
+
+#define CMD_COMPL_WAIT_STORE_MASK 0x01
+#define CMD_COMPL_WAIT_INT_MASK 0x02
+#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
+#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
+
+#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
+
+/* macros and definitions for device table entries */
+#define DEV_ENTRY_VALID 0x00
+#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_IR 0x3d
+#define DEV_ENTRY_IW 0x3e
+#define DEV_ENTRY_NO_PAGE_FAULT 0x62
+#define DEV_ENTRY_EX 0x67
+#define DEV_ENTRY_SYSMGT1 0x68
+#define DEV_ENTRY_SYSMGT2 0x69
+#define DEV_ENTRY_INIT_PASS 0xb8
+#define DEV_ENTRY_EINT_PASS 0xb9
+#define DEV_ENTRY_NMI_PASS 0xba
+#define DEV_ENTRY_LINT0_PASS 0xbe
+#define DEV_ENTRY_LINT1_PASS 0xbf
+#define DEV_ENTRY_MODE_MASK 0x07
+#define DEV_ENTRY_MODE_SHIFT 0x09
+
+/* constants to configure the command buffer */
+#define CMD_BUFFER_SIZE 8192
+#define CMD_BUFFER_ENTRIES 512
+#define MMIO_CMD_SIZE_SHIFT 56
+#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+
+/* constants for event buffer handling */
+#define EVT_BUFFER_SIZE 8192 /* 512 entries */
+#define EVT_LEN_MASK (0x9ULL << 56)
+
+#define PAGE_MODE_1_LEVEL 0x01
+#define PAGE_MODE_2_LEVEL 0x02
+#define PAGE_MODE_3_LEVEL 0x03
+
+#define IOMMU_PDE_NL_0 0x000ULL
+#define IOMMU_PDE_NL_1 0x200ULL
+#define IOMMU_PDE_NL_2 0x400ULL
+#define IOMMU_PDE_NL_3 0x600ULL
+
+#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
+#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
+#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
+
+#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
+#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
+#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
+
+#define IOMMU_PTE_P (1ULL << 0)
+#define IOMMU_PTE_TV (1ULL << 1)
+#define IOMMU_PTE_U (1ULL << 59)
+#define IOMMU_PTE_FC (1ULL << 60)
+#define IOMMU_PTE_IR (1ULL << 61)
+#define IOMMU_PTE_IW (1ULL << 62)
+
+#define IOMMU_L1_PDE(address) \
+ ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define IOMMU_L2_PDE(address) \
+ ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+
+#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
+#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
+#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
+
+#define IOMMU_PROT_MASK 0x03
+#define IOMMU_PROT_IR 0x01
+#define IOMMU_PROT_IW 0x02
+
+/* IOMMU capabilities */
+#define IOMMU_CAP_IOTLB 24
+#define IOMMU_CAP_NPCACHE 26
+
+#define MAX_DOMAIN_ID 65536
+
+/* FIXME: move this macro to <linux/pci.h> */
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+
+/*
+ * This structure contains generic data for IOMMU protection domains
+ * independent of their use.
+ */
+struct protection_domain {
+ spinlock_t lock; /* mostly used to lock the page table*/
+ u16 id; /* the domain id written to the device table */
+ int mode; /* paging mode (0-6 levels) */
+ u64 *pt_root; /* page table root pointer */
+ void *priv; /* private data */
+};
+
+/*
+ * Data container for a dma_ops specific protection domain
+ */
+struct dma_ops_domain {
+ struct list_head list;
+
+ /* generic protection domain information */
+ struct protection_domain domain;
+
+ /* size of the aperture for the mappings */
+ unsigned long aperture_size;
+
+ /* address we start to search for free addresses */
+ unsigned long next_bit;
+
+ /* address allocation bitmap */
+ unsigned long *bitmap;
+
+ /*
+ * Array of PTE pages for the aperture. In this array we save all the
+ * leaf pages of the domain page table used for the aperture. This way
+ * we don't need to walk the page table to find a specific PTE. We can
+ * just calculate its address in constant time.
+ */
+ u64 **pte_pages;
+
+ /* This will be set to true when TLB needs to be flushed */
+ bool need_flush;
+
+ /*
+ * if this is a preallocated domain, keep the device for which it was
+ * preallocated in this variable
+ */
+ u16 target_dev;
+};
+
+/*
+ * Structure where we save information about one hardware AMD IOMMU in the
+ * system.
+ */
+struct amd_iommu {
+ struct list_head list;
+
+ /* locks the accesses to the hardware */
+ spinlock_t lock;
+
+ /* Pointer to PCI device of this IOMMU */
+ struct pci_dev *dev;
+
+ /* physical address of MMIO space */
+ u64 mmio_phys;
+ /* virtual address of MMIO space */
+ u8 *mmio_base;
+
+ /* capabilities of that IOMMU read from ACPI */
+ u32 cap;
+
+ /*
+ * Capability pointer. There could be more than one IOMMU per PCI
+ * device function if there are more than one AMD IOMMU capability
+ * pointers.
+ */
+ u16 cap_ptr;
+
+ /* pci domain of this IOMMU */
+ u16 pci_seg;
+
+ /* first device this IOMMU handles. read from PCI */
+ u16 first_device;
+ /* last device this IOMMU handles. read from PCI */
+ u16 last_device;
+
+ /* start of exclusion range of that IOMMU */
+ u64 exclusion_start;
+ /* length of exclusion range of that IOMMU */
+ u64 exclusion_length;
+
+ /* command buffer virtual address */
+ u8 *cmd_buf;
+ /* size of command buffer */
+ u32 cmd_buf_size;
+
+ /* size of event buffer */
+ u32 evt_buf_size;
+ /* event buffer virtual address */
+ u8 *evt_buf;
+ /* MSI number for event interrupt */
+ u16 evt_msi_num;
+
+ /* true if interrupts for this IOMMU are already enabled */
+ bool int_enabled;
+
+ /* if one, we need to send a completion wait command */
+ int need_sync;
+
+ /* default dma_ops domain for that IOMMU */
+ struct dma_ops_domain *default_dom;
+};
+
+/*
+ * List with all IOMMUs in the system. This list is not locked because it is
+ * only written and read at driver initialization or suspend time
+ */
+extern struct list_head amd_iommu_list;
+
+/*
+ * Structure defining one entry in the device table
+ */
+struct dev_table_entry {
+ u32 data[8];
+};
+
+/*
+ * One entry for unity mappings parsed out of the ACPI table.
+ */
+struct unity_map_entry {
+ struct list_head list;
+
+ /* starting device id this entry is used for (including) */
+ u16 devid_start;
+ /* end device id this entry is used for (including) */
+ u16 devid_end;
+
+ /* start address to unity map (including) */
+ u64 address_start;
+ /* end address to unity map (including) */
+ u64 address_end;
+
+ /* required protection */
+ int prot;
+};
+
+/*
+ * List of all unity mappings. It is not locked because as runtime it is only
+ * read. It is created at ACPI table parsing time.
+ */
+extern struct list_head amd_iommu_unity_map;
+
+/*
+ * Data structures for device handling
+ */
+
+/*
+ * Device table used by hardware. Read and write accesses by software are
+ * locked with the amd_iommu_pd_table lock.
+ */
+extern struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * Alias table to find requestor ids to device ids. Not locked because only
+ * read on runtime.
+ */
+extern u16 *amd_iommu_alias_table;
+
+/*
+ * Reverse lookup table to find the IOMMU which translates a specific device.
+ */
+extern struct amd_iommu **amd_iommu_rlookup_table;
+
+/* size of the dma_ops aperture as power of 2 */
+extern unsigned amd_iommu_aperture_order;
+
+/* largest PCI device id we expect translation requests for */
+extern u16 amd_iommu_last_bdf;
+
+/* data structures for protection domain handling */
+extern struct protection_domain **amd_iommu_pd_table;
+
+/* allocation bitmap for domain ids */
+extern unsigned long *amd_iommu_pd_alloc_bitmap;
+
+/* will be 1 if device isolation is enabled */
+extern int amd_iommu_isolate;
+
+/*
+ * If true, the addresses will be flushed on unmap time, not when
+ * they are reused
+ */
+extern bool amd_iommu_unmap_flush;
+
+/* takes a PCI device id and prints it out in a readable form */
+static inline void print_devid(u16 devid, int nl)
+{
+ int bus = devid >> 8;
+ int dev = devid >> 3 & 0x1f;
+ int fn = devid & 0x07;
+
+ printk("%02x:%02x.%x", bus, dev, fn);
+ if (nl)
+ printk("\n");
+}
+
+/* takes bus and device/function and returns the device id
+ * FIXME: should that be in generic PCI code? */
+static inline u16 calc_devid(u8 bus, u8 devfn)
+{
+ return (((u16)bus) << 8) | devfn;
+}
+
+#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
new file mode 100644
index 000000000000..25caa0738af5
--- /dev/null
+++ b/arch/x86/include/asm/apic.h
@@ -0,0 +1,200 @@
+#ifndef _ASM_X86_APIC_H
+#define _ASM_X86_APIC_H
+
+#include <linux/pm.h>
+#include <linux/delay.h>
+
+#include <asm/alternative.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+#define ARCH_APICTIMER_STOPS_ON_C3 1
+
+/*
+ * Debugging macros
+ */
+#define APIC_QUIET 0
+#define APIC_VERBOSE 1
+#define APIC_DEBUG 2
+
+/*
+ * Define the default level of output to be very little
+ * This can be turned up by using apic=verbose for more
+ * information and apic=debug for _lots_ of information.
+ * apic_verbosity is defined in apic.c
+ */
+#define apic_printk(v, s, a...) do { \
+ if ((v) <= apic_verbosity) \
+ printk(s, ##a); \
+ } while (0)
+
+
+extern void generic_apic_probe(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern unsigned int apic_verbosity;
+extern int local_apic_timer_c2_ok;
+
+extern int disable_apic;
+/*
+ * Basic functions accessing APICs.
+ */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define setup_boot_clock setup_boot_APIC_clock
+#define setup_secondary_clock setup_secondary_APIC_clock
+#endif
+
+extern int is_vsmp_box(void);
+extern void xapic_wait_icr_idle(void);
+extern u32 safe_xapic_wait_icr_idle(void);
+extern u64 xapic_icr_read(void);
+extern void xapic_icr_write(u32, u32);
+extern int setup_profiling_timer(unsigned int);
+
+static inline void native_apic_mem_write(u32 reg, u32 v)
+{
+ volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
+
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+ ASM_OUTPUT2("=r" (v), "=m" (*addr)),
+ ASM_OUTPUT2("0" (v), "m" (*addr)));
+}
+
+static inline u32 native_apic_mem_read(u32 reg)
+{
+ return *((volatile u32 *)(APIC_BASE + reg));
+}
+
+static inline void native_apic_msr_write(u32 reg, u32 v)
+{
+ if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
+ reg == APIC_LVR)
+ return;
+
+ wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
+}
+
+static inline u32 native_apic_msr_read(u32 reg)
+{
+ u32 low, high;
+
+ if (reg == APIC_DFR)
+ return -1;
+
+ rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
+ return low;
+}
+
+#ifndef CONFIG_X86_32
+extern int x2apic, x2apic_preenabled;
+extern void check_x2apic(void);
+extern void enable_x2apic(void);
+extern void enable_IR_x2apic(void);
+extern void x2apic_icr_write(u32 low, u32 id);
+static inline int x2apic_enabled(void)
+{
+ int msr, msr2;
+
+ if (!cpu_has_x2apic)
+ return 0;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (msr & X2APIC_ENABLE)
+ return 1;
+ return 0;
+}
+#else
+#define x2apic_enabled() 0
+#endif
+
+struct apic_ops {
+ u32 (*read)(u32 reg);
+ void (*write)(u32 reg, u32 v);
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+ void (*wait_icr_idle)(void);
+ u32 (*safe_wait_icr_idle)(void);
+};
+
+extern struct apic_ops *apic_ops;
+
+#define apic_read (apic_ops->read)
+#define apic_write (apic_ops->write)
+#define apic_icr_read (apic_ops->icr_read)
+#define apic_icr_write (apic_ops->icr_write)
+#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
+#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
+
+extern int get_physical_broadcast(void);
+
+#ifdef CONFIG_X86_64
+static inline void ack_x2APIC_irq(void)
+{
+ /* Docs say use 0 for future compatibility */
+ native_apic_msr_write(APIC_EOI, 0);
+}
+#endif
+
+
+static inline void ack_APIC_irq(void)
+{
+ /*
+ * ack_APIC_irq() actually gets compiled as a single instruction
+ * ... yummie.
+ */
+
+ /* Docs say use 0 for future compatibility */
+ apic_write(APIC_EOI, 0);
+}
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void connect_bsp_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern int verify_local_APIC(void);
+extern void cache_APIC_registers(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void setup_local_APIC(void);
+extern void end_local_APIC_setup(void);
+extern void init_apic_mappings(void);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern int APIC_init_uniprocessor(void);
+extern void enable_NMI_through_LVT0(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern void early_init_lapic_mapping(void);
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+ return 0;
+}
+#endif
+
+extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
+extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
+
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok 1
+static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
new file mode 100644
index 000000000000..63134e31e8b9
--- /dev/null
+++ b/arch/x86/include/asm/apicdef.h
@@ -0,0 +1,417 @@
+#ifndef _ASM_X86_APICDEF_H
+#define _ASM_X86_APICDEF_H
+
+/*
+ * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
+ *
+ * Alan Cox <Alan.Cox@linux.org>, 1995.
+ * Ingo Molnar <mingo@redhat.com>, 1999, 2000
+ */
+
+#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+
+#define APIC_ID 0x20
+
+#define APIC_LVR 0x30
+#define APIC_LVR_MASK 0xFF00FF
+#define GET_APIC_VERSION(x) ((x) & 0xFFu)
+#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
+#ifdef CONFIG_X86_32
+# define APIC_INTEGRATED(x) ((x) & 0xF0u)
+#else
+# define APIC_INTEGRATED(x) (1)
+#endif
+#define APIC_XAPIC(x) ((x) >= 0x14)
+#define APIC_TASKPRI 0x80
+#define APIC_TPRI_MASK 0xFFu
+#define APIC_ARBPRI 0x90
+#define APIC_ARBPRI_MASK 0xFFu
+#define APIC_PROCPRI 0xA0
+#define APIC_EOI 0xB0
+#define APIC_EIO_ACK 0x0
+#define APIC_RRR 0xC0
+#define APIC_LDR 0xD0
+#define APIC_LDR_MASK (0xFFu << 24)
+#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu)
+#define SET_APIC_LOGICAL_ID(x) (((x) << 24))
+#define APIC_ALL_CPUS 0xFFu
+#define APIC_DFR 0xE0
+#define APIC_DFR_CLUSTER 0x0FFFFFFFul
+#define APIC_DFR_FLAT 0xFFFFFFFFul
+#define APIC_SPIV 0xF0
+#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
+#define APIC_SPIV_APIC_ENABLED (1 << 8)
+#define APIC_ISR 0x100
+#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
+#define APIC_TMR 0x180
+#define APIC_IRR 0x200
+#define APIC_ESR 0x280
+#define APIC_ESR_SEND_CS 0x00001
+#define APIC_ESR_RECV_CS 0x00002
+#define APIC_ESR_SEND_ACC 0x00004
+#define APIC_ESR_RECV_ACC 0x00008
+#define APIC_ESR_SENDILL 0x00020
+#define APIC_ESR_RECVILL 0x00040
+#define APIC_ESR_ILLREGA 0x00080
+#define APIC_ICR 0x300
+#define APIC_DEST_SELF 0x40000
+#define APIC_DEST_ALLINC 0x80000
+#define APIC_DEST_ALLBUT 0xC0000
+#define APIC_ICR_RR_MASK 0x30000
+#define APIC_ICR_RR_INVALID 0x00000
+#define APIC_ICR_RR_INPROG 0x10000
+#define APIC_ICR_RR_VALID 0x20000
+#define APIC_INT_LEVELTRIG 0x08000
+#define APIC_INT_ASSERT 0x04000
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x00000
+#define APIC_DM_FIXED 0x00000
+#define APIC_DM_LOWEST 0x00100
+#define APIC_DM_SMI 0x00200
+#define APIC_DM_REMRD 0x00300
+#define APIC_DM_NMI 0x00400
+#define APIC_DM_INIT 0x00500
+#define APIC_DM_STARTUP 0x00600
+#define APIC_DM_EXTINT 0x00700
+#define APIC_VECTOR_MASK 0x000FF
+#define APIC_ICR2 0x310
+#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
+#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVTTHMR 0x330
+#define APIC_LVTPC 0x340
+#define APIC_LVT0 0x350
+#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
+#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
+#define SET_APIC_TIMER_BASE(x) (((x) << 18))
+#define APIC_TIMER_BASE_CLKIN 0x0
+#define APIC_TIMER_BASE_TMBASE 0x1
+#define APIC_TIMER_BASE_DIV 0x2
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
+#define APIC_LVT_REMOTE_IRR (1 << 14)
+#define APIC_INPUT_POLARITY (1 << 13)
+#define APIC_SEND_PENDING (1 << 12)
+#define APIC_MODE_MASK 0x700
+#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7)
+#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8))
+#define APIC_MODE_FIXED 0x0
+#define APIC_MODE_NMI 0x4
+#define APIC_MODE_EXTINT 0x7
+#define APIC_LVT1 0x360
+#define APIC_LVTERR 0x370
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
+#define APIC_SELF_IPI 0x3F0
+#define APIC_TDR_DIV_TMBASE (1 << 2)
+#define APIC_TDR_DIV_1 0xB
+#define APIC_TDR_DIV_2 0x0
+#define APIC_TDR_DIV_4 0x1
+#define APIC_TDR_DIV_8 0x2
+#define APIC_TDR_DIV_16 0x3
+#define APIC_TDR_DIV_32 0x8
+#define APIC_TDR_DIV_64 0x9
+#define APIC_TDR_DIV_128 0xA
+#define APIC_EILVT0 0x500
+#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
+#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
+#define APIC_EILVT_MSG_FIX 0x0
+#define APIC_EILVT_MSG_SMI 0x2
+#define APIC_EILVT_MSG_NMI 0x4
+#define APIC_EILVT_MSG_EXT 0x7
+#define APIC_EILVT_MASKED (1 << 16)
+#define APIC_EILVT1 0x510
+#define APIC_EILVT2 0x520
+#define APIC_EILVT3 0x530
+
+#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#define APIC_BASE_MSR 0x800
+#define X2APIC_ENABLE (1UL << 10)
+
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+/*
+ * All x86-64 systems are xAPIC compatible.
+ * In the following, "apicid" is a physical APIC ID.
+ */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
+#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
+
+/*
+ * the local APIC register structure, memory mapped. Not terribly well
+ * tested, but we might eventually use this one in the future - the
+ * problem why we cannot use it right now is the P5 APIC, it has an
+ * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
+ */
+#define u32 unsigned int
+
+struct local_apic {
+
+/*000*/ struct { u32 __reserved[4]; } __reserved_01;
+
+/*010*/ struct { u32 __reserved[4]; } __reserved_02;
+
+/*020*/ struct { /* APIC ID Register */
+ u32 __reserved_1 : 24,
+ phys_apic_id : 4,
+ __reserved_2 : 4;
+ u32 __reserved[3];
+ } id;
+
+/*030*/ const
+ struct { /* APIC Version Register */
+ u32 version : 8,
+ __reserved_1 : 8,
+ max_lvt : 8,
+ __reserved_2 : 8;
+ u32 __reserved[3];
+ } version;
+
+/*040*/ struct { u32 __reserved[4]; } __reserved_03;
+
+/*050*/ struct { u32 __reserved[4]; } __reserved_04;
+
+/*060*/ struct { u32 __reserved[4]; } __reserved_05;
+
+/*070*/ struct { u32 __reserved[4]; } __reserved_06;
+
+/*080*/ struct { /* Task Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } tpr;
+
+/*090*/ const
+ struct { /* Arbitration Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } apr;
+
+/*0A0*/ const
+ struct { /* Processor Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } ppr;
+
+/*0B0*/ struct { /* End Of Interrupt Register */
+ u32 eoi;
+ u32 __reserved[3];
+ } eoi;
+
+/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
+
+/*0D0*/ struct { /* Logical Destination Register */
+ u32 __reserved_1 : 24,
+ logical_dest : 8;
+ u32 __reserved_2[3];
+ } ldr;
+
+/*0E0*/ struct { /* Destination Format Register */
+ u32 __reserved_1 : 28,
+ model : 4;
+ u32 __reserved_2[3];
+ } dfr;
+
+/*0F0*/ struct { /* Spurious Interrupt Vector Register */
+ u32 spurious_vector : 8,
+ apic_enabled : 1,
+ focus_cpu : 1,
+ __reserved_2 : 22;
+ u32 __reserved_3[3];
+ } svr;
+
+/*100*/ struct { /* In Service Register */
+/*170*/ u32 bitfield;
+ u32 __reserved[3];
+ } isr [8];
+
+/*180*/ struct { /* Trigger Mode Register */
+/*1F0*/ u32 bitfield;
+ u32 __reserved[3];
+ } tmr [8];
+
+/*200*/ struct { /* Interrupt Request Register */
+/*270*/ u32 bitfield;
+ u32 __reserved[3];
+ } irr [8];
+
+/*280*/ union { /* Error Status Register */
+ struct {
+ u32 send_cs_error : 1,
+ receive_cs_error : 1,
+ send_accept_error : 1,
+ receive_accept_error : 1,
+ __reserved_1 : 1,
+ send_illegal_vector : 1,
+ receive_illegal_vector : 1,
+ illegal_register_address : 1,
+ __reserved_2 : 24;
+ u32 __reserved_3[3];
+ } error_bits;
+ struct {
+ u32 errors;
+ u32 __reserved_3[3];
+ } all_errors;
+ } esr;
+
+/*290*/ struct { u32 __reserved[4]; } __reserved_08;
+
+/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
+
+/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
+
+/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
+
+/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
+
+/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
+
+/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
+
+/*300*/ struct { /* Interrupt Command Register 1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ destination_mode : 1,
+ delivery_status : 1,
+ __reserved_1 : 1,
+ level : 1,
+ trigger : 1,
+ __reserved_2 : 2,
+ shorthand : 2,
+ __reserved_3 : 12;
+ u32 __reserved_4[3];
+ } icr1;
+
+/*310*/ struct { /* Interrupt Command Register 2 */
+ union {
+ u32 __reserved_1 : 24,
+ phys_dest : 4,
+ __reserved_2 : 4;
+ u32 __reserved_3 : 24,
+ logical_dest : 8;
+ } dest;
+ u32 __reserved_4[3];
+ } icr2;
+
+/*320*/ struct { /* LVT - Timer */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ timer_mode : 1,
+ __reserved_3 : 14;
+ u32 __reserved_4[3];
+ } lvt_timer;
+
+/*330*/ struct { /* LVT - Thermal Sensor */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_thermal;
+
+/*340*/ struct { /* LVT - Performance Counter */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_pc;
+
+/*350*/ struct { /* LVT - LINT0 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint0;
+
+/*360*/ struct { /* LVT - LINT1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint1;
+
+/*370*/ struct { /* LVT - Error */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_error;
+
+/*380*/ struct { /* Timer Initial Count Register */
+ u32 initial_count;
+ u32 __reserved_2[3];
+ } timer_icr;
+
+/*390*/ const
+ struct { /* Timer Current Count Register */
+ u32 curr_count;
+ u32 __reserved_2[3];
+ } timer_ccr;
+
+/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
+
+/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
+
+/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
+
+/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
+
+/*3E0*/ struct { /* Timer Divide Configuration Register */
+ u32 divisor : 4,
+ __reserved_1 : 28;
+ u32 __reserved_2[3];
+ } timer_dcr;
+
+/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
+
+} __attribute__ ((packed));
+
+#undef u32
+
+#ifdef CONFIG_X86_32
+ #define BAD_APICID 0xFFu
+#else
+ #define BAD_APICID 0xFFFFu
+#endif
+#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h
new file mode 100644
index 000000000000..cbd4957838a6
--- /dev/null
+++ b/arch/x86/include/asm/arch_hooks.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_ARCH_HOOKS_H
+#define _ASM_X86_ARCH_HOOKS_H
+
+#include <linux/interrupt.h>
+
+/*
+ * linux/include/asm/arch_hooks.h
+ *
+ * define the architecture specific hooks
+ */
+
+/* these aren't arch hooks, they are generic routines
+ * that can be used by the hooks */
+extern void init_ISA_irqs(void);
+extern irqreturn_t timer_interrupt(int irq, void *dev_id);
+
+/* these are the defined hooks */
+extern void intr_init_hook(void);
+extern void pre_intr_init_hook(void);
+extern void pre_setup_arch_hook(void);
+extern void trap_init_hook(void);
+extern void pre_time_init_hook(void);
+extern void time_init_hook(void);
+extern void mca_nmi_hook(void);
+
+#endif /* _ASM_X86_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
new file mode 100644
index 000000000000..56be78f582f0
--- /dev/null
+++ b/arch/x86/include/asm/asm.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_ASM_H
+#define _ASM_X86_ASM_H
+
+#ifdef __ASSEMBLY__
+# define __ASM_FORM(x) x
+# define __ASM_EX_SEC .section __ex_table
+#else
+# define __ASM_FORM(x) " " #x " "
+# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
+#endif
+
+#ifdef CONFIG_X86_32
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+#else
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+#endif
+
+#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
+#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
+
+#define _ASM_PTR __ASM_SEL(.long, .quad)
+#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
+
+#define _ASM_MOV __ASM_SIZE(mov)
+#define _ASM_INC __ASM_SIZE(inc)
+#define _ASM_DEC __ASM_SIZE(dec)
+#define _ASM_ADD __ASM_SIZE(add)
+#define _ASM_SUB __ASM_SIZE(sub)
+#define _ASM_XADD __ASM_SIZE(xadd)
+
+#define _ASM_AX __ASM_REG(ax)
+#define _ASM_BX __ASM_REG(bx)
+#define _ASM_CX __ASM_REG(cx)
+#define _ASM_DX __ASM_REG(dx)
+#define _ASM_SP __ASM_REG(sp)
+#define _ASM_BP __ASM_REG(bp)
+#define _ASM_SI __ASM_REG(si)
+#define _ASM_DI __ASM_REG(di)
+
+/* Exception table entry */
+# define _ASM_EXTABLE(from,to) \
+ __ASM_EX_SEC \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR #from "," #to "\n" \
+ " .previous\n"
+
+#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
new file mode 100644
index 000000000000..4e1b8873c474
--- /dev/null
+++ b/arch/x86/include/asm/atomic.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "atomic_32.h"
+#else
+# include "atomic_64.h"
+#endif
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
new file mode 100644
index 000000000000..ad5b9f6ecddf
--- /dev/null
+++ b/arch/x86/include/asm/atomic_32.h
@@ -0,0 +1,259 @@
+#ifndef _ASM_X86_ATOMIC_32_H
+#define _ASM_X86_ATOMIC_32_H
+
+#include <linux/compiler.h>
+#include <asm/processor.h>
+#include <asm/cmpxchg.h>
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+#define atomic_read(v) ((v)->counter)
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_sub_and_test(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_inc_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @v: pointer of type atomic_t
+ * @i: integer value to add
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
+ : "+m" (v->counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * atomic_add_return - add integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to add
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ int __i;
+#ifdef CONFIG_M386
+ unsigned long flags;
+ if (unlikely(boot_cpu_data.x86 <= 3))
+ goto no_xadd;
+#endif
+ /* Modern 486+ processor */
+ __i = i;
+ asm volatile(LOCK_PREFIX "xaddl %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+ local_irq_save(flags);
+ __i = atomic_read(v);
+ atomic_set(v, i + __i);
+ local_irq_restore(flags);
+ return i + __i;
+#endif
+}
+
+/**
+ * atomic_sub_return - subtract integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to subtract
+ *
+ * Atomically subtracts @i from @v and returns @v - @i
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+
+/**
+ * atomic_add_unless - add unless the number is already a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as @v was not already @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+/* These are x86-specific, used by some header files */
+#define atomic_clear_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "andl %0,%1" \
+ : : "r" (~(mask)), "m" (*(addr)) : "memory")
+
+#define atomic_set_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "orl %0,%1" \
+ : : "r" (mask), "m" (*(addr)) : "memory")
+
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() barrier()
+
+#include <asm-generic/atomic.h>
+#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
new file mode 100644
index 000000000000..279d2a731f3f
--- /dev/null
+++ b/arch/x86/include/asm/atomic_64.h
@@ -0,0 +1,473 @@
+#ifndef _ASM_X86_ATOMIC_64_H
+#define _ASM_X86_ATOMIC_64_H
+
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/* atomic_t should be 32 bit signed type */
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+/*
+ * Make sure gcc doesn't try to be clever and move things around
+ * on us. We need to use _exactly_ the address the user gave us,
+ * not some alias that contains the same information.
+ */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+#define atomic_read(v) ((v)->counter)
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "=m" (v->counter)
+ : "ir" (i), "m" (v->counter));
+}
+
+/**
+ * atomic_sub - subtract the atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "=m" (v->counter)
+ : "ir" (i), "m" (v->counter));
+}
+
+/**
+ * atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_sub_and_test(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "ir" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic_inc_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incl %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic_add_negative(int i, atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "ir" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ int __i = i;
+ asm volatile(LOCK_PREFIX "xaddl %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+/* An 64bit atomic type */
+
+typedef struct {
+ long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(i) { (i) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define atomic64_read(v) ((v)->counter)
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+#define atomic64_set(v, i) (((v)->counter) = (i))
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline void atomic64_add(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "addq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void atomic64_sub(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "subq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter));
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_sub_and_test(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic64_inc(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "incq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void atomic64_dec(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "decq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter));
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "incq %0; sete %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "m" (v->counter) : "memory");
+ return c != 0;
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int atomic64_add_negative(long i, atomic64_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
+ : "=m" (v->counter), "=qm" (c)
+ : "er" (i), "m" (v->counter) : "memory");
+ return c;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline long atomic64_add_return(long i, atomic64_t *v)
+{
+ long __i = i;
+ asm volatile(LOCK_PREFIX "xaddq %0, %1;"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline long atomic64_sub_return(long i, atomic64_t *v)
+{
+ return atomic64_add_return(-i, v);
+}
+
+#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
+#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
+
+#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+
+#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
+#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
+
+/**
+ * atomic_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int c, old;
+ c = atomic_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
+{
+ long c, old;
+ c = atomic64_read(v);
+ for (;;) {
+ if (unlikely(c == (u)))
+ break;
+ old = atomic64_cmpxchg((v), c, c + (a));
+ if (likely(old == c))
+ break;
+ c = old;
+ }
+ return c != (u);
+}
+
+/**
+ * atomic_inc_short - increment of a short integer
+ * @v: pointer to type int
+ *
+ * Atomically adds 1 to @v
+ * Returns the new value of @u
+ */
+static inline short int atomic_inc_short(short int *v)
+{
+ asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
+ return *v;
+}
+
+/**
+ * atomic_or_long - OR of two long integers
+ * @v1: pointer to type unsigned long
+ * @v2: pointer to type unsigned long
+ *
+ * Atomically ORs @v1 and @v2
+ * Returns the result of the OR
+ */
+static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
+{
+ asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
+}
+
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
+
+/* These are x86-specific, used by some header files */
+#define atomic_clear_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "andl %0,%1" \
+ : : "r" (~(mask)), "m" (*(addr)) : "memory")
+
+#define atomic_set_mask(mask, addr) \
+ asm volatile(LOCK_PREFIX "orl %0,%1" \
+ : : "r" ((unsigned)(mask)), "m" (*(addr)) \
+ : "memory")
+
+/* Atomic operations are already serializing on x86 */
+#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() barrier()
+
+#include <asm-generic/atomic.h>
+#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
new file mode 100644
index 000000000000..1316b4c35425
--- /dev/null
+++ b/arch/x86/include/asm/auxvec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_AUXVEC_H
+#define _ASM_X86_AUXVEC_H
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#ifdef __i386__
+#define AT_SYSINFO 32
+#endif
+#define AT_SYSINFO_EHDR 33
+
+#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
new file mode 100644
index 000000000000..ce547f24a1cd
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -0,0 +1,137 @@
+#ifndef __ASM_MACH_APIC_H
+#define __ASM_MACH_APIC_H
+
+#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
+#define esr_disable (1)
+
+static inline int apic_id_registered(void)
+{
+ return (1);
+}
+
+static inline cpumask_t target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_map;
+#else
+ return cpumask_of_cpu(0);
+#endif
+}
+
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL 0
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE (dest_Fixed)
+#define INT_DEST_MODE (0) /* phys delivery to target proc */
+#define NO_BALANCE_IRQ (0)
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return (0);
+}
+
+static inline unsigned long check_apicid_present(int bit)
+{
+ return (1);
+}
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long val, id;
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ id = xapic_phys_to_log_apicid(cpu);
+ val |= SET_APIC_LOGICAL_ID(id);
+ return val;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Physflat", nr_ioapics);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return (0);
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return apicid_2_node[hard_smp_processor_id()];
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+extern u8 cpu_2_logical_apicid[];
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return cpu_physical_id(cpu);
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xFFL);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return (1);
+}
+
+/* As we are using single CPU as destination, pick only one CPU here */
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+ int apicid;
+
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ return apicid;
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_MACH_APIC_H */
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
new file mode 100644
index 000000000000..392c3f5ef2fe
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_MACH_APICDEF_H
+#define __ASM_MACH_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0xFF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
new file mode 100644
index 000000000000..9404c535b7ec
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_MACH_IPI_H
+#define __ASM_MACH_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
new file mode 100644
index 000000000000..3c7521063d3f
--- /dev/null
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_BIOS_EBDA_H
+#define _ASM_X86_BIOS_EBDA_H
+
+#include <asm/io.h>
+
+/*
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
+static inline unsigned int get_bios_ebda(void)
+{
+ unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
+ address <<= 4;
+ return address; /* 0 means none */
+}
+
+void reserve_ebda_region(void);
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+/*
+ * This is obviously not a great place for this, but we want to be
+ * able to scatter it around anywhere in the kernel.
+ */
+void check_for_bios_corruption(void);
+void start_periodic_check_for_corruption(void);
+#else
+static inline void check_for_bios_corruption(void)
+{
+}
+
+static inline void start_periodic_check_for_corruption(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_BIOS_EBDA_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
new file mode 100644
index 000000000000..9fa9dcdf344b
--- /dev/null
+++ b/arch/x86/include/asm/bitops.h
@@ -0,0 +1,459 @@
+#ifndef _ASM_X86_BITOPS_H
+#define _ASM_X86_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <linux/compiler.h>
+#include <asm/alternative.h>
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+ versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR BITOP_ADDR(addr)
+
+/*
+ * We do the locked ops that don't return the old value as
+ * a mask operation on a byte.
+ */
+#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
+#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK(nr) (1 << ((nr) & 7))
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void set_bit(unsigned int nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "orb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr))
+ : "memory");
+ } else {
+ asm volatile(LOCK_PREFIX "bts %1,%0"
+ : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
+ }
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "andb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)~CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX "btr %1,%0"
+ : BITOP_ADDR(addr)
+ : "Ir" (nr));
+ }
+}
+
+/*
+ * clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+ barrier();
+ clear_bit(nr, addr);
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+}
+
+/*
+ * __clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit() is non-atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock if no other CPUs can concurrently
+ * modify other bits in the word.
+ *
+ * No memory barrier is required here, because x86 cannot reorder stores past
+ * older loads. Same principle as spin_unlock.
+ */
+static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
+{
+ barrier();
+ __clear_bit(nr, addr);
+}
+
+#define smp_mb__before_clear_bit() barrier()
+#define smp_mb__after_clear_bit() barrier()
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "xorb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX "btc %1,%0"
+ : BITOP_ADDR(addr)
+ : "Ir" (nr));
+ }
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
+ "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on x86.
+ */
+static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
+{
+ return test_and_set_bit(nr, addr);
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm("bts %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr));
+ return oldbit;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("btr %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr));
+ return oldbit;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("btc %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR
+ : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+{
+ return ((1UL << (nr % BITS_PER_LONG)) &
+ (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("bt %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int test_bit(int nr, const volatile unsigned long *addr);
+#endif
+
+#define test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? constant_test_bit((nr), (addr)) \
+ : variable_test_bit((nr), (addr)))
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ asm("bsf %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static inline unsigned long ffz(unsigned long word)
+{
+ asm("bsf %1,%0"
+ : "=r" (word)
+ : "r" (~word));
+ return word;
+}
+
+/*
+ * __fls: find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ asm("bsr %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ asm("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=r" (r) : "rm" (x), "r" (-1));
+#else
+ asm("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+
+/**
+ * fls - find last set bit in word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static inline int fls(int x)
+{
+ int r;
+#ifdef CONFIG_X86_CMOV
+ asm("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ asm("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+#endif /* __KERNEL__ */
+
+#undef ADDR
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/sched.h>
+
+#define ARCH_HAS_FAST_MULTIPLIER 1
+
+#include <asm-generic/bitops/hweight.h>
+
+#endif /* __KERNEL__ */
+
+#include <asm-generic/bitops/fls64.h>
+
+#ifdef __KERNEL__
+
+#include <asm-generic/bitops/ext2-non-atomic.h>
+
+#define ext2_set_bit_atomic(lock, nr, addr) \
+ test_and_set_bit((nr), (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+ test_and_clear_bit((nr), (unsigned long *)(addr))
+
+#include <asm-generic/bitops/minix.h>
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
new file mode 100644
index 000000000000..dd61616cb73d
--- /dev/null
+++ b/arch/x86/include/asm/boot.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_BOOT_H
+#define _ASM_X86_BOOT_H
+
+/* Don't touch these, unless you really know what you're doing. */
+#define DEF_SYSSEG 0x1000
+#define DEF_SYSSIZE 0x7F00
+
+/* Internal svga startup constants */
+#define NORMAL_VGA 0xffff /* 80x25 mode */
+#define EXTENDED_VGA 0xfffe /* 80x50 mode */
+#define ASK_VGA 0xfffd /* ask for it at bootup */
+
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
+
+#ifdef CONFIG_X86_64
+#define BOOT_HEAP_SIZE 0x7000
+#define BOOT_STACK_SIZE 0x4000
+#else
+#define BOOT_HEAP_SIZE 0x4000
+#define BOOT_STACK_SIZE 0x1000
+#endif
+
+#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
new file mode 100644
index 000000000000..433adaebf9b6
--- /dev/null
+++ b/arch/x86/include/asm/bootparam.h
@@ -0,0 +1,111 @@
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+#include <linux/types.h>
+#include <linux/screen_info.h>
+#include <linux/apm_bios.h>
+#include <linux/edd.h>
+#include <asm/e820.h>
+#include <asm/ist.h>
+#include <video/edid.h>
+
+/* setup data types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+
+/* extensible setup data list node */
+struct setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[0];
+};
+
+struct setup_header {
+ __u8 setup_sects;
+ __u16 root_flags;
+ __u32 syssize;
+ __u16 ram_size;
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+ __u16 vid_mode;
+ __u16 root_dev;
+ __u16 boot_flag;
+ __u16 jump;
+ __u32 header;
+ __u16 version;
+ __u32 realmode_swtch;
+ __u16 start_sys;
+ __u16 kernel_version;
+ __u8 type_of_loader;
+ __u8 loadflags;
+#define LOADED_HIGH (1<<0)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS (1<<6)
+#define CAN_USE_HEAP (1<<7)
+ __u16 setup_move_size;
+ __u32 code32_start;
+ __u32 ramdisk_image;
+ __u32 ramdisk_size;
+ __u32 bootsect_kludge;
+ __u16 heap_end_ptr;
+ __u16 _pad1;
+ __u32 cmd_line_ptr;
+ __u32 initrd_addr_max;
+ __u32 kernel_alignment;
+ __u8 relocatable_kernel;
+ __u8 _pad2[3];
+ __u32 cmdline_size;
+ __u32 hardware_subarch;
+ __u64 hardware_subarch_data;
+ __u32 payload_offset;
+ __u32 payload_length;
+ __u64 setup_data;
+} __attribute__((packed));
+
+struct sys_desc_table {
+ __u16 length;
+ __u8 table[14];
+};
+
+struct efi_info {
+ __u32 efi_loader_signature;
+ __u32 efi_systab;
+ __u32 efi_memdesc_size;
+ __u32 efi_memdesc_version;
+ __u32 efi_memmap;
+ __u32 efi_memmap_size;
+ __u32 efi_systab_hi;
+ __u32 efi_memmap_hi;
+};
+
+/* The so-called "zeropage" */
+struct boot_params {
+ struct screen_info screen_info; /* 0x000 */
+ struct apm_bios_info apm_bios_info; /* 0x040 */
+ __u8 _pad2[12]; /* 0x054 */
+ struct ist_info ist_info; /* 0x060 */
+ __u8 _pad3[16]; /* 0x070 */
+ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
+ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
+ struct sys_desc_table sys_desc_table; /* 0x0a0 */
+ __u8 _pad4[144]; /* 0x0b0 */
+ struct edid_info edid_info; /* 0x140 */
+ struct efi_info efi_info; /* 0x1c0 */
+ __u32 alt_mem_k; /* 0x1e0 */
+ __u32 scratch; /* Scratch field! */ /* 0x1e4 */
+ __u8 e820_entries; /* 0x1e8 */
+ __u8 eddbuf_entries; /* 0x1e9 */
+ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
+ __u8 _pad6[6]; /* 0x1eb */
+ struct setup_header hdr; /* setup header */ /* 0x1f1 */
+ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
+ __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
+ struct e820entry e820_map[E820MAX]; /* 0x2d0 */
+ __u8 _pad8[48]; /* 0xcd0 */
+ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
+ __u8 _pad9[276]; /* 0xeec */
+} __attribute__((packed));
+
+#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
new file mode 100644
index 000000000000..d9cf1cd156d2
--- /dev/null
+++ b/arch/x86/include/asm/bug.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_BUG_H
+#define _ASM_X86_BUG_H
+
+#ifdef CONFIG_BUG
+#define HAVE_ARCH_BUG
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#ifdef CONFIG_X86_32
+# define __BUG_C0 "2:\t.long 1b, %c0\n"
+#else
+# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
+#endif
+
+#define BUG() \
+do { \
+ asm volatile("1:\tud2\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ __BUG_C0 \
+ "\t.word %c1, 0\n" \
+ "\t.org 2b+%c2\n" \
+ ".popsection" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (sizeof(struct bug_entry))); \
+ for (;;) ; \
+} while (0)
+
+#else
+#define BUG() \
+do { \
+ asm volatile("ud2"); \
+ for (;;) ; \
+} while (0)
+#endif
+
+#endif /* !CONFIG_BUG */
+
+#include <asm-generic/bug.h>
+#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
new file mode 100644
index 000000000000..08abf639075f
--- /dev/null
+++ b/arch/x86/include/asm/bugs.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_BUGS_H
+#define _ASM_X86_BUGS_H
+
+extern void check_bugs(void);
+
+#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
+int ppro_with_ram_bug(void);
+#else
+static inline int ppro_with_ram_bug(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
new file mode 100644
index 000000000000..f110ad417df3
--- /dev/null
+++ b/arch/x86/include/asm/byteorder.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_BYTEORDER_H
+#define _ASM_X86_BYTEORDER_H
+
+#include <asm/types.h>
+#include <linux/compiler.h>
+
+#define __LITTLE_ENDIAN
+
+static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
+{
+#ifdef __i386__
+# ifdef CONFIG_X86_BSWAP
+ asm("bswap %0" : "=r" (val) : "0" (val));
+# else
+ asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
+ "rorl $16,%0\n\t" /* swap words */
+ "xchgb %b0,%h0" /* swap higher bytes */
+ : "=q" (val)
+ : "0" (val));
+# endif
+
+#else /* __i386__ */
+ asm("bswapl %0"
+ : "=r" (val)
+ : "0" (val));
+#endif
+ return val;
+}
+#define __arch_swab32 __arch_swab32
+
+static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
+{
+#ifdef __i386__
+ union {
+ struct {
+ __u32 a;
+ __u32 b;
+ } s;
+ __u64 u;
+ } v;
+ v.u = val;
+# ifdef CONFIG_X86_BSWAP
+ asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+# else
+ v.s.a = __arch_swab32(v.s.a);
+ v.s.b = __arch_swab32(v.s.b);
+ asm("xchgl %0,%1"
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+# endif
+ return v.u;
+#else /* __i386__ */
+ asm("bswapq %0"
+ : "=r" (val)
+ : "0" (val));
+ return val;
+#endif
+}
+#define __arch_swab64 __arch_swab64
+
+#include <linux/byteorder.h>
+
+#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
new file mode 100644
index 000000000000..5d367caa0e36
--- /dev/null
+++ b/arch/x86/include/asm/cache.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_CACHE_H
+#define _ASM_X86_CACHE_H
+
+/* L1 cache line size */
+#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
+#ifdef CONFIG_X86_VSMP
+/* vSMP Internode cacheline shift */
+#define INTERNODE_CACHE_SHIFT (12)
+#ifdef CONFIG_SMP
+#define __cacheline_aligned_in_smp \
+ __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
+ __attribute__((__section__(".data.page_aligned")))
+#endif
+#endif
+
+#endif /* _ASM_X86_CACHE_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
new file mode 100644
index 000000000000..2f8466540fb5
--- /dev/null
+++ b/arch/x86/include/asm/cacheflush.h
@@ -0,0 +1,118 @@
+#ifndef _ASM_X86_CACHEFLUSH_H
+#define _ASM_X86_CACHEFLUSH_H
+
+/* Keep includes the same across arches. */
+#include <linux/mm.h>
+
+/* Caches aren't brain-dead on the intel. */
+#define flush_cache_all() do { } while (0)
+#define flush_cache_mm(mm) do { } while (0)
+#define flush_cache_dup_mm(mm) do { } while (0)
+#define flush_cache_range(vma, start, end) do { } while (0)
+#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
+#define flush_dcache_page(page) do { } while (0)
+#define flush_dcache_mmap_lock(mapping) do { } while (0)
+#define flush_dcache_mmap_unlock(mapping) do { } while (0)
+#define flush_icache_range(start, end) do { } while (0)
+#define flush_icache_page(vma, pg) do { } while (0)
+#define flush_icache_user_range(vma, pg, adr, len) do { } while (0)
+#define flush_cache_vmap(start, end) do { } while (0)
+#define flush_cache_vunmap(start, end) do { } while (0)
+
+#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
+ memcpy((dst), (src), (len))
+#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
+ memcpy((dst), (src), (len))
+
+#define PG_non_WB PG_arch_1
+PAGEFLAG(NonWB, non_WB)
+
+/*
+ * The set_memory_* API can be used to change various attributes of a virtual
+ * address range. The attributes include:
+ * Cachability : UnCached, WriteCombining, WriteBack
+ * Executability : eXeutable, NoteXecutable
+ * Read/Write : ReadOnly, ReadWrite
+ * Presence : NotPresent
+ *
+ * Within a catagory, the attributes are mutually exclusive.
+ *
+ * The implementation of this API will take care of various aspects that
+ * are associated with changing such attributes, such as:
+ * - Flushing TLBs
+ * - Flushing CPU caches
+ * - Making sure aliases of the memory behind the mapping don't violate
+ * coherency rules as defined by the CPU in the system.
+ *
+ * What this API does not do:
+ * - Provide exclusion between various callers - including callers that
+ * operation on other mappings of the same physical page
+ * - Restore default attributes when a page is freed
+ * - Guarantee that mappings other than the requested one are
+ * in any state, other than that these do not violate rules for
+ * the CPU you have. Do not depend on any effects on other mappings,
+ * CPUs other than the one you have may have more relaxed rules.
+ * The caller is required to take care of these.
+ */
+
+int _set_memory_uc(unsigned long addr, int numpages);
+int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wb(unsigned long addr, int numpages);
+int set_memory_uc(unsigned long addr, int numpages);
+int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wb(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_np(unsigned long addr, int numpages);
+int set_memory_4k(unsigned long addr, int numpages);
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray);
+int set_memory_array_wb(unsigned long *addr, int addrinarray);
+
+/*
+ * For legacy compatibility with the old APIs, a few functions
+ * are provided that work on a "struct page".
+ * These functions operate ONLY on the 1:1 kernel mapping of the
+ * memory that the struct page represents, and internally just
+ * call the set_memory_* function. See the description of the
+ * set_memory_* function for more details on conventions.
+ *
+ * These APIs should be considered *deprecated* and are likely going to
+ * be removed in the future.
+ * The reason for this is the implicit operation on the 1:1 mapping only,
+ * making this not a generally useful API.
+ *
+ * Specifically, many users of the old APIs had a virtual address,
+ * called virt_to_page() or vmalloc_to_page() on that address to
+ * get a struct page* that the old API required.
+ * To convert these cases, use set_memory_*() on the original
+ * virtual address, do not use these functions.
+ */
+
+int set_pages_uc(struct page *page, int numpages);
+int set_pages_wb(struct page *page, int numpages);
+int set_pages_x(struct page *page, int numpages);
+int set_pages_nx(struct page *page, int numpages);
+int set_pages_ro(struct page *page, int numpages);
+int set_pages_rw(struct page *page, int numpages);
+
+
+void clflush_cache_range(void *addr, unsigned int size);
+
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+extern const int rodata_test_data;
+#endif
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+int rodata_test(void);
+#else
+static inline int rodata_test(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
new file mode 100644
index 000000000000..b03bedb62aa7
--- /dev/null
+++ b/arch/x86/include/asm/calgary.h
@@ -0,0 +1,72 @@
+/*
+ * Derived from include/asm-powerpc/iommu.h
+ *
+ * Copyright IBM Corporation, 2006-2007
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_CALGARY_H
+#define _ASM_X86_CALGARY_H
+
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <asm/types.h>
+
+struct iommu_table {
+ struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
+ unsigned long it_base; /* mapped address of tce table */
+ unsigned long it_hint; /* Hint for next alloc */
+ unsigned long *it_map; /* A simple allocation bitmap for now */
+ void __iomem *bbar; /* Bridge BAR */
+ u64 tar_val; /* Table Address Register */
+ struct timer_list watchdog_timer;
+ spinlock_t it_lock; /* Protects it_map */
+ unsigned int it_size; /* Size of iommu table in entries */
+ unsigned char it_busno; /* Bus number this table belongs to */
+};
+
+struct cal_chipset_ops {
+ void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
+ void (*tce_cache_blast)(struct iommu_table *tbl);
+ void (*dump_error_regs)(struct iommu_table *tbl);
+};
+
+#define TCE_TABLE_SIZE_UNSPECIFIED ~0
+#define TCE_TABLE_SIZE_64K 0
+#define TCE_TABLE_SIZE_128K 1
+#define TCE_TABLE_SIZE_256K 2
+#define TCE_TABLE_SIZE_512K 3
+#define TCE_TABLE_SIZE_1M 4
+#define TCE_TABLE_SIZE_2M 5
+#define TCE_TABLE_SIZE_4M 6
+#define TCE_TABLE_SIZE_8M 7
+
+extern int use_calgary;
+
+#ifdef CONFIG_CALGARY_IOMMU
+extern int calgary_iommu_init(void);
+extern void detect_calgary(void);
+#else
+static inline int calgary_iommu_init(void) { return 1; }
+static inline void detect_calgary(void) { return; }
+#endif
+
+#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
new file mode 100644
index 000000000000..2bc162e0ec6e
--- /dev/null
+++ b/arch/x86/include/asm/calling.h
@@ -0,0 +1,170 @@
+/*
+ * Some macros to handle stack frames in assembly.
+ */
+
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+
+/* arguments: interrupts/non tracing syscalls only save upto here*/
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+#define ORIG_RAX 120 /* + error_code */
+/* end of arguments */
+
+/* cpu exception frame or undefined in case of fast syscall. */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+
+#define ARGOFFSET R11
+#define SWFRAME ORIG_RAX
+
+ .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+ subq $9*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET 9*8+\addskip
+ movq %rdi, 8*8(%rsp)
+ CFI_REL_OFFSET rdi, 8*8
+ movq %rsi, 7*8(%rsp)
+ CFI_REL_OFFSET rsi, 7*8
+ movq %rdx, 6*8(%rsp)
+ CFI_REL_OFFSET rdx, 6*8
+ .if \norcx
+ .else
+ movq %rcx, 5*8(%rsp)
+ CFI_REL_OFFSET rcx, 5*8
+ .endif
+ movq %rax, 4*8(%rsp)
+ CFI_REL_OFFSET rax, 4*8
+ .if \nor891011
+ .else
+ movq %r8, 3*8(%rsp)
+ CFI_REL_OFFSET r8, 3*8
+ movq %r9, 2*8(%rsp)
+ CFI_REL_OFFSET r9, 2*8
+ movq %r10, 1*8(%rsp)
+ CFI_REL_OFFSET r10, 1*8
+ movq %r11, (%rsp)
+ CFI_REL_OFFSET r11, 0*8
+ .endif
+ .endm
+
+#define ARG_SKIP 9*8
+
+ .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
+ skipr8910=0, skiprdx=0
+ .if \skipr11
+ .else
+ movq (%rsp), %r11
+ CFI_RESTORE r11
+ .endif
+ .if \skipr8910
+ .else
+ movq 1*8(%rsp), %r10
+ CFI_RESTORE r10
+ movq 2*8(%rsp), %r9
+ CFI_RESTORE r9
+ movq 3*8(%rsp), %r8
+ CFI_RESTORE r8
+ .endif
+ .if \skiprax
+ .else
+ movq 4*8(%rsp), %rax
+ CFI_RESTORE rax
+ .endif
+ .if \skiprcx
+ .else
+ movq 5*8(%rsp), %rcx
+ CFI_RESTORE rcx
+ .endif
+ .if \skiprdx
+ .else
+ movq 6*8(%rsp), %rdx
+ CFI_RESTORE rdx
+ .endif
+ movq 7*8(%rsp), %rsi
+ CFI_RESTORE rsi
+ movq 8*8(%rsp), %rdi
+ CFI_RESTORE rdi
+ .if ARG_SKIP+\addskip > 0
+ addq $ARG_SKIP+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
+ .endif
+ .endm
+
+ .macro LOAD_ARGS offset, skiprax=0
+ movq \offset(%rsp), %r11
+ movq \offset+8(%rsp), %r10
+ movq \offset+16(%rsp), %r9
+ movq \offset+24(%rsp), %r8
+ movq \offset+40(%rsp), %rcx
+ movq \offset+48(%rsp), %rdx
+ movq \offset+56(%rsp), %rsi
+ movq \offset+64(%rsp), %rdi
+ .if \skiprax
+ .else
+ movq \offset+72(%rsp), %rax
+ .endif
+ .endm
+
+#define REST_SKIP 6*8
+
+ .macro SAVE_REST
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rbx, 5*8(%rsp)
+ CFI_REL_OFFSET rbx, 5*8
+ movq %rbp, 4*8(%rsp)
+ CFI_REL_OFFSET rbp, 4*8
+ movq %r12, 3*8(%rsp)
+ CFI_REL_OFFSET r12, 3*8
+ movq %r13, 2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
+ movq %r14, 1*8(%rsp)
+ CFI_REL_OFFSET r14, 1*8
+ movq %r15, (%rsp)
+ CFI_REL_OFFSET r15, 0*8
+ .endm
+
+ .macro RESTORE_REST
+ movq (%rsp), %r15
+ CFI_RESTORE r15
+ movq 1*8(%rsp), %r14
+ CFI_RESTORE r14
+ movq 2*8(%rsp), %r13
+ CFI_RESTORE r13
+ movq 3*8(%rsp), %r12
+ CFI_RESTORE r12
+ movq 4*8(%rsp), %rbp
+ CFI_RESTORE rbp
+ movq 5*8(%rsp), %rbx
+ CFI_RESTORE rbx
+ addq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
+ .endm
+
+ .macro SAVE_ALL
+ SAVE_ARGS
+ SAVE_REST
+ .endm
+
+ .macro RESTORE_ALL addskip=0
+ RESTORE_REST
+ RESTORE_ARGS 0, \addskip
+ .endm
+
+ .macro icebp
+ .byte 0xf1
+ .endm
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
new file mode 100644
index 000000000000..848850fd7d62
--- /dev/null
+++ b/arch/x86/include/asm/checksum.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "checksum_32.h"
+#else
+# include "checksum_64.h"
+#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
new file mode 100644
index 000000000000..7c5ef8b14d92
--- /dev/null
+++ b/arch/x86/include/asm/checksum_32.h
@@ -0,0 +1,189 @@
+#ifndef _ASM_X86_CHECKSUM_32_H
+#define _ASM_X86_CHECKSUM_32_H
+
+#include <linux/in6.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here even more important to align src and dst on a 32-bit (or even
+ * better 64-bit) boundary
+ */
+
+asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+/*
+ * Note: when you get a NULL pointer exception here this means someone
+ * passed in an incorrect kernel address to one of these functions.
+ *
+ * If you use these functions directly please don't forget the
+ * access_ok().
+ */
+static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum)
+{
+ return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+}
+
+static inline __wsum csum_partial_copy_from_user(const void __user *src,
+ void *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ might_sleep();
+ return csum_partial_copy_generic((__force void *)src, dst,
+ len, sum, err_ptr, NULL);
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm volatile("movl (%1), %0 ;\n"
+ "subl $4, %2 ;\n"
+ "jbe 2f ;\n"
+ "addl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0;\n"
+ "1: adcl 16(%1), %0 ;\n"
+ "lea 4(%1), %1 ;\n"
+ "decl %2 ;\n"
+ "jne 1b ;\n"
+ "adcl $0, %0 ;\n"
+ "movl %0, %2 ;\n"
+ "shrl $16, %0 ;\n"
+ "addw %w2, %w0 ;\n"
+ "adcl $0, %0 ;\n"
+ "notl %0 ;\n"
+ "2: ;\n"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/*
+ * Fold a partial checksum
+ */
+
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl $0xffff, %0 ;\n"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto,
+ __wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl %2, %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=r" (sum)
+ : "g" (daddr), "g"(saddr),
+ "g" ((len + proto) << 8), "0" (sum));
+ return sum;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold(csum_partial(buff, len, 0));
+}
+
+#define _HAVE_ARCH_IPV6_CSUM
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, unsigned short proto,
+ __wsum sum)
+{
+ asm("addl 0(%1), %0 ;\n"
+ "adcl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0 ;\n"
+ "adcl 0(%2), %0 ;\n"
+ "adcl 4(%2), %0 ;\n"
+ "adcl 8(%2), %0 ;\n"
+ "adcl 12(%2), %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl %4, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=&r" (sum)
+ : "r" (saddr), "r" (daddr),
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
+
+ return csum_fold(sum);
+}
+
+/*
+ * Copy and checksum to user
+ */
+#define HAVE_CSUM_COPY_USER
+static inline __wsum csum_and_copy_to_user(const void *src,
+ void __user *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ might_sleep();
+ if (access_ok(VERIFY_WRITE, dst, len))
+ return csum_partial_copy_generic(src, (__force void *)dst,
+ len, sum, NULL, err_ptr);
+
+ if (len)
+ *err_ptr = -EFAULT;
+
+ return (__force __wsum)-1; /* invalid checksum */
+}
+
+#endif /* _ASM_X86_CHECKSUM_32_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
new file mode 100644
index 000000000000..9bfdc41629ec
--- /dev/null
+++ b/arch/x86/include/asm/checksum_64.h
@@ -0,0 +1,191 @@
+#ifndef _ASM_X86_CHECKSUM_64_H
+#define _ASM_X86_CHECKSUM_64_H
+
+/*
+ * Checksums for x86-64
+ * Copyright 2002 by Andi Kleen, SuSE Labs
+ * with some code from asm-x86/checksum.h
+ */
+
+#include <linux/compiler.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+/**
+ * csum_fold - Fold and invert a 32bit checksum.
+ * sum: 32bit unfolded sum
+ *
+ * Fold a 32bit running checksum to 16bit and invert it. This is usually
+ * the last step before putting a checksum into a packet.
+ * Make sure not to mix with 64bit checksums.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm(" addl %1,%0\n"
+ " adcl $0xffff,%0"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+
+/**
+ * ip_fast_csum - Compute the IPv4 header checksum efficiently.
+ * iph: ipv4 header
+ * ihl: length of header / 4
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm(" movl (%1), %0\n"
+ " subl $4, %2\n"
+ " jbe 2f\n"
+ " addl 4(%1), %0\n"
+ " adcl 8(%1), %0\n"
+ " adcl 12(%1), %0\n"
+ "1: adcl 16(%1), %0\n"
+ " lea 4(%1), %1\n"
+ " decl %2\n"
+ " jne 1b\n"
+ " adcl $0, %0\n"
+ " movl %0, %2\n"
+ " shrl $16, %0\n"
+ " addw %w2, %w0\n"
+ " adcl $0, %0\n"
+ " notl %0\n"
+ "2:"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/**
+ * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the pseudo header checksum the input data. Result is
+ * 32bit unfolded.
+ */
+static inline __wsum
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ asm(" addl %1, %0\n"
+ " adcl %2, %0\n"
+ " adcl %3, %0\n"
+ " adcl $0, %0\n"
+ : "=r" (sum)
+ : "g" (daddr), "g" (saddr),
+ "g" ((len + proto)<<8), "0" (sum));
+ return sum;
+}
+
+
+/**
+ * csum_tcpup_magic - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 16bit pseudo header checksum the input data already
+ * complemented and ready to be filled in.
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+/**
+ * csum_partial - Compute an internet checksum.
+ * @buff: buffer to be checksummed
+ * @len: length of buffer.
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 32bit unfolded internet checksum of the buffer.
+ * Before filling it in it needs to be csum_fold()'ed.
+ * buff should be aligned to a 64bit boundary if possible.
+ */
+extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
+#define HAVE_CSUM_COPY_USER 1
+
+
+/* Do not call this directly. Use the wrappers below */
+extern __wsum csum_partial_copy_generic(const void *src, const void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+
+extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum);
+
+/* Old names. To be removed. */
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#define csum_and_copy_from_user csum_partial_copy_from_user
+
+/**
+ * ip_compute_csum - Compute an 16bit IP checksum.
+ * @buff: buffer address.
+ * @len: length of buffer.
+ *
+ * Returns the 16bit folded/inverted checksum of the passed buffer.
+ * Ready to fill in.
+ */
+extern __sum16 ip_compute_csum(const void *buff, int len);
+
+/**
+ * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: protocol of packet
+ * @sum: initial sum (32bit unfolded) to be added in
+ *
+ * Computes an IPv6 pseudo header checksum. This sum is added the checksum
+ * into UDP/TCP packets and contains some link layer information.
+ * Returns the unfolded 32bit checksum.
+ */
+
+struct in6_addr;
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+extern __sum16
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+ __u32 len, unsigned short proto, __wsum sum);
+
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+ asm("addl %2,%0\n\t"
+ "adcl $0,%0"
+ : "=r" (a)
+ : "0" (a), "r" (b));
+ return a;
+}
+
+#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..a460fa088d4c
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "cmpxchg_32.h"
+#else
+# include "cmpxchg_64.h"
+#endif
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
new file mode 100644
index 000000000000..82ceb788a981
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -0,0 +1,344 @@
+#ifndef _ASM_X86_CMPXCHG_32_H
+#define _ASM_X86_CMPXCHG_32_H
+
+#include <linux/bitops.h> /* for LOCK_PREFIX */
+
+/*
+ * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you
+ * you need to test for the feature in boot_cpu_data.
+ */
+
+#define xchg(ptr, v) \
+ ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
+
+struct __xchg_dummy {
+ unsigned long a[100];
+};
+#define __xg(x) ((struct __xchg_dummy *)(x))
+
+/*
+ * The semantics of XCHGCMP8B are a bit strange, this is why
+ * there is a loop and the loading of %%eax and %%edx has to
+ * be inside. This inlines well in most cases, the cached
+ * cost is around ~38 cycles. (in the future we might want
+ * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
+ * might have an implicit FPU-save as a cost, so it's not
+ * clear which path to go.)
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow
+ * the instruction to be executed atomically, see page 3-102
+ * of the instruction set reference 24319102.pdf. We need
+ * the reader side to see the coherent 64bit value.
+ */
+static inline void __set_64bit(unsigned long long *ptr,
+ unsigned int low, unsigned int high)
+{
+ asm volatile("\n1:\t"
+ "movl (%0), %%eax\n\t"
+ "movl 4(%0), %%edx\n\t"
+ LOCK_PREFIX "cmpxchg8b (%0)\n\t"
+ "jnz 1b"
+ : /* no outputs */
+ : "D"(ptr),
+ "b"(low),
+ "c"(high)
+ : "ax", "dx", "memory");
+}
+
+static inline void __set_64bit_constant(unsigned long long *ptr,
+ unsigned long long value)
+{
+ __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
+}
+
+#define ll_low(x) *(((unsigned int *)&(x)) + 0)
+#define ll_high(x) *(((unsigned int *)&(x)) + 1)
+
+static inline void __set_64bit_var(unsigned long long *ptr,
+ unsigned long long value)
+{
+ __set_64bit(ptr, ll_low(value), ll_high(value));
+}
+
+#define set_64bit(ptr, value) \
+ (__builtin_constant_p((value)) \
+ ? __set_64bit_constant((ptr), (value)) \
+ : __set_64bit_var((ptr), (value)))
+
+#define _set_64bit(ptr, value) \
+ (__builtin_constant_p(value) \
+ ? __set_64bit(ptr, (unsigned int)(value), \
+ (unsigned int)((value) >> 32)) \
+ : __set_64bit(ptr, ll_low((value)), ll_high((value))))
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ asm volatile("xchgb %b0,%1"
+ : "=q" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 2:
+ asm volatile("xchgw %w0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 4:
+ asm volatile("xchgl %0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#ifdef CONFIG_X86_CMPXCHG
+#define __HAVE_ARCH_CMPXCHG 1
+#define cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define sync_cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define cmpxchg_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#define cmpxchg64_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#endif
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("lock; cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long __cmpxchg_local(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+ unsigned long long old,
+ unsigned long long new)
+{
+ unsigned long long prev;
+ asm volatile("cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
+#ifndef CONFIG_X86_CMPXCHG
+/*
+ * Building a kernel capable running on 80386. It may be necessary to
+ * simulate the cmpxchg on the 80386 CPU. For that purpose we define
+ * a function for each of the sizes we support.
+ */
+
+extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
+extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
+extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
+
+static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ switch (size) {
+ case 1:
+ return cmpxchg_386_u8(ptr, old, new);
+ case 2:
+ return cmpxchg_386_u16(ptr, old, new);
+ case 4:
+ return cmpxchg_386_u32(ptr, old, new);
+ }
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 3)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ __ret; \
+})
+#define cmpxchg_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 3)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
+ (unsigned long)(o), (unsigned long)(n), \
+ sizeof(*(ptr))); \
+ __ret; \
+})
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
+
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
+
+#define cmpxchg64(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
+ (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
+
+#endif
+
+#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
new file mode 100644
index 000000000000..52de72e0de8c
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -0,0 +1,185 @@
+#ifndef _ASM_X86_CMPXCHG_64_H
+#define _ASM_X86_CMPXCHG_64_H
+
+#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+
+#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
+ (ptr), sizeof(*(ptr))))
+
+#define __xg(x) ((volatile long *)(x))
+
+static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
+{
+ *ptr = val;
+}
+
+#define _set_64bit set_64bit
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
+ * Note 2: xchg has side effect, so that attribute volatile is necessary,
+ * but generally the primitive is invalid, *ptr is output argument. --ANK
+ */
+static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
+ int size)
+{
+ switch (size) {
+ case 1:
+ asm volatile("xchgb %b0,%1"
+ : "=q" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 2:
+ asm volatile("xchgw %w0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 4:
+ asm volatile("xchgl %k0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ case 8:
+ asm volatile("xchgq %0,%1"
+ : "=r" (x)
+ : "m" (*__xg(ptr)), "0" (x)
+ : "memory");
+ break;
+ }
+ return x;
+}
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("lock; cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("lock; cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("lock; cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+static inline unsigned long __cmpxchg_local(volatile void *ptr,
+ unsigned long old,
+ unsigned long new, int size)
+{
+ unsigned long prev;
+ switch (size) {
+ case 1:
+ asm volatile("cmpxchgb %b1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 2:
+ asm volatile("cmpxchgw %w1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 4:
+ asm volatile("cmpxchgl %k1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ case 8:
+ asm volatile("cmpxchgq %1,%2"
+ : "=a"(prev)
+ : "r"(new), "m"(*__xg(ptr)), "0"(old)
+ : "memory");
+ return prev;
+ }
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), sizeof(*(ptr))))
+#define cmpxchg64(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ cmpxchg((ptr), (o), (n)); \
+})
+#define cmpxchg_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define sync_cmpxchg(ptr, o, n) \
+ ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))))
+#define cmpxchg64_local(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ cmpxchg_local((ptr), (o), (n)); \
+})
+
+#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
new file mode 100644
index 000000000000..9a9c7bdc923d
--- /dev/null
+++ b/arch/x86/include/asm/compat.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_COMPAT_H
+#define _ASM_X86_COMPAT_H
+
+/*
+ * Architecture specific compatibility types
+ */
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <asm/user32.h>
+
+#define COMPAT_USER_HZ 100
+
+typedef u32 compat_size_t;
+typedef s32 compat_ssize_t;
+typedef s32 compat_time_t;
+typedef s32 compat_clock_t;
+typedef s32 compat_pid_t;
+typedef u16 __compat_uid_t;
+typedef u16 __compat_gid_t;
+typedef u32 __compat_uid32_t;
+typedef u32 __compat_gid32_t;
+typedef u16 compat_mode_t;
+typedef u32 compat_ino_t;
+typedef u16 compat_dev_t;
+typedef s32 compat_off_t;
+typedef s64 compat_loff_t;
+typedef u16 compat_nlink_t;
+typedef u16 compat_ipc_pid_t;
+typedef s32 compat_daddr_t;
+typedef u32 compat_caddr_t;
+typedef __kernel_fsid_t compat_fsid_t;
+typedef s32 compat_timer_t;
+typedef s32 compat_key_t;
+
+typedef s32 compat_int_t;
+typedef s32 compat_long_t;
+typedef s64 __attribute__((aligned(4))) compat_s64;
+typedef u32 compat_uint_t;
+typedef u32 compat_ulong_t;
+typedef u64 __attribute__((aligned(4))) compat_u64;
+
+struct compat_timespec {
+ compat_time_t tv_sec;
+ s32 tv_nsec;
+};
+
+struct compat_timeval {
+ compat_time_t tv_sec;
+ s32 tv_usec;
+};
+
+struct compat_stat {
+ compat_dev_t st_dev;
+ u16 __pad1;
+ compat_ino_t st_ino;
+ compat_mode_t st_mode;
+ compat_nlink_t st_nlink;
+ __compat_uid_t st_uid;
+ __compat_gid_t st_gid;
+ compat_dev_t st_rdev;
+ u16 __pad2;
+ u32 st_size;
+ u32 st_blksize;
+ u32 st_blocks;
+ u32 st_atime;
+ u32 st_atime_nsec;
+ u32 st_mtime;
+ u32 st_mtime_nsec;
+ u32 st_ctime;
+ u32 st_ctime_nsec;
+ u32 __unused4;
+ u32 __unused5;
+};
+
+struct compat_flock {
+ short l_type;
+ short l_whence;
+ compat_off_t l_start;
+ compat_off_t l_len;
+ compat_pid_t l_pid;
+};
+
+#define F_GETLK64 12 /* using 'struct flock64' */
+#define F_SETLK64 13
+#define F_SETLKW64 14
+
+/*
+ * IA32 uses 4 byte alignment for 64 bit quantities,
+ * so we need to pack this structure.
+ */
+struct compat_flock64 {
+ short l_type;
+ short l_whence;
+ compat_loff_t l_start;
+ compat_loff_t l_len;
+ compat_pid_t l_pid;
+} __attribute__((packed));
+
+struct compat_statfs {
+ int f_type;
+ int f_bsize;
+ int f_blocks;
+ int f_bfree;
+ int f_bavail;
+ int f_files;
+ int f_ffree;
+ compat_fsid_t f_fsid;
+ int f_namelen; /* SunOS ignores this field. */
+ int f_frsize;
+ int f_spare[5];
+};
+
+#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
+#define COMPAT_RLIM_INFINITY 0xffffffff
+
+typedef u32 compat_old_sigset_t; /* at least 32 bits */
+
+#define _COMPAT_NSIG 64
+#define _COMPAT_NSIG_BPW 32
+
+typedef u32 compat_sigset_word;
+
+#define COMPAT_OFF_T_MAX 0x7fffffff
+#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
+
+struct compat_ipc64_perm {
+ compat_key_t key;
+ __compat_uid32_t uid;
+ __compat_gid32_t gid;
+ __compat_uid32_t cuid;
+ __compat_gid32_t cgid;
+ unsigned short mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ compat_ulong_t unused1;
+ compat_ulong_t unused2;
+};
+
+struct compat_semid64_ds {
+ struct compat_ipc64_perm sem_perm;
+ compat_time_t sem_otime;
+ compat_ulong_t __unused1;
+ compat_time_t sem_ctime;
+ compat_ulong_t __unused2;
+ compat_ulong_t sem_nsems;
+ compat_ulong_t __unused3;
+ compat_ulong_t __unused4;
+};
+
+struct compat_msqid64_ds {
+ struct compat_ipc64_perm msg_perm;
+ compat_time_t msg_stime;
+ compat_ulong_t __unused1;
+ compat_time_t msg_rtime;
+ compat_ulong_t __unused2;
+ compat_time_t msg_ctime;
+ compat_ulong_t __unused3;
+ compat_ulong_t msg_cbytes;
+ compat_ulong_t msg_qnum;
+ compat_ulong_t msg_qbytes;
+ compat_pid_t msg_lspid;
+ compat_pid_t msg_lrpid;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+struct compat_shmid64_ds {
+ struct compat_ipc64_perm shm_perm;
+ compat_size_t shm_segsz;
+ compat_time_t shm_atime;
+ compat_ulong_t __unused1;
+ compat_time_t shm_dtime;
+ compat_ulong_t __unused2;
+ compat_time_t shm_ctime;
+ compat_ulong_t __unused3;
+ compat_pid_t shm_cpid;
+ compat_pid_t shm_lpid;
+ compat_ulong_t shm_nattch;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+/*
+ * The type of struct elf_prstatus.pr_reg in compatible core dumps.
+ */
+typedef struct user_regs_struct32 compat_elf_gregset_t;
+
+/*
+ * A pointer passed in from user mode. This should not
+ * be used for syscall parameters, just declare them
+ * as pointers because the syscall entry code will have
+ * appropriately converted them already.
+ */
+typedef u32 compat_uptr_t;
+
+static inline void __user *compat_ptr(compat_uptr_t uptr)
+{
+ return (void __user *)(unsigned long)uptr;
+}
+
+static inline compat_uptr_t ptr_to_compat(void __user *uptr)
+{
+ return (u32)(unsigned long)uptr;
+}
+
+static inline void __user *compat_alloc_user_space(long len)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ return (void __user *)regs->sp - len;
+}
+
+static inline int is_compat_task(void)
+{
+ return current_thread_info()->status & TS_COMPAT;
+}
+
+#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
new file mode 100644
index 000000000000..bae482df6039
--- /dev/null
+++ b/arch/x86/include/asm/cpu.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_CPU_H
+#define _ASM_X86_CPU_H
+
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/topology.h>
+#include <linux/nodemask.h>
+#include <linux/percpu.h>
+
+struct x86_cpu {
+ struct cpu cpu;
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int arch_register_cpu(int num);
+extern void arch_unregister_cpu(int);
+#endif
+
+DECLARE_PER_CPU(int, cpu_state);
+#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
new file mode 100644
index 000000000000..ea408dcba513
--- /dev/null
+++ b/arch/x86/include/asm/cpufeature.h
@@ -0,0 +1,274 @@
+/*
+ * Defines x86 CPU feature bits
+ */
+#ifndef _ASM_X86_CPUFEATURE_H
+#define _ASM_X86_CPUFEATURE_H
+
+#include <asm/required-features.h>
+
+#define NCAPINTS 9 /* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
+ /* (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */
+#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM (0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
+#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
+#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID (4*32+10) /* Context ID */
+#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_AES (4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
+#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc
+ */
+#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
+
+/* Virtualization flags: Linux defined */
+#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+#include <linux/bitops.h>
+
+extern const char * const x86_cap_flags[NCAPINTS*32];
+extern const char * const x86_power_flags[32];
+
+#define test_cpu_cap(c, bit) \
+ test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && \
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
+ ? 1 : \
+ test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
+
+#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
+#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
+#define setup_clear_cpu_cap(bit) do { \
+ clear_cpu_cap(&boot_cpu_data, bit); \
+ set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+} while (0)
+#define setup_force_cpu_cap(bit) do { \
+ set_cpu_cap(&boot_cpu_data, bit); \
+ clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
+} while (0)
+
+#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
+#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
+#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
+#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
+#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
+#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
+#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
+#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
+#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
+#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
+#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
+#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
+#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
+#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
+#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
+#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
+#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
+#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
+#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
+#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
+#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
+#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
+#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
+#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
+#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
+#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
+#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
+#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
+#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
+#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
+#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
+#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
+#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
+#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
+#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
+#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
+
+#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
+# define cpu_has_invlpg 1
+#else
+# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
+#endif
+
+#ifdef CONFIG_X86_64
+
+#undef cpu_has_vme
+#define cpu_has_vme 0
+
+#undef cpu_has_pae
+#define cpu_has_pae ___BUG___
+
+#undef cpu_has_mp
+#define cpu_has_mp 1
+
+#undef cpu_has_k6_mtrr
+#define cpu_has_k6_mtrr 0
+
+#undef cpu_has_cyrix_arr
+#define cpu_has_cyrix_arr 0
+
+#undef cpu_has_centaur_mcr
+#define cpu_has_centaur_mcr 0
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+
+#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
new file mode 100644
index 000000000000..6d68ad7e0ea3
--- /dev/null
+++ b/arch/x86/include/asm/cputime.h
@@ -0,0 +1 @@
+#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
new file mode 100644
index 000000000000..0930b4f8d672
--- /dev/null
+++ b/arch/x86/include/asm/current.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_CURRENT_H
+#define _ASM_X86_CURRENT_H
+
+#ifdef CONFIG_X86_32
+#include <linux/compiler.h>
+#include <asm/percpu.h>
+
+struct task_struct;
+
+DECLARE_PER_CPU(struct task_struct *, current_task);
+static __always_inline struct task_struct *get_current(void)
+{
+ return x86_read_percpu(current_task);
+}
+
+#else /* X86_32 */
+
+#ifndef __ASSEMBLY__
+#include <asm/pda.h>
+
+struct task_struct;
+
+static __always_inline struct task_struct *get_current(void)
+{
+ return read_pda(pcurrent);
+}
+
+#else /* __ASSEMBLY__ */
+
+#include <asm/asm-offsets.h>
+#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* X86_32 */
+
+#define current get_current()
+
+#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
new file mode 100644
index 000000000000..3ea6f37be9e2
--- /dev/null
+++ b/arch/x86/include/asm/debugreg.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_DEBUGREG_H
+#define _ASM_X86_DEBUGREG_H
+
+
+/* Indicate the register numbers for a number of the specific
+ debug registers. Registers 0-3 contain the addresses we wish to trap on */
+#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
+#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
+
+#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
+#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
+
+/* Define a few things for the status register. We can use this to determine
+ which debugging register was responsible for the trap. The other bits
+ are either reserved or not of interest to us. */
+
+#define DR_TRAP0 (0x1) /* db0 */
+#define DR_TRAP1 (0x2) /* db1 */
+#define DR_TRAP2 (0x4) /* db2 */
+#define DR_TRAP3 (0x8) /* db3 */
+
+#define DR_STEP (0x4000) /* single-step */
+#define DR_SWITCH (0x8000) /* task switch */
+
+/* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+ and indicates what types of access we trap on, and how large the data
+ field is that we are looking at */
+
+#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
+#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
+
+#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
+#define DR_RW_WRITE (0x1)
+#define DR_RW_READ (0x3)
+
+#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
+#define DR_LEN_2 (0x4)
+#define DR_LEN_4 (0xC)
+#define DR_LEN_8 (0x8)
+
+/* The low byte to the control register determine which registers are
+ enabled. There are 4 fields of two bits. One bit is "local", meaning
+ that the processor will reset the bit after a task switch and the other
+ is global meaning that we have to explicitly reset the bit. With linux,
+ you can use either one, since we explicitly zero the register when we enter
+ kernel mode. */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
+
+#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
+#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
+
+/* The second byte to the control register has a few special things.
+ We can slow the instruction pipeline for instructions coming via the
+ gdt or the ldt if we want to. I am not sure why this is an advantage */
+
+#ifdef __i386__
+#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
+#else
+#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
+#endif
+
+#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
+#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+
+#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
new file mode 100644
index 000000000000..409a649204aa
--- /dev/null
+++ b/arch/x86/include/asm/delay.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_DELAY_H
+#define _ASM_X86_DELAY_H
+
+/*
+ * Copyright (C) 1993 Linus Torvalds
+ *
+ * Delay routines calling functions in arch/x86/lib/delay.c
+ */
+
+/* Undefined functions to get compile-time errors */
+extern void __bad_udelay(void);
+extern void __bad_ndelay(void);
+
+extern void __udelay(unsigned long usecs);
+extern void __ndelay(unsigned long nsecs);
+extern void __const_udelay(unsigned long xloops);
+extern void __delay(unsigned long loops);
+
+/* 0x10c7 is 2**32 / 1000000 (rounded up) */
+#define udelay(n) (__builtin_constant_p(n) ? \
+ ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
+ __udelay(n))
+
+/* 0x5 is 2**32 / 1000000000 (rounded up) */
+#define ndelay(n) (__builtin_constant_p(n) ? \
+ ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
+ __ndelay(n))
+
+void use_tsc_delay(void);
+
+#endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
new file mode 100644
index 000000000000..e6b82b17b072
--- /dev/null
+++ b/arch/x86/include/asm/desc.h
@@ -0,0 +1,409 @@
+#ifndef _ASM_X86_DESC_H
+#define _ASM_X86_DESC_H
+
+#ifndef __ASSEMBLY__
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc,
+ const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+ desc->base0 = info->base_addr & 0x0000ffff;
+
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
+ /*
+ * Don't allow setting of the lm bit. It is useless anyway
+ * because 64bit system calls require __USER_CS:
+ */
+ desc->l = 0;
+}
+
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+ unsigned long base, unsigned dpl, unsigned flags,
+ unsigned short seg)
+{
+ gate->a = (seg << 16) | (base & 0xffff);
+ gate->b = (base & 0xffff0000) |
+ (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+ const u32 *desc = ptr;
+ return !(desc[0] | desc[1]);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) \
+ native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+ native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) \
+ native_write_idt_entry(dt, entry, g)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+#endif /* CONFIG_PARAVIRT */
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry,
+ const gate_desc *gate)
+{
+ memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
+ const void *desc)
+{
+ memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
+ const void *desc, int type)
+{
+ unsigned int size;
+ switch (type) {
+ case DESC_TSS:
+ size = sizeof(tss_desc);
+ break;
+ case DESC_LDT:
+ size = sizeof(ldt_desc);
+ break;
+ default:
+ size = sizeof(struct desc_struct);
+ break;
+ }
+ memcpy(&gdt[entry], desc, size);
+}
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+ unsigned long limit, unsigned char type,
+ unsigned char flags)
+{
+ desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ (limit & 0x000f0000) | ((type & 0xff) << 8) |
+ ((flags & 0xf) << 20);
+ desc->p = 1;
+}
+
+
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+ struct ldttss_desc64 *desc = d;
+ memset(desc, 0, sizeof(*desc));
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
+#else
+ pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+ struct desc_struct *d = get_cpu_gdt_table(cpu);
+ tss_desc tss;
+
+ /*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap. See tss_struct definition in processor.h
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+ IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+ sizeof(unsigned long) - 1);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+ if (likely(entries == 0))
+ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
+
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
+}
+
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm volatile("str %0":"=r" (tr));
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ unsigned int i;
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+
+#define _LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+ set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+ set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+ preempt_disable();
+ load_LDT_nolock(pc);
+ preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->limit0 | (desc->limit << 16);
+}
+
+static inline void _set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+#define SYS_VECTOR_FREE 0
+#define SYS_VECTOR_ALLOCED 1
+
+extern int first_system_vector;
+extern char system_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+ if (system_vectors[vector] == SYS_VECTOR_FREE) {
+ system_vectors[vector] = SYS_VECTOR_ALLOCED;
+ if (first_system_vector > vector)
+ first_system_vector = vector;
+ } else
+ BUG();
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+ alloc_system_vector(n);
+ set_intr_gate(n, addr);
+}
+
+/*
+ * This routine sets up an interrupt gate at directory privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_system_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+ BUG_ON((unsigned)n > 0xFF);
+ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+
+#else
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ * idx - descriptor index
+ * gdt - GDT pointer
+ * base - 32bit register to which the base will be written
+ * lo_w - lo word of the "base" register
+ * lo_b - lo byte of the "base" register
+ * hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+ movb idx * 8 + 4(gdt), lo_b; \
+ movb idx * 8 + 7(gdt), hi_b; \
+ shll $16, base; \
+ movw idx * 8 + 2(gdt), lo_w;
+
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
new file mode 100644
index 000000000000..a6adefa28b94
--- /dev/null
+++ b/arch/x86/include/asm/desc_defs.h
@@ -0,0 +1,95 @@
+/* Written 2000 by Andi Kleen */
+#ifndef _ASM_X86_DESC_DEFS_H
+#define _ASM_X86_DESC_DEFS_H
+
+/*
+ * Segment descriptor structure definitions, usable from both x86_64 and i386
+ * archs.
+ */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * FIXME: Acessing the desc_struct through its fields is more elegant,
+ * and should be the one valid thing to do. However, a lot of open code
+ * still touches the a and b acessors, and doing this allow us to do it
+ * incrementally. We keep the signature as a struct, rather than an union,
+ * so we can get rid of it transparently in the future -- glommer
+ */
+/* 8 byte segment descriptor */
+struct desc_struct {
+ union {
+ struct {
+ unsigned int a;
+ unsigned int b;
+ };
+ struct {
+ u16 limit0;
+ u16 base0;
+ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+ };
+ };
+} __attribute__((packed));
+
+enum {
+ GATE_INTERRUPT = 0xE,
+ GATE_TRAP = 0xF,
+ GATE_CALL = 0xC,
+ GATE_TASK = 0x5,
+};
+
+/* 16byte gate */
+struct gate_struct64 {
+ u16 offset_low;
+ u16 segment;
+ unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
+ u16 offset_middle;
+ u32 offset_high;
+ u32 zero1;
+} __attribute__((packed));
+
+#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
+#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
+#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
+
+enum {
+ DESC_TSS = 0x9,
+ DESC_LDT = 0x2,
+ DESCTYPE_S = 0x10, /* !system */
+};
+
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct ldttss_desc64 {
+ u16 limit0;
+ u16 base0;
+ unsigned base1 : 8, type : 5, dpl : 2, p : 1;
+ unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+#ifdef CONFIG_X86_64
+typedef struct gate_struct64 gate_desc;
+typedef struct ldttss_desc64 ldt_desc;
+typedef struct ldttss_desc64 tss_desc;
+#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
+#define gate_segment(g) ((g).segment)
+#else
+typedef struct desc_struct gate_desc;
+typedef struct desc_struct ldt_desc;
+typedef struct desc_struct tss_desc;
+#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
+#define gate_segment(g) ((g).a >> 16)
+#endif
+
+struct desc_ptr {
+ unsigned short size;
+ unsigned long address;
+} __attribute__((packed)) ;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
new file mode 100644
index 000000000000..3c034f48fdb0
--- /dev/null
+++ b/arch/x86/include/asm/device.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_DEVICE_H
+#define _ASM_X86_DEVICE_H
+
+struct dev_archdata {
+#ifdef CONFIG_ACPI
+ void *acpi_handle;
+#endif
+#ifdef CONFIG_X86_64
+struct dma_mapping_ops *dma_ops;
+#endif
+#ifdef CONFIG_DMAR
+ void *iommu; /* hook for IOMMU specific extension */
+#endif
+};
+
+#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
new file mode 100644
index 000000000000..9a2d644c08ef
--- /dev/null
+++ b/arch/x86/include/asm/div64.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_DIV64_H
+#define _ASM_X86_DIV64_H
+
+#ifdef CONFIG_X86_32
+
+#include <linux/types.h>
+
+/*
+ * do_div() is NOT a C function. It wants to return
+ * two values (the quotient and the remainder), but
+ * since that doesn't work very well in C, what it
+ * does is:
+ *
+ * - modifies the 64-bit dividend _in_place_
+ * - returns the 32-bit remainder
+ *
+ * This ends up being the most efficient "calling
+ * convention" on x86.
+ */
+#define do_div(n, base) \
+({ \
+ unsigned long __upper, __low, __high, __mod, __base; \
+ __base = (base); \
+ asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2":"=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("":"=A" (n) : "a" (__low), "d" (__high)); \
+ __mod; \
+})
+
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ union {
+ u64 v64;
+ u32 v32[2];
+ } d = { dividend };
+ u32 upper;
+
+ upper = d.v32[1];
+ d.v32[1] = 0;
+ if (upper >= divisor) {
+ d.v32[1] = upper / divisor;
+ upper %= divisor;
+ }
+ asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+ "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+ return d.v64;
+}
+#define div_u64_rem div_u64_rem
+
+#else
+# include <asm-generic/div64.h>
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_DIV64_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
new file mode 100644
index 000000000000..4035357f5b9d
--- /dev/null
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -0,0 +1,306 @@
+#ifndef _ASM_X86_DMA_MAPPING_H
+#define _ASM_X86_DMA_MAPPING_H
+
+/*
+ * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
+ * documentation.
+ */
+
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <asm/swiotlb.h>
+#include <asm-generic/dma-coherent.h>
+
+extern dma_addr_t bad_dma_address;
+extern int iommu_merge;
+extern struct device x86_dma_fallback_dev;
+extern int panic_on_overflow;
+
+struct dma_mapping_ops {
+ int (*mapping_error)(struct device *dev,
+ dma_addr_t dma_addr);
+ void* (*alloc_coherent)(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp);
+ void (*free_coherent)(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+ dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr,
+ size_t size, int direction);
+ void (*unmap_single)(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+ void (*sync_single_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+ void (*sync_single_range_for_cpu)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_single_range_for_device)(struct device *hwdev,
+ dma_addr_t dma_handle, unsigned long offset,
+ size_t size, int direction);
+ void (*sync_sg_for_cpu)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ void (*sync_sg_for_device)(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int direction);
+ int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+ void (*unmap_sg)(struct device *hwdev,
+ struct scatterlist *sg, int nents,
+ int direction);
+ int (*dma_supported)(struct device *hwdev, u64 mask);
+ int is_phys;
+};
+
+extern struct dma_mapping_ops *dma_ops;
+
+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+#ifdef CONFIG_X86_32
+ return dma_ops;
+#else
+ if (unlikely(!dev) || !dev->archdata.dma_ops)
+ return dma_ops;
+ else
+ return dev->archdata.dma_ops;
+#endif
+}
+
+/* Make sure we keep the same behaviour */
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+ if (ops->mapping_error)
+ return ops->mapping_error(dev, dma_addr);
+
+ return (dma_addr == bad_dma_address);
+}
+
+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+#define dma_is_consistent(d, h) (1)
+
+extern int dma_supported(struct device *hwdev, u64 mask);
+extern int dma_set_mask(struct device *dev, u64 mask);
+
+extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag);
+
+static inline dma_addr_t
+dma_map_single(struct device *hwdev, void *ptr, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
+}
+
+static inline void
+dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->unmap_single)
+ ops->unmap_single(dev, addr, size, direction);
+}
+
+static inline int
+dma_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_sg(hwdev, sg, nents, direction);
+}
+
+static inline void
+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->unmap_sg)
+ ops->unmap_sg(hwdev, sg, nents, direction);
+}
+
+static inline void
+dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_for_device)
+ ops->sync_single_for_device(hwdev, dma_handle, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_range_for_cpu)
+ ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
+ size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_single_range_for_device)
+ ops->sync_single_range_for_device(hwdev, dma_handle,
+ offset, size, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+ flush_write_buffers();
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+
+ flush_write_buffers();
+}
+
+static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(direction));
+ return ops->map_single(dev, page_to_phys(page) + offset,
+ size, direction);
+}
+
+static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+ dma_unmap_single(dev, addr, size, direction);
+}
+
+static inline void
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+ enum dma_data_direction dir)
+{
+ flush_write_buffers();
+}
+
+static inline int dma_get_cache_alignment(void)
+{
+ /* no easy way to get cache size on all x86, so return the
+ * maximum possible, to be safe */
+ return boot_cpu_data.x86_clflush_size;
+}
+
+static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
+ gfp_t gfp)
+{
+ unsigned long dma_mask = 0;
+
+ dma_mask = dev->coherent_dma_mask;
+ if (!dma_mask)
+ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
+
+ return dma_mask;
+}
+
+static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
+{
+ unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp);
+
+ if (dma_mask <= DMA_24BIT_MASK)
+ gfp |= GFP_DMA;
+#ifdef CONFIG_X86_64
+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA))
+ gfp |= GFP_DMA32;
+#endif
+ return gfp;
+}
+
+static inline void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+ void *memory;
+
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+ return memory;
+
+ if (!dev) {
+ dev = &x86_dma_fallback_dev;
+ gfp |= GFP_DMA;
+ }
+
+ if (!is_device_dma_capable(dev))
+ return NULL;
+
+ if (!ops->alloc_coherent)
+ return NULL;
+
+ return ops->alloc_coherent(dev, size, dma_handle,
+ dma_alloc_coherent_gfp_flags(dev, gfp));
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t bus)
+{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+ WARN_ON(irqs_disabled()); /* for portability */
+
+ if (dma_release_from_coherent(dev, get_order(size), vaddr))
+ return;
+
+ if (ops->free_coherent)
+ ops->free_coherent(dev, size, vaddr, bus);
+}
+
+#endif
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
new file mode 100644
index 000000000000..ca1098a7e580
--- /dev/null
+++ b/arch/x86/include/asm/dma.h
@@ -0,0 +1,318 @@
+/*
+ * linux/include/asm/dma.h: Defines for using and allocating dma channels.
+ * Written by Hennus Bergman, 1992.
+ * High DMA channel support & info by Hannu Savolainen
+ * and John Boyd, Nov. 1992.
+ */
+
+#ifndef _ASM_X86_DMA_H
+#define _ASM_X86_DMA_H
+
+#include <linux/spinlock.h> /* And spinlocks */
+#include <asm/io.h> /* need byte IO */
+#include <linux/delay.h>
+
+#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
+#define dma_outb outb_p
+#else
+#define dma_outb outb
+#endif
+
+#define dma_inb inb
+
+/*
+ * NOTES about DMA transfers:
+ *
+ * controller 1: channels 0-3, byte operations, ports 00-1F
+ * controller 2: channels 4-7, word operations, ports C0-DF
+ *
+ * - ALL registers are 8 bits only, regardless of transfer size
+ * - channel 4 is not used - cascades 1 into 2.
+ * - channels 0-3 are byte - addresses/counts are for physical bytes
+ * - channels 5-7 are word - addresses/counts are for physical words
+ * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
+ * - transfer count loaded to registers is 1 less than actual count
+ * - controller 2 offsets are all even (2x offsets for controller 1)
+ * - page registers for 5-7 don't use data bit 0, represent 128K pages
+ * - page registers for 0-3 use bit 0, represent 64K pages
+ *
+ * DMA transfers are limited to the lower 16MB of _physical_ memory.
+ * Note that addresses loaded into registers must be _physical_ addresses,
+ * not logical addresses (which may differ if paging is active).
+ *
+ * Address mapping for channels 0-3:
+ *
+ * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * P7 ... P0 A7 ... A0 A7 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Address mapping for channels 5-7:
+ *
+ * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
+ * | ... | \ \ ... \ \ \ ... \ \
+ * | ... | \ \ ... \ \ \ ... \ (not used)
+ * | ... | \ \ ... \ \ \ ... \
+ * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
+ * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
+ * the hardware level, so odd-byte transfers aren't possible).
+ *
+ * Transfer count (_not # bytes_) is limited to 64K, represented as actual
+ * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
+ * and up to 128K bytes may be transferred on channels 5-7 in one operation.
+ *
+ */
+
+#define MAX_DMA_CHANNELS 8
+
+#ifdef CONFIG_X86_32
+
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+
+#else
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
+
+#endif
+
+/* 8237 DMA controllers */
+#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
+#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
+
+/* DMA controller registers */
+#define DMA1_CMD_REG 0x08 /* command register (w) */
+#define DMA1_STAT_REG 0x08 /* status register (r) */
+#define DMA1_REQ_REG 0x09 /* request register (w) */
+#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
+#define DMA1_MODE_REG 0x0B /* mode register (w) */
+#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
+#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
+#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
+#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
+#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
+
+#define DMA2_CMD_REG 0xD0 /* command register (w) */
+#define DMA2_STAT_REG 0xD0 /* status register (r) */
+#define DMA2_REQ_REG 0xD2 /* request register (w) */
+#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
+#define DMA2_MODE_REG 0xD6 /* mode register (w) */
+#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
+#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
+#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
+#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
+#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
+
+#define DMA_ADDR_0 0x00 /* DMA address registers */
+#define DMA_ADDR_1 0x02
+#define DMA_ADDR_2 0x04
+#define DMA_ADDR_3 0x06
+#define DMA_ADDR_4 0xC0
+#define DMA_ADDR_5 0xC4
+#define DMA_ADDR_6 0xC8
+#define DMA_ADDR_7 0xCC
+
+#define DMA_CNT_0 0x01 /* DMA count registers */
+#define DMA_CNT_1 0x03
+#define DMA_CNT_2 0x05
+#define DMA_CNT_3 0x07
+#define DMA_CNT_4 0xC2
+#define DMA_CNT_5 0xC6
+#define DMA_CNT_6 0xCA
+#define DMA_CNT_7 0xCE
+
+#define DMA_PAGE_0 0x87 /* DMA page registers */
+#define DMA_PAGE_1 0x83
+#define DMA_PAGE_2 0x81
+#define DMA_PAGE_3 0x82
+#define DMA_PAGE_5 0x8B
+#define DMA_PAGE_6 0x89
+#define DMA_PAGE_7 0x8A
+
+/* I/O to memory, no autoinit, increment, single mode */
+#define DMA_MODE_READ 0x44
+/* memory to I/O, no autoinit, increment, single mode */
+#define DMA_MODE_WRITE 0x48
+/* pass thru DREQ->HRQ, DACK<-HLDA only */
+#define DMA_MODE_CASCADE 0xC0
+
+#define DMA_AUTOINIT 0x10
+
+
+extern spinlock_t dma_spin_lock;
+
+static inline unsigned long claim_dma_lock(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&dma_spin_lock, flags);
+ return flags;
+}
+
+static inline void release_dma_lock(unsigned long flags)
+{
+ spin_unlock_irqrestore(&dma_spin_lock, flags);
+}
+
+/* enable/disable a specific DMA channel */
+static inline void enable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr, DMA1_MASK_REG);
+ else
+ dma_outb(dmanr & 3, DMA2_MASK_REG);
+}
+
+static inline void disable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr | 4, DMA1_MASK_REG);
+ else
+ dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
+}
+
+/* Clear the 'DMA Pointer Flip Flop'.
+ * Write 0 for LSB/MSB, 1 for MSB/LSB access.
+ * Use this once to initialize the FF to a known state.
+ * After that, keep track of it. :-)
+ * --- In order to do that, the DMA routines below should ---
+ * --- only be used while holding the DMA lock ! ---
+ */
+static inline void clear_dma_ff(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(0, DMA1_CLEAR_FF_REG);
+ else
+ dma_outb(0, DMA2_CLEAR_FF_REG);
+}
+
+/* set mode (above) for a specific DMA channel */
+static inline void set_dma_mode(unsigned int dmanr, char mode)
+{
+ if (dmanr <= 3)
+ dma_outb(mode | dmanr, DMA1_MODE_REG);
+ else
+ dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
+}
+
+/* Set only the page register bits of the transfer address.
+ * This is used for successive transfers when we know the contents of
+ * the lower 16 bits of the DMA current address register, but a 64k boundary
+ * may have been crossed.
+ */
+static inline void set_dma_page(unsigned int dmanr, char pagenr)
+{
+ switch (dmanr) {
+ case 0:
+ dma_outb(pagenr, DMA_PAGE_0);
+ break;
+ case 1:
+ dma_outb(pagenr, DMA_PAGE_1);
+ break;
+ case 2:
+ dma_outb(pagenr, DMA_PAGE_2);
+ break;
+ case 3:
+ dma_outb(pagenr, DMA_PAGE_3);
+ break;
+ case 5:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_5);
+ break;
+ case 6:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_6);
+ break;
+ case 7:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_7);
+ break;
+ }
+}
+
+
+/* Set transfer address & page bits for specific DMA channel.
+ * Assumes dma flipflop is clear.
+ */
+static inline void set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+ set_dma_page(dmanr, a>>16);
+ if (dmanr <= 3) {
+ dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ } else {
+ dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ }
+}
+
+
+/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
+ * a specific DMA channel.
+ * You must ensure the parameters are valid.
+ * NOTE: from a manual: "the number of transfers is one more
+ * than the initial word count"! This is taken into account.
+ * Assumes dma flip-flop is clear.
+ * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
+ */
+static inline void set_dma_count(unsigned int dmanr, unsigned int count)
+{
+ count--;
+ if (dmanr <= 3) {
+ dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ dma_outb((count >> 8) & 0xff,
+ ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ } else {
+ dma_outb((count >> 1) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ dma_outb((count >> 9) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ }
+}
+
+
+/* Get DMA residue count. After a DMA transfer, this
+ * should return zero. Reading this while a DMA transfer is
+ * still in progress will return unpredictable results.
+ * If called before the channel has been used, it may return 1.
+ * Otherwise, it returns the number of _bytes_ left to transfer.
+ *
+ * Assumes DMA flip-flop is clear.
+ */
+static inline int get_dma_residue(unsigned int dmanr)
+{
+ unsigned int io_port;
+ /* using short to get 16-bit wrap around */
+ unsigned short count;
+
+ io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
+ : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
+
+ count = 1 + dma_inb(io_port);
+ count += dma_inb(io_port) << 8;
+
+ return (dmanr <= 3) ? count : (count << 1);
+}
+
+
+/* These are in kernel/dma.c: */
+extern int request_dma(unsigned int dmanr, const char *device_id);
+extern void free_dma(unsigned int dmanr);
+
+/* From PCI */
+
+#ifdef CONFIG_PCI
+extern int isa_dma_bridge_buggy;
+#else
+#define isa_dma_bridge_buggy (0)
+#endif
+
+#endif /* _ASM_X86_DMA_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
new file mode 100644
index 000000000000..bc68212c6bc0
--- /dev/null
+++ b/arch/x86/include/asm/dmi.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_DMI_H
+#define _ASM_X86_DMI_H
+
+#include <asm/io.h>
+
+#define DMI_MAX_DATA 2048
+
+extern int dmi_alloc_index;
+extern char dmi_alloc_data[DMI_MAX_DATA];
+
+/* This is so early that there is no good way to allocate dynamic memory.
+ Allocate data in an BSS array. */
+static inline void *dmi_alloc(unsigned len)
+{
+ int idx = dmi_alloc_index;
+ if ((dmi_alloc_index + len) > DMI_MAX_DATA)
+ return NULL;
+ dmi_alloc_index += len;
+ return dmi_alloc_data + idx;
+}
+
+/* Use early IO mappings for DMI because it's initialized early */
+#define dmi_ioremap early_ioremap
+#define dmi_iounmap early_iounmap
+
+#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
new file mode 100644
index 000000000000..a8f672ba100c
--- /dev/null
+++ b/arch/x86/include/asm/ds.h
@@ -0,0 +1,272 @@
+/*
+ * Debug Store (DS) support
+ *
+ * This provides a low-level interface to the hardware's Debug Store
+ * feature that is used for branch trace store (BTS) and
+ * precise-event based sampling (PEBS).
+ *
+ * It manages:
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
+ * - buffer access
+ *
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ */
+
+#ifndef _ASM_X86_DS_H
+#define _ASM_X86_DS_H
+
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+
+#ifdef CONFIG_X86_DS
+
+struct task_struct;
+struct ds_context;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
+
+
+/*
+ * A list of features plus corresponding macros to talk about them in
+ * the ds_request function's flags parameter.
+ *
+ * We use the enum to index an array of corresponding control bits;
+ * we use the macro to index a flags bit-vector.
+ */
+enum ds_feature {
+ dsf_bts = 0,
+ dsf_bts_kernel,
+#define BTS_KERNEL (1 << dsf_bts_kernel)
+ /* trace kernel-mode branches */
+
+ dsf_bts_user,
+#define BTS_USER (1 << dsf_bts_user)
+ /* trace user-mode branches */
+
+ dsf_bts_overflow,
+ dsf_bts_max,
+ dsf_pebs = dsf_bts_max,
+
+ dsf_pebs_max,
+ dsf_ctl_max = dsf_pebs_max,
+ dsf_bts_timestamps = dsf_ctl_max,
+#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
+ /* add timestamps into BTS trace */
+
+#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
+};
+
+
+/*
+ * Request BTS or PEBS
+ *
+ * Due to alignement constraints, the actual buffer may be slightly
+ * smaller than the requested or provided buffer.
+ *
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
+ *
+ * task: the task to request recording for;
+ * NULL for per-cpu recording on the current cpu
+ * base: the base pointer for the (non-pageable) buffer;
+ * size: the size of the provided buffer in bytes
+ * ovfl: pointer to a function to be called on buffer overflow;
+ * NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ * -1 if no interrupt threshold is requested.
+ * flags: a bit-mask of the above flags
+ */
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern void ds_release_bts(struct bts_tracer *tracer);
+extern void ds_suspend_bts(struct bts_tracer *tracer);
+extern void ds_resume_bts(struct bts_tracer *tracer);
+extern void ds_release_pebs(struct pebs_tracer *tracer);
+extern void ds_suspend_pebs(struct pebs_tracer *tracer);
+extern void ds_resume_pebs(struct pebs_tracer *tracer);
+
+
+/*
+ * The raw DS buffer state as it is used for BTS and PEBS recording.
+ *
+ * This is the low-level, arch-dependent interface for working
+ * directly on the raw trace data.
+ */
+struct ds_trace {
+ /* the number of bts/pebs records */
+ size_t n;
+ /* the size of a bts/pebs record in bytes */
+ size_t size;
+ /* pointers into the raw buffer:
+ - to the first entry */
+ void *begin;
+ /* - one beyond the last entry */
+ void *end;
+ /* - one beyond the newest entry */
+ void *top;
+ /* - the interrupt threshold */
+ void *ith;
+ /* flags given on ds_request() */
+ unsigned int flags;
+};
+
+/*
+ * An arch-independent view on branch trace data.
+ */
+enum bts_qualifier {
+ bts_invalid,
+#define BTS_INVALID bts_invalid
+
+ bts_branch,
+#define BTS_BRANCH bts_branch
+
+ bts_task_arrives,
+#define BTS_TASK_ARRIVES bts_task_arrives
+
+ bts_task_departs,
+#define BTS_TASK_DEPARTS bts_task_departs
+
+ bts_qual_bit_size = 4,
+ bts_qual_max = (1 << bts_qual_bit_size),
+};
+
+struct bts_struct {
+ __u64 qualifier;
+ union {
+ /* BTS_BRANCH */
+ struct {
+ __u64 from;
+ __u64 to;
+ } lbr;
+ /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
+ struct {
+ __u64 jiffies;
+ pid_t pid;
+ } timestamp;
+ } variant;
+};
+
+
+/*
+ * The BTS state.
+ *
+ * This gives access to the raw DS state and adds functions to provide
+ * an arch-independent view of the BTS data.
+ */
+struct bts_trace {
+ struct ds_trace ds;
+
+ int (*read)(struct bts_tracer *tracer, const void *at,
+ struct bts_struct *out);
+ int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
+};
+
+
+/*
+ * The PEBS state.
+ *
+ * This gives access to the raw DS state and the PEBS-specific counter
+ * reset value.
+ */
+struct pebs_trace {
+ struct ds_trace ds;
+
+ /* the PEBS reset value */
+ unsigned long long reset_value;
+};
+
+
+/*
+ * Read the BTS or PEBS trace.
+ *
+ * Returns a view on the trace collected for the parameter tracer.
+ *
+ * The view remains valid as long as the traced task is not running or
+ * the tracer is suspended.
+ * Writes into the trace buffer are not reflected.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
+extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
+
+
+/*
+ * Reset the write pointer of the BTS/PEBS buffer.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
+
+/*
+ * Set the PEBS counter reset value.
+ *
+ * Returns 0 on success; -Eerrno on error
+ *
+ * tracer: the tracer handle returned from ds_request_pebs()
+ * value: the new counter reset value
+ */
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+
+/*
+ * Initialization
+ */
+struct cpuinfo_x86;
+extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
+
+/*
+ * Context switch work
+ */
+extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
+
+/*
+ * Task clone/init and cleanup work
+ */
+extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
+extern void ds_exit_thread(struct task_struct *tsk);
+
+#else /* CONFIG_X86_DS */
+
+struct cpuinfo_x86;
+static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
+static inline void ds_switch_to(struct task_struct *prev,
+ struct task_struct *next) {}
+static inline void ds_copy_thread(struct task_struct *tsk,
+ struct task_struct *father) {}
+static inline void ds_exit_thread(struct task_struct *tsk) {}
+
+#endif /* CONFIG_X86_DS */
+#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 000000000000..3afc5e87cfdd
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,96 @@
+#ifndef _ASM_X86_DWARF2_H
+#define _ASM_X86_DWARF2_H
+
+#ifndef __ASSEMBLY__
+#warning "asm/dwarf2.h should be only included in pure assembly files"
+#endif
+
+/*
+ * Macros for dwarf2 CFI unwind table entries.
+ * See "as.info" for details on these pseudo ops. Unfortunately
+ * they are only supported in very new binutils, so define them
+ * away for older version.
+ */
+
+#ifdef CONFIG_AS_CFI
+
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+#define CFI_UNDEFINED .cfi_undefined
+
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
+
+#else
+
+/*
+ * Due to the structure of pre-exisiting code, don't use assembler line
+ * comment character # to ignore the arguments. Instead, use a dummy macro.
+ */
+.macro cfi_ignore a=0, b=0, c=0, d=0
+.endm
+
+#define CFI_STARTPROC cfi_ignore
+#define CFI_ENDPROC cfi_ignore
+#define CFI_DEF_CFA cfi_ignore
+#define CFI_DEF_CFA_REGISTER cfi_ignore
+#define CFI_DEF_CFA_OFFSET cfi_ignore
+#define CFI_ADJUST_CFA_OFFSET cfi_ignore
+#define CFI_OFFSET cfi_ignore
+#define CFI_REL_OFFSET cfi_ignore
+#define CFI_REGISTER cfi_ignore
+#define CFI_RESTORE cfi_ignore
+#define CFI_REMEMBER_STATE cfi_ignore
+#define CFI_RESTORE_STATE cfi_ignore
+#define CFI_UNDEFINED cfi_ignore
+#define CFI_SIGNAL_FRAME cfi_ignore
+
+#endif
+
+/*
+ * An attempt to make CFI annotations more or less
+ * correct and shorter. It is implied that you know
+ * what you're doing if you use them.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_X86_64
+ .macro pushq_cfi reg
+ pushq \reg
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popq_cfi reg
+ popq \reg
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
+ .macro movq_cfi reg offset=0
+ movq %\reg, \offset(%rsp)
+ CFI_REL_OFFSET \reg, \offset
+ .endm
+
+ .macro movq_cfi_restore offset reg
+ movq \offset(%rsp), %\reg
+ CFI_RESTORE \reg
+ .endm
+#else /*!CONFIG_X86_64*/
+
+ /* 32bit defenitions are missed yet */
+
+#endif /*!CONFIG_X86_64*/
+#endif /*__ASSEMBLY__*/
+
+#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
new file mode 100644
index 000000000000..3d8ceddbd407
--- /dev/null
+++ b/arch/x86/include/asm/e820.h
@@ -0,0 +1,146 @@
+#ifndef _ASM_X86_E820_H
+#define _ASM_X86_E820_H
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 128 /* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage. If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes. Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+/*
+ * Odd: 'make headers_check' complains about numa.h if I try
+ * to collapse the next two #ifdef lines to a single line:
+ * #if defined(__KERNEL__) && defined(CONFIG_EFI)
+ */
+#ifdef __KERNEL__
+#ifdef CONFIG_EFI
+#include <linux/numa.h>
+#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
+#else /* ! CONFIG_EFI */
+#define E820_X_MAX E820MAX
+#endif
+#else /* ! __KERNEL__ */
+#define E820_X_MAX E820MAX
+#endif
+
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+
+/* reserved RAM used by kernel itself */
+#define E820_RESERVED_KERN 128
+
+#ifndef __ASSEMBLY__
+struct e820entry {
+ __u64 addr; /* start of memory segment */
+ __u64 size; /* size of memory segment */
+ __u32 type; /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+ __u32 nr_map;
+ struct e820entry map[E820_X_MAX];
+};
+
+#ifdef __KERNEL__
+/* see comment in arch/x86/kernel/e820.c */
+extern struct e820map e820;
+extern struct e820map e820_saved;
+
+extern unsigned long pci_mem_start;
+extern int e820_any_mapped(u64 start, u64 end, unsigned type);
+extern int e820_all_mapped(u64 start, u64 end, unsigned type);
+extern void e820_add_region(u64 start, u64 size, int type);
+extern void e820_print_map(char *who);
+extern int
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type);
+extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
+ int checktype);
+extern void update_e820(void);
+extern void e820_setup_gap(void);
+extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+ unsigned long start_addr, unsigned long long end_addr);
+struct setup_data;
+extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+extern void e820_mark_nosave_regions(unsigned long limit_pfn);
+#else
+static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+}
+#endif
+
+#ifdef CONFIG_MEMTEST
+extern void early_memtest(unsigned long start, unsigned long end);
+#else
+static inline void early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
+extern unsigned long end_user_pfn;
+
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
+extern void reserve_early(u64 start, u64 end, char *name);
+extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
+extern void free_early(u64 start, u64 end);
+extern void early_res_to_bootmem(u64 start, u64 end);
+extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+extern unsigned long e820_end_of_ram_pfn(void);
+extern unsigned long e820_end_of_low_ram_pfn(void);
+extern int e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn);
+extern void e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+extern u64 e820_hole_size(u64 start, u64 end);
+extern void finish_e820_parsing(void);
+extern void e820_reserve_resources(void);
+extern void e820_reserve_resources_late(void);
+extern void setup_memory_map(void);
+extern char *default_machine_specific_memory_setup(void);
+extern char *machine_specific_memory_setup(void);
+extern char *memory_setup(void);
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#ifdef __KERNEL__
+#include <linux/ioport.h>
+
+#define HIGH_MEMORY (1024*1024)
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_E820_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
new file mode 100644
index 000000000000..e9b57ecc70c5
--- /dev/null
+++ b/arch/x86/include/asm/edac.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_EDAC_H
+#define _ASM_X86_EDAC_H
+
+/* ECC atomic, DMA, SMP and interrupt safe scrub function */
+
+static inline void atomic_scrub(void *va, u32 size)
+{
+ u32 i, *virt_addr = va;
+
+ /*
+ * Very carefully read and write to memory atomically so we
+ * are interrupt, DMA and SMP safe.
+ */
+ for (i = 0; i < size / 4; i++, virt_addr++)
+ asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
+}
+
+#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
new file mode 100644
index 000000000000..a2e545c91c35
--- /dev/null
+++ b/arch/x86/include/asm/efi.h
@@ -0,0 +1,110 @@
+#ifndef _ASM_X86_EFI_H
+#define _ASM_X86_EFI_H
+
+#ifdef CONFIG_X86_32
+
+extern unsigned long asmlinkage efi_call_phys(void *, ...);
+
+#define efi_call_phys0(f) efi_call_phys(f)
+#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
+#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
+#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
+#define efi_call_phys4(f, a1, a2, a3, a4) \
+ efi_call_phys(f, a1, a2, a3, a4)
+#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
+ efi_call_phys(f, a1, a2, a3, a4, a5)
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call_phys(f, a1, a2, a3, a4, a5, a6)
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+
+#define efi_call_virt(f, args...) \
+ ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
+
+#define efi_call_virt0(f) efi_call_virt(f)
+#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
+#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
+#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
+#define efi_call_virt4(f, a1, a2, a3, a4) \
+ efi_call_virt(f, a1, a2, a3, a4)
+#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
+ efi_call_virt(f, a1, a2, a3, a4, a5)
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call_virt(f, a1, a2, a3, a4, a5, a6)
+
+#define efi_ioremap(addr, size) ioremap_cache(addr, size)
+
+#else /* !CONFIG_X86_32 */
+
+#define MAX_EFI_IO_PAGES 100
+
+extern u64 efi_call0(void *fp);
+extern u64 efi_call1(void *fp, u64 arg1);
+extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
+extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
+extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
+extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
+ u64 arg4, u64 arg5);
+extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
+ u64 arg4, u64 arg5, u64 arg6);
+
+#define efi_call_phys0(f) \
+ efi_call0((void *)(f))
+#define efi_call_phys1(f, a1) \
+ efi_call1((void *)(f), (u64)(a1))
+#define efi_call_phys2(f, a1, a2) \
+ efi_call2((void *)(f), (u64)(a1), (u64)(a2))
+#define efi_call_phys3(f, a1, a2, a3) \
+ efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_phys4(f, a1, a2, a3, a4) \
+ efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4))
+#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
+ efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4), (u64)(a5))
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
+ (u64)(a4), (u64)(a5), (u64)(a6))
+
+#define efi_call_virt0(f) \
+ efi_call0((void *)(efi.systab->runtime->f))
+#define efi_call_virt1(f, a1) \
+ efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
+#define efi_call_virt2(f, a1, a2) \
+ efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3) \
+ efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4) \
+ efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
+ efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
+ efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+
+extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
+
+#endif /* CONFIG_X86_32 */
+
+extern void efi_reserve_early(void);
+extern void efi_call_phys_prelog(void);
+extern void efi_call_phys_epilog(void);
+
+#ifndef CONFIG_EFI
+/*
+ * IF EFI is not configured, have the EFI calls return -ENOSYS.
+ */
+#define efi_call0(_f) (-ENOSYS)
+#define efi_call1(_f, _a1) (-ENOSYS)
+#define efi_call2(_f, _a1, _a2) (-ENOSYS)
+#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
+#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
+#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
+#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
+#endif /* CONFIG_EFI */
+
+#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
new file mode 100644
index 000000000000..f51a3ddde01a
--- /dev/null
+++ b/arch/x86/include/asm/elf.h
@@ -0,0 +1,336 @@
+#ifndef _ASM_X86_ELF_H
+#define _ASM_X86_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+#include <asm/user.h>
+#include <asm/auxvec.h>
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_i387_struct elf_fpregset_t;
+
+#ifdef __i386__
+
+typedef struct user_fxsr_struct elf_fpxregset_t;
+
+#define R_386_NONE 0
+#define R_386_32 1
+#define R_386_PC32 2
+#define R_386_GOT32 3
+#define R_386_PLT32 4
+#define R_386_COPY 5
+#define R_386_GLOB_DAT 6
+#define R_386_JMP_SLOT 7
+#define R_386_RELATIVE 8
+#define R_386_GOTOFF 9
+#define R_386_GOTPC 10
+#define R_386_NUM 11
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS32
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_386
+
+#else
+
+/* x86-64 relocation types */
+#define R_X86_64_NONE 0 /* No reloc */
+#define R_X86_64_64 1 /* Direct 64 bit */
+#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
+#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
+#define R_X86_64_PLT32 4 /* 32 bit PLT address */
+#define R_X86_64_COPY 5 /* Copy symbol at runtime */
+#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
+#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
+#define R_X86_64_RELATIVE 8 /* Adjust by program base */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
+ offset to GOT */
+#define R_X86_64_32 10 /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
+#define R_X86_64_16 12 /* Direct 16 bit zero extended */
+#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
+#define R_X86_64_8 14 /* Direct 8 bit sign extended */
+#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
+
+#define R_X86_64_NUM 16
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS64
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_X86_64
+
+#endif
+
+#include <asm/vdso.h>
+
+extern unsigned int vdso_enabled;
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch_ia32(x) \
+ (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_X86_32
+#include <asm/desc.h>
+
+#define elf_check_arch(x) elf_check_arch_ia32(x)
+
+/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
+ contains a pointer to a function which might be registered using `atexit'.
+ This provides a mean for the dynamic linker to call DT_FINI functions for
+ shared libraries that have been loaded before the code runs.
+
+ A value of 0 tells we have no such handler.
+
+ We might as well make sure everything else is cleared too (except for %esp),
+ just to make things more deterministic.
+ */
+#define ELF_PLAT_INIT(_r, load_addr) \
+ do { \
+ _r->bx = 0; _r->cx = 0; _r->dx = 0; \
+ _r->si = 0; _r->di = 0; _r->bp = 0; \
+ _r->ax = 0; \
+} while (0)
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different)
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ pr_reg[0] = regs->bx; \
+ pr_reg[1] = regs->cx; \
+ pr_reg[2] = regs->dx; \
+ pr_reg[3] = regs->si; \
+ pr_reg[4] = regs->di; \
+ pr_reg[5] = regs->bp; \
+ pr_reg[6] = regs->ax; \
+ pr_reg[7] = regs->ds & 0xffff; \
+ pr_reg[8] = regs->es & 0xffff; \
+ pr_reg[9] = regs->fs & 0xffff; \
+ savesegment(gs, pr_reg[10]); \
+ pr_reg[11] = regs->orig_ax; \
+ pr_reg[12] = regs->ip; \
+ pr_reg[13] = regs->cs & 0xffff; \
+ pr_reg[14] = regs->flags; \
+ pr_reg[15] = regs->sp; \
+ pr_reg[16] = regs->ss & 0xffff; \
+} while (0);
+
+#define ELF_PLATFORM (utsname()->machine)
+#define set_personality_64bit() do { } while (0)
+
+#else /* CONFIG_X86_32 */
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) \
+ ((x)->e_machine == EM_X86_64)
+
+#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
+
+static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
+{
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
+ load_gs_index(0);
+ regs->ip = ip;
+ regs->sp = sp;
+ regs->flags = X86_EFLAGS_IF;
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+}
+
+static inline void elf_common_init(struct thread_struct *t,
+ struct pt_regs *regs, const u16 ds)
+{
+ regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ regs->si = regs->di = regs->bp = 0;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
+ regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+ t->fs = t->gs = 0;
+ t->fsindex = t->gsindex = 0;
+ t->ds = t->es = ds;
+}
+
+#define ELF_PLAT_INIT(_r, load_addr) \
+do { \
+ elf_common_init(&current->thread, _r, 0); \
+ clear_thread_flag(TIF_IA32); \
+} while (0)
+
+#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
+ elf_common_init(&current->thread, regs, __USER_DS)
+
+#define compat_start_thread(regs, ip, sp) \
+do { \
+ start_ia32_thread(regs, ip, sp); \
+ set_fs(USER_DS); \
+} while (0)
+
+#define COMPAT_SET_PERSONALITY(ex) \
+do { \
+ if (test_thread_flag(TIF_IA32)) \
+ clear_thread_flag(TIF_ABI_PENDING); \
+ else \
+ set_thread_flag(TIF_ABI_PENDING); \
+ current->personality |= force_personality32; \
+} while (0)
+
+#define COMPAT_ELF_PLATFORM ("i686")
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different). Assumes current is the process
+ * getting dumped.
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ unsigned v; \
+ (pr_reg)[0] = (regs)->r15; \
+ (pr_reg)[1] = (regs)->r14; \
+ (pr_reg)[2] = (regs)->r13; \
+ (pr_reg)[3] = (regs)->r12; \
+ (pr_reg)[4] = (regs)->bp; \
+ (pr_reg)[5] = (regs)->bx; \
+ (pr_reg)[6] = (regs)->r11; \
+ (pr_reg)[7] = (regs)->r10; \
+ (pr_reg)[8] = (regs)->r9; \
+ (pr_reg)[9] = (regs)->r8; \
+ (pr_reg)[10] = (regs)->ax; \
+ (pr_reg)[11] = (regs)->cx; \
+ (pr_reg)[12] = (regs)->dx; \
+ (pr_reg)[13] = (regs)->si; \
+ (pr_reg)[14] = (regs)->di; \
+ (pr_reg)[15] = (regs)->orig_ax; \
+ (pr_reg)[16] = (regs)->ip; \
+ (pr_reg)[17] = (regs)->cs; \
+ (pr_reg)[18] = (regs)->flags; \
+ (pr_reg)[19] = (regs)->sp; \
+ (pr_reg)[20] = (regs)->ss; \
+ (pr_reg)[21] = current->thread.fs; \
+ (pr_reg)[22] = current->thread.gs; \
+ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
+ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
+} while (0);
+
+/* I'm not sure if we can use '-' here */
+#define ELF_PLATFORM ("x86_64")
+extern void set_personality_64bit(void);
+extern unsigned int sysctl_vsyscall32;
+extern int force_personality32;
+
+#endif /* !CONFIG_X86_32 */
+
+#define CORE_DUMP_USE_REGSET
+#define USE_ELF_CORE_DUMP
+#define ELF_EXEC_PAGESIZE 4096
+
+/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
+ use of this is to invoke "./ld.so someprog" to test out a new version of
+ the loader. We need to make sure that it is out of the way of the program
+ that it will "exec", and that there is sufficient room for the brk. */
+
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
+
+/* This yields a mask that user programs can use to figure out what
+ instruction set this CPU supports. This could be done in user space,
+ but it's not easy, and we've already done it here. */
+
+#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+
+/* This yields a string that ld.so will use to load implementation
+ specific libraries for optimization. This is more specific in
+ intent than poking at uname or /proc/cpuinfo.
+
+ For the moment, we have only optimizations for the Intel generations,
+ but that could change... */
+
+#define SET_PERSONALITY(ex) set_personality_64bit()
+
+/*
+ * An executable for which elf_read_implies_exec() returns TRUE will
+ * have the READ_IMPLIES_EXEC personality flag set automatically.
+ */
+#define elf_read_implies_exec(ex, executable_stack) \
+ (executable_stack != EXSTACK_DISABLE_X)
+
+struct task_struct;
+
+#define ARCH_DLINFO_IA32(vdso_enabled) \
+do { \
+ if (vdso_enabled) { \
+ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
+ } \
+} while (0)
+
+#ifdef CONFIG_X86_32
+
+#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
+
+#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
+
+/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
+
+#else /* CONFIG_X86_32 */
+
+#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
+
+/* 1GB for 64bit, 8MB for 32bit */
+#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
+
+#define ARCH_DLINFO \
+do { \
+ if (vdso_enabled) \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long)current->mm->context.vdso); \
+} while (0)
+
+#define AT_SYSINFO 32
+
+#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
+
+#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+
+#endif /* !CONFIG_X86_32 */
+
+#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
+
+#define VDSO_ENTRY \
+ ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
+
+struct linux_binprm;
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+
+extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+#define compat_arch_setup_additional_pages syscall32_setup_pages
+
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
+#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
new file mode 100644
index 000000000000..cc70c1c78ca4
--- /dev/null
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_EMERGENCY_RESTART_H
+#define _ASM_X86_EMERGENCY_RESTART_H
+
+enum reboot_type {
+ BOOT_TRIPLE = 't',
+ BOOT_KBD = 'k',
+#ifdef CONFIG_X86_32
+ BOOT_BIOS = 'b',
+#endif
+ BOOT_ACPI = 'a',
+ BOOT_EFI = 'e',
+ BOOT_CF9 = 'p',
+ BOOT_CF9_COND = 'q',
+};
+
+extern enum reboot_type reboot_type;
+
+extern void machine_emergency_restart(void);
+
+#endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h
new file mode 100644
index 000000000000..4c82b503d92f
--- /dev/null
+++ b/arch/x86/include/asm/errno.h
@@ -0,0 +1 @@
+#include <asm-generic/errno.h>
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
new file mode 100644
index 000000000000..e24ef876915f
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -0,0 +1,220 @@
+#ifndef __ASM_ES7000_APIC_H
+#define __ASM_ES7000_APIC_H
+
+#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
+#define esr_disable (1)
+
+static inline int apic_id_registered(void)
+{
+ return (1);
+}
+
+static inline cpumask_t target_cpus_cluster(void)
+{
+ return CPU_MASK_ALL;
+}
+
+static inline cpumask_t target_cpus(void)
+{
+ return cpumask_of_cpu(smp_processor_id());
+}
+
+#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
+#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
+#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
+#define NO_BALANCE_IRQ_CLUSTER (1)
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE (dest_Fixed)
+#define INT_DEST_MODE (0) /* phys delivery to target procs */
+#define NO_BALANCE_IRQ (0)
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL 0x0
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+#define apicid_cluster(apicid) (apicid & 0xF0)
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long id;
+ id = xapic_phys_to_log_apicid(cpu);
+ return (SET_APIC_LOGICAL_ID(id));
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LdR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr_cluster(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+extern int apic_version [MAX_APICS];
+static inline void setup_apic_routing(void)
+{
+ int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
+ printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+ (apic_version[apic] == 0x14) ?
+ "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return 0;
+}
+
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (!mps_cpu)
+ return boot_cpu_physical_apicid;
+ else if (mps_cpu < NR_CPUS)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ static int id = 0;
+ physid_mask_t mask;
+ mask = physid_mask_of_physid(id);
+ ++id;
+ return mask;
+}
+
+extern u8 cpu_2_logical_apicid[];
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0xff);
+}
+
+
+static inline void setup_portio_remap(void)
+{
+}
+
+extern unsigned int boot_cpu_physical_apicid;
+static inline int check_phys_apicid_present(int cpu_physical_apicid)
+{
+ boot_cpu_physical_apicid = read_apic_id();
+ return (1);
+}
+
+static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
+{
+ int num_bits_set;
+ int cpus_found = 0;
+ int cpu;
+ int apicid;
+
+ num_bits_set = cpus_weight(cpumask);
+ /* Return id to all */
+ if (num_bits_set == NR_CPUS)
+ return 0xFF;
+ /*
+ * The cpus in the mask must all be on the apic cluster. If are not
+ * on the same apicid cluster return default value of TARGET_CPUS.
+ */
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, cpumask)) {
+ int new_apicid = cpu_to_logical_apicid(cpu);
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)){
+ printk ("%s: Not a valid mask!\n", __func__);
+ return 0xFF;
+ }
+ apicid = new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int num_bits_set;
+ int cpus_found = 0;
+ int cpu;
+ int apicid;
+
+ num_bits_set = cpus_weight(cpumask);
+ /* Return id to all */
+ if (num_bits_set == NR_CPUS)
+ return cpu_to_logical_apicid(0);
+ /*
+ * The cpus in the mask must all be on the apic cluster. If are not
+ * on the same apicid cluster return default value of TARGET_CPUS.
+ */
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, cpumask)) {
+ int new_apicid = cpu_to_logical_apicid(cpu);
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)){
+ printk ("%s: Not a valid mask!\n", __func__);
+ return cpu_to_logical_apicid(0);
+ }
+ apicid = new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_ES7000_APIC_H */
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
new file mode 100644
index 000000000000..8b234a3cb851
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_ES7000_APICDEF_H
+#define __ASM_ES7000_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0xFF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
new file mode 100644
index 000000000000..632a955fcc0a
--- /dev/null
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -0,0 +1,24 @@
+#ifndef __ASM_ES7000_IPI_H
+#define __ASM_ES7000_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
new file mode 100644
index 000000000000..ed5a3caae141
--- /dev/null
+++ b/arch/x86/include/asm/es7000/mpparse.h
@@ -0,0 +1,30 @@
+#ifndef __ASM_ES7000_MPPARSE_H
+#define __ASM_ES7000_MPPARSE_H
+
+#include <linux/acpi.h>
+
+extern int parse_unisys_oem (char *oemptr);
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+extern void setup_unisys(void);
+
+#ifndef CONFIG_X86_GENERICARCH
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+#endif
+
+#ifdef CONFIG_ACPI
+
+static inline int es7000_check_dsdt(void)
+{
+ struct acpi_table_header header;
+
+ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
+ !strncmp(header.oem_id, "UNISYS", 6))
+ return 1;
+ return 0;
+}
+#endif
+
+#endif /* __ASM_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
new file mode 100644
index 000000000000..78f0daaee436
--- /dev/null
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -0,0 +1,37 @@
+#ifndef __ASM_ES7000_WAKECPU_H
+#define __ASM_ES7000_WAKECPU_H
+
+#define TRAMPOLINE_PHYS_LOW 0x467
+#define TRAMPOLINE_PHYS_HIGH 0x469
+
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+#ifndef CONFIG_ES7000_CLUSTERED_APIC
+ while (!atomic_read(deassert))
+ cpu_relax();
+#endif
+ return;
+}
+
+/* Nothing to do for most platforms, since cleared by the INIT cycle */
+static inline void smp_callin_clear_local_apic(void)
+{
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+extern void __inquire_remote_apic(int apicid);
+
+static inline void inquire_remote_apic(int apicid)
+{
+ if (apic_verbosity >= APIC_DEBUG)
+ __inquire_remote_apic(apicid);
+}
+
+#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
new file mode 100644
index 000000000000..53018464aea6
--- /dev/null
+++ b/arch/x86/include/asm/fb.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_FB_H
+#define _ASM_X86_FB_H
+
+#include <linux/fb.h>
+#include <linux/fs.h>
+#include <asm/page.h>
+
+static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
+ unsigned long off)
+{
+ if (boot_cpu_data.x86 > 3)
+ pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
+}
+
+#ifdef CONFIG_X86_32
+extern int fb_is_primary_device(struct fb_info *info);
+#else
+static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
+#endif
+
+#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h
new file mode 100644
index 000000000000..46ab12db5739
--- /dev/null
+++ b/arch/x86/include/asm/fcntl.h
@@ -0,0 +1 @@
+#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
new file mode 100644
index 000000000000..23696d44a0af
--- /dev/null
+++ b/arch/x86/include/asm/fixmap.h
@@ -0,0 +1,72 @@
+#ifndef _ASM_X86_FIXMAP_H
+#define _ASM_X86_FIXMAP_H
+
+#ifdef CONFIG_X86_32
+# include "fixmap_32.h"
+#else
+# include "fixmap_64.h"
+#endif
+
+extern int fixmaps_set;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
+void native_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags);
+
+#ifndef CONFIG_PARAVIRT
+static inline void __set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags)
+{
+ native_set_fixmap(idx, phys, flags);
+}
+#endif
+
+#define set_fixmap(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+ __set_fixmap(idx, 0, __pgprot(0))
+
+#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-deference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+ /*
+ * this branch gets completely eliminated after inlining,
+ * except when someone tries to use fixaddr indices in an
+ * illegal way. (such as mixing up address types or using
+ * out-of-range indices).
+ *
+ * If it doesn't get removed, the linker will complain
+ * loudly with a reasonably clear error message..
+ */
+ if (idx >= __end_of_fixed_addresses)
+ __this_fixmap_does_not_exist();
+
+ return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+ BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+ return __virt_to_fix(vaddr);
+}
+#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
new file mode 100644
index 000000000000..c7115c1d7217
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -0,0 +1,119 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_X86_FIXMAP_32_H
+#define _ASM_X86_FIXMAP_32_H
+
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * these 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages. (or larger if used with an increment
+ * highger than 1) use fixmap_set(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+ FIX_HOLE,
+ FIX_VDSO,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+ FIX_CO_CPU, /* Cobalt timer */
+ FIX_CO_APIC, /* Cobalt APIC Redirection Table */
+ FIX_LI_PCIA, /* Lithium PCI Bridge A */
+ FIX_LI_PCIB, /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+ FIX_F00F_IDT, /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+ FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ __end_of_permanent_fixed_addresses,
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ FIX_WP_TEST,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+extern void reserve_top_address(unsigned long reserve);
+
+
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_FIXMAP_32_H */
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
new file mode 100644
index 000000000000..00a30ab9b1a5
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -0,0 +1,83 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ */
+
+#ifndef _ASM_X86_FIXMAP_64_H
+#define _ASM_X86_FIXMAP_64_H
+
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <asm/vsyscall.h>
+#include <asm/efi.h>
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+
+enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VSYSCALL_HPET,
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+ FIX_EFI_IO_MAP_LAST_PAGE,
+ FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
+ + MAX_EFI_IO_PAGES - 1,
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ __end_of_permanent_fixed_addresses,
+#ifdef CONFIG_ACPI
+ FIX_ACPI_BEGIN,
+ FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+ /*
+ * 256 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * We round it up to the next 256 pages boundary so that we
+ * can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 4
+ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+ (__end_of_permanent_fixed_addresses & 255),
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ __end_of_fixed_addresses
+};
+
+#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
+#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
+
+#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
new file mode 100644
index 000000000000..dbe82a5c5eac
--- /dev/null
+++ b/arch/x86/include/asm/floppy.h
@@ -0,0 +1,281 @@
+/*
+ * Architecture specific parts of the Floppy driver
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1995
+ */
+#ifndef _ASM_X86_FLOPPY_H
+#define _ASM_X86_FLOPPY_H
+
+#include <linux/vmalloc.h>
+
+/*
+ * The DMA channel used by the floppy controller cannot access data at
+ * addresses >= 16MB
+ *
+ * Went back to the 1MB limit, as some people had problems with the floppy
+ * driver otherwise. It doesn't matter much for performance anyway, as most
+ * floppy accesses go through the track buffer.
+ */
+#define _CROSS_64KB(a, s, vdma) \
+ (!(vdma) && \
+ ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
+
+#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
+
+
+#define SW fd_routine[use_virtual_dma & 1]
+#define CSW fd_routine[can_use_virtual_dma & 1]
+
+
+#define fd_inb(port) inb_p(port)
+#define fd_outb(value, port) outb_p(value, port)
+
+#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
+#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
+#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
+#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
+#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
+#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA)
+#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size)
+#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io)
+
+#define FLOPPY_CAN_FALLBACK_ON_NODMA
+
+static int virtual_dma_count;
+static int virtual_dma_residue;
+static char *virtual_dma_addr;
+static int virtual_dma_mode;
+static int doing_pdma;
+
+static irqreturn_t floppy_hardint(int irq, void *dev_id)
+{
+ unsigned char st;
+
+#undef TRACE_FLPY_INT
+
+#ifdef TRACE_FLPY_INT
+ static int calls;
+ static int bytes;
+ static int dma_wait;
+#endif
+ if (!doing_pdma)
+ return floppy_interrupt(irq, dev_id);
+
+#ifdef TRACE_FLPY_INT
+ if (!calls)
+ bytes = virtual_dma_count;
+#endif
+
+ {
+ int lcount;
+ char *lptr;
+
+ st = 1;
+ for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
+ lcount; lcount--, lptr++) {
+ st = inb(virtual_dma_port + 4) & 0xa0;
+ if (st != 0xa0)
+ break;
+ if (virtual_dma_mode)
+ outb_p(*lptr, virtual_dma_port + 5);
+ else
+ *lptr = inb_p(virtual_dma_port + 5);
+ }
+ virtual_dma_count = lcount;
+ virtual_dma_addr = lptr;
+ st = inb(virtual_dma_port + 4);
+ }
+
+#ifdef TRACE_FLPY_INT
+ calls++;
+#endif
+ if (st == 0x20)
+ return IRQ_HANDLED;
+ if (!(st & 0x20)) {
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+#ifdef TRACE_FLPY_INT
+ printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+ virtual_dma_count, virtual_dma_residue, calls, bytes,
+ dma_wait);
+ calls = 0;
+ dma_wait = 0;
+#endif
+ doing_pdma = 0;
+ floppy_interrupt(irq, dev_id);
+ return IRQ_HANDLED;
+ }
+#ifdef TRACE_FLPY_INT
+ if (!virtual_dma_count)
+ dma_wait++;
+#endif
+ return IRQ_HANDLED;
+}
+
+static void fd_disable_dma(void)
+{
+ if (!(can_use_virtual_dma & 1))
+ disable_dma(FLOPPY_DMA);
+ doing_pdma = 0;
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+}
+
+static int vdma_request_dma(unsigned int dmanr, const char *device_id)
+{
+ return 0;
+}
+
+static void vdma_nop(unsigned int dummy)
+{
+}
+
+
+static int vdma_get_dma_residue(unsigned int dummy)
+{
+ return virtual_dma_count + virtual_dma_residue;
+}
+
+
+static int fd_request_irq(void)
+{
+ if (can_use_virtual_dma)
+ return request_irq(FLOPPY_IRQ, floppy_hardint,
+ IRQF_DISABLED, "floppy", NULL);
+ else
+ return request_irq(FLOPPY_IRQ, floppy_interrupt,
+ IRQF_DISABLED, "floppy", NULL);
+}
+
+static unsigned long dma_mem_alloc(unsigned long size)
+{
+ return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size));
+}
+
+
+static unsigned long vdma_mem_alloc(unsigned long size)
+{
+ return (unsigned long)vmalloc(size);
+
+}
+
+#define nodma_mem_alloc(size) vdma_mem_alloc(size)
+
+static void _fd_dma_mem_free(unsigned long addr, unsigned long size)
+{
+ if ((unsigned long)addr >= (unsigned long)high_memory)
+ vfree((void *)addr);
+ else
+ free_pages(addr, get_order(size));
+}
+
+#define fd_dma_mem_free(addr, size) _fd_dma_mem_free(addr, size)
+
+static void _fd_chose_dma_mode(char *addr, unsigned long size)
+{
+ if (can_use_virtual_dma == 2) {
+ if ((unsigned long)addr >= (unsigned long)high_memory ||
+ isa_virt_to_bus(addr) >= 0x1000000 ||
+ _CROSS_64KB(addr, size, 0))
+ use_virtual_dma = 1;
+ else
+ use_virtual_dma = 0;
+ } else {
+ use_virtual_dma = can_use_virtual_dma & 1;
+ }
+}
+
+#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size)
+
+
+static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+ doing_pdma = 1;
+ virtual_dma_port = io;
+ virtual_dma_mode = (mode == DMA_MODE_WRITE);
+ virtual_dma_addr = addr;
+ virtual_dma_count = size;
+ virtual_dma_residue = 0;
+ return 0;
+}
+
+static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+#ifdef FLOPPY_SANITY_CHECK
+ if (CROSS_64KB(addr, size)) {
+ printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
+ return -1;
+ }
+#endif
+ /* actual, physical DMA */
+ doing_pdma = 0;
+ clear_dma_ff(FLOPPY_DMA);
+ set_dma_mode(FLOPPY_DMA, mode);
+ set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr));
+ set_dma_count(FLOPPY_DMA, size);
+ enable_dma(FLOPPY_DMA);
+ return 0;
+}
+
+static struct fd_routine_l {
+ int (*_request_dma)(unsigned int dmanr, const char *device_id);
+ void (*_free_dma)(unsigned int dmanr);
+ int (*_get_dma_residue)(unsigned int dummy);
+ unsigned long (*_dma_mem_alloc)(unsigned long size);
+ int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
+} fd_routine[] = {
+ {
+ request_dma,
+ free_dma,
+ get_dma_residue,
+ dma_mem_alloc,
+ hard_dma_setup
+ },
+ {
+ vdma_request_dma,
+ vdma_nop,
+ vdma_get_dma_residue,
+ vdma_mem_alloc,
+ vdma_dma_setup
+ }
+};
+
+
+static int FDC1 = 0x3f0;
+static int FDC2 = -1;
+
+/*
+ * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
+ * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
+ * coincides with another rtc CMOS user. Paul G.
+ */
+#define FLOPPY0_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = (CMOS_READ(0x10) >> 4) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define FLOPPY1_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = CMOS_READ(0x10) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define N_FDC 2
+#define N_DRIVE 8
+
+#define EXTRA_FLOPPY_PARAMS
+
+#endif /* _ASM_X86_FLOPPY_H */
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
new file mode 100644
index 000000000000..06850a7194e1
--- /dev/null
+++ b/arch/x86/include/asm/frame.h
@@ -0,0 +1,27 @@
+#ifdef __ASSEMBLY__
+
+#include <asm/dwarf2.h>
+
+/* The annotation hides the frame from the unwinder and makes it look
+ like a ordinary ebp save/restore. This avoids some special cases for
+ frame pointer later */
+#ifdef CONFIG_FRAME_POINTER
+ .macro FRAME
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp,0
+ movl %esp,%ebp
+ .endm
+ .macro ENDFRAME
+ popl %ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebp
+ .endm
+#else
+ .macro FRAME
+ .endm
+ .macro ENDFRAME
+ .endm
+#endif
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
new file mode 100644
index 000000000000..b55b4a7fbefd
--- /dev/null
+++ b/arch/x86/include/asm/ftrace.h
@@ -0,0 +1,83 @@
+#ifndef _ASM_X86_FTRACE_H
+#define _ASM_X86_FTRACE_H
+
+#ifdef __ASSEMBLY__
+
+ .macro MCOUNT_SAVE_FRAME
+ /* taken from glibc */
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+ .endm
+
+ .macro MCOUNT_RESTORE_FRAME
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+ .endm
+
+#endif
+
+#ifdef CONFIG_FUNCTION_TRACER
+#define MCOUNT_ADDR ((long)(mcount))
+#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ /*
+ * call mcount is "e8 <4 byte offset>"
+ * The addr points to the 4 byte offset and the caller of this
+ * function wants the pointer to e8. Simply subtract one.
+ */
+ return addr - 1;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+ /* No extra data needed for x86 */
+};
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+ unsigned long ret;
+ unsigned long func;
+ unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relays on ftrace_return_to_handler.
+ * Defined in entry_32/64.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
new file mode 100644
index 000000000000..1f11ce44e956
--- /dev/null
+++ b/arch/x86/include/asm/futex.h
@@ -0,0 +1,140 @@
+#ifndef _ASM_X86_FUTEX_H
+#define _ASM_X86_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("1:\t" insn "\n" \
+ "2:\t.section .fixup,\"ax\"\n" \
+ "3:\tmov\t%3, %1\n" \
+ "\tjmp\t2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
+ : "i" (-EFAULT), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("1:\tmovl %2, %0\n" \
+ "\tmovl\t%0, %3\n" \
+ "\t" insn "\n" \
+ "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
+ "\tjnz\t1b\n" \
+ "3:\t.section .fixup,\"ax\"\n" \
+ "4:\tmov\t%5, %1\n" \
+ "\tjmp\t3b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=&a" (oldval), "=&r" (ret), \
+ "+m" (*uaddr), "=&r" (tem) \
+ : "r" (oparg), "i" (-EFAULT), "1" (0))
+
+static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+{
+ int op = (encoded_op >> 28) & 7;
+ int cmp = (encoded_op >> 24) & 15;
+ int oparg = (encoded_op << 8) >> 20;
+ int cmparg = (encoded_op << 20) >> 20;
+ int oldval = 0, ret, tem;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+ oparg = 1 << oparg;
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ return -EFAULT;
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+ /* Real i386 machines can only support FUTEX_OP_SET */
+ if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
+ return -ENOSYS;
+#endif
+
+ pagefault_disable();
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ADD:
+ __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+ uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+ __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+ __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+ __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ pagefault_enable();
+
+ if (!ret) {
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ ret = (oldval == cmparg);
+ break;
+ case FUTEX_OP_CMP_NE:
+ ret = (oldval != cmparg);
+ break;
+ case FUTEX_OP_CMP_LT:
+ ret = (oldval < cmparg);
+ break;
+ case FUTEX_OP_CMP_GE:
+ ret = (oldval >= cmparg);
+ break;
+ case FUTEX_OP_CMP_LE:
+ ret = (oldval <= cmparg);
+ break;
+ case FUTEX_OP_CMP_GT:
+ ret = (oldval > cmparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ }
+ return ret;
+}
+
+static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
+ int newval)
+{
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+ /* Real i386 machines have no cmpxchg instruction */
+ if (boot_cpu_data.x86 == 3)
+ return -ENOSYS;
+#endif
+
+ if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+ return -EFAULT;
+
+ asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
+ "2:\t.section .fixup, \"ax\"\n"
+ "3:\tmov %2, %0\n"
+ "\tjmp 2b\n"
+ "\t.previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : "=a" (oldval), "+m" (*uaddr)
+ : "i" (-EFAULT), "r" (newval), "0" (oldval)
+ : "memory"
+ );
+
+ return oldval;
+}
+
+#endif
+#endif /* _ASM_X86_FUTEX_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
new file mode 100644
index 000000000000..6cfdafa409d8
--- /dev/null
+++ b/arch/x86/include/asm/gart.h
@@ -0,0 +1,106 @@
+#ifndef _ASM_X86_GART_H
+#define _ASM_X86_GART_H
+
+#include <asm/e820.h>
+
+extern void set_up_gart_resume(u32, u32);
+
+extern int fallback_aper_order;
+extern int fallback_aper_force;
+extern int fix_aperture;
+
+/* PTE bits. */
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+
+/* Aperture control register bits. */
+#define GARTEN (1<<0)
+#define DISGARTCPU (1<<4)
+#define DISGARTIO (1<<5)
+
+/* GART cache control register bits. */
+#define INVGART (1<<0)
+#define GARTPTEERR (1<<1)
+
+/* K8 On-cpu GART registers */
+#define AMD64_GARTAPERTURECTL 0x90
+#define AMD64_GARTAPERTUREBASE 0x94
+#define AMD64_GARTTABLEBASE 0x98
+#define AMD64_GARTCACHECTL 0x9c
+#define AMD64_GARTEN (1<<0)
+
+#ifdef CONFIG_GART_IOMMU
+extern int gart_iommu_aperture;
+extern int gart_iommu_aperture_allowed;
+extern int gart_iommu_aperture_disabled;
+
+extern void early_gart_iommu_check(void);
+extern void gart_iommu_init(void);
+extern void gart_iommu_shutdown(void);
+extern void __init gart_parse_options(char *);
+extern void gart_iommu_hole_init(void);
+
+#else
+#define gart_iommu_aperture 0
+#define gart_iommu_aperture_allowed 0
+#define gart_iommu_aperture_disabled 1
+
+static inline void early_gart_iommu_check(void)
+{
+}
+static inline void gart_iommu_init(void)
+{
+}
+static inline void gart_iommu_shutdown(void)
+{
+}
+static inline void gart_parse_options(char *options)
+{
+}
+static inline void gart_iommu_hole_init(void)
+{
+}
+#endif
+
+extern int agp_amd64_init(void);
+
+static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
+{
+ u32 tmp, ctl;
+
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
+static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
+{
+ if (!aper_base)
+ return 0;
+
+ if (aper_base + aper_size > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
+ return 0;
+ }
+ if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
+ return 0;
+ }
+ if (aper_size < min_size) {
+ printk(KERN_INFO "Aperture too small (%d MB) than (%d MB)\n",
+ aper_size>>20, min_size>>20);
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* _ASM_X86_GART_H */
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 000000000000..d48bee663a6f
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "genapic_32.h"
+#else
+# include "genapic_64.h"
+#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
new file mode 100644
index 000000000000..0ac17d33a8c7
--- /dev/null
+++ b/arch/x86/include/asm/genapic_32.h
@@ -0,0 +1,143 @@
+#ifndef _ASM_X86_GENAPIC_32_H
+#define _ASM_X86_GENAPIC_32_H
+
+#include <asm/mpspec.h>
+#include <asm/atomic.h>
+
+/*
+ * Generic APIC driver interface.
+ *
+ * An straight forward mapping of the APIC related parts of the
+ * x86 subarchitecture interface to a dynamic object.
+ *
+ * This is used by the "generic" x86 subarchitecture.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ */
+
+struct mpc_config_bus;
+struct mp_config_table;
+struct mpc_config_processor;
+
+struct genapic {
+ char *name;
+ int (*probe)(void);
+
+ int (*apic_id_registered)(void);
+ cpumask_t (*target_cpus)(void);
+ int int_delivery_mode;
+ int int_dest_mode;
+ int ESR_DISABLE;
+ int apic_destination_logical;
+ unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
+ unsigned long (*check_apicid_present)(int apicid);
+ int no_balance_irq;
+ int no_ioapic_check;
+ void (*init_apic_ldr)(void);
+ physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
+
+ void (*setup_apic_routing)(void);
+ int (*multi_timer_check)(int apic, int irq);
+ int (*apicid_to_node)(int logical_apicid);
+ int (*cpu_to_logical_apicid)(int cpu);
+ int (*cpu_present_to_apicid)(int mps_cpu);
+ physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
+ void (*setup_portio_remap)(void);
+ int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
+ void (*enable_apic_mode)(void);
+ u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
+
+ /* mpparse */
+ /* When one of the next two hooks returns 1 the genapic
+ is switched to this. Essentially they are additional probe
+ functions. */
+ int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
+ char *productid);
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+
+ unsigned (*get_apic_id)(unsigned long x);
+ unsigned long apic_id_mask;
+ unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+ cpumask_t (*vector_allocation_domain)(int cpu);
+
+#ifdef CONFIG_SMP
+ /* ipi */
+ void (*send_IPI_mask)(cpumask_t mask, int vector);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+#endif
+ int (*wakeup_cpu)(int apicid, unsigned long start_eip);
+ int trampoline_phys_low;
+ int trampoline_phys_high;
+ void (*wait_for_init_deassert)(atomic_t *deassert);
+ void (*smp_callin_clear_local_apic)(void);
+ void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
+ void (*restore_NMI_vector)(unsigned short *high, unsigned short *low);
+ void (*inquire_remote_apic)(int apicid);
+};
+
+#define APICFUNC(x) .x = x,
+
+/* More functions could be probably marked IPIFUNC and save some space
+ in UP GENERICARCH kernels, but I don't have the nerve right now
+ to untangle this mess. -AK */
+#ifdef CONFIG_SMP
+#define IPIFUNC(x) APICFUNC(x)
+#else
+#define IPIFUNC(x)
+#endif
+
+#define APIC_INIT(aname, aprobe) \
+{ \
+ .name = aname, \
+ .probe = aprobe, \
+ .int_delivery_mode = INT_DELIVERY_MODE, \
+ .int_dest_mode = INT_DEST_MODE, \
+ .no_balance_irq = NO_BALANCE_IRQ, \
+ .ESR_DISABLE = esr_disable, \
+ .apic_destination_logical = APIC_DEST_LOGICAL, \
+ APICFUNC(apic_id_registered) \
+ APICFUNC(target_cpus) \
+ APICFUNC(check_apicid_used) \
+ APICFUNC(check_apicid_present) \
+ APICFUNC(init_apic_ldr) \
+ APICFUNC(ioapic_phys_id_map) \
+ APICFUNC(setup_apic_routing) \
+ APICFUNC(multi_timer_check) \
+ APICFUNC(apicid_to_node) \
+ APICFUNC(cpu_to_logical_apicid) \
+ APICFUNC(cpu_present_to_apicid) \
+ APICFUNC(apicid_to_cpu_present) \
+ APICFUNC(setup_portio_remap) \
+ APICFUNC(check_phys_apicid_present) \
+ APICFUNC(mps_oem_check) \
+ APICFUNC(get_apic_id) \
+ .apic_id_mask = APIC_ID_MASK, \
+ APICFUNC(cpu_mask_to_apicid) \
+ APICFUNC(vector_allocation_domain) \
+ APICFUNC(acpi_madt_oem_check) \
+ IPIFUNC(send_IPI_mask) \
+ IPIFUNC(send_IPI_allbutself) \
+ IPIFUNC(send_IPI_all) \
+ APICFUNC(enable_apic_mode) \
+ APICFUNC(phys_pkg_id) \
+ .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \
+ .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \
+ APICFUNC(wait_for_init_deassert) \
+ APICFUNC(smp_callin_clear_local_apic) \
+ APICFUNC(store_NMI_vector) \
+ APICFUNC(restore_NMI_vector) \
+ APICFUNC(inquire_remote_apic) \
+}
+
+extern struct genapic *genapic;
+extern void es7000_update_genapic_to_cluster(void);
+
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+#define get_uv_system_type() UV_NONE
+#define is_uv_system() 0
+#define uv_wakeup_secondary(a, b) 1
+#define uv_system_init() do {} while (0)
+
+
+#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
new file mode 100644
index 000000000000..2cae011668b7
--- /dev/null
+++ b/arch/x86/include/asm/genapic_64.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_GENAPIC_64_H
+#define _ASM_X86_GENAPIC_64_H
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch data struct.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+
+struct genapic {
+ char *name;
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+ u32 int_delivery_mode;
+ u32 int_dest_mode;
+ int (*apic_id_registered)(void);
+ cpumask_t (*target_cpus)(void);
+ cpumask_t (*vector_allocation_domain)(int cpu);
+ void (*init_apic_ldr)(void);
+ /* ipi */
+ void (*send_IPI_mask)(cpumask_t mask, int vector);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+ /* */
+ unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+ unsigned int (*phys_pkg_id)(int index_msb);
+ unsigned int (*get_apic_id)(unsigned long x);
+ unsigned long (*set_apic_id)(unsigned int id);
+ unsigned long apic_id_mask;
+ /* wakeup_secondary_cpu */
+ int (*wakeup_cpu)(int apicid, unsigned long start_eip);
+};
+
+extern struct genapic *genapic;
+
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2apic_cluster;
+extern struct genapic apic_x2apic_phys;
+extern int acpi_madt_oem_check(char *, char *);
+
+extern void apic_send_IPI_self(int vector);
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+extern enum uv_system_type get_uv_system_type(void);
+extern int is_uv_system(void);
+
+extern struct genapic apic_x2apic_uv_x;
+DECLARE_PER_CPU(int, x2apic_extra_bits);
+extern void uv_cpu_init(void);
+extern void uv_system_init(void);
+extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
+
+extern void setup_apic_routing(void);
+
+#endif /* _ASM_X86_GENAPIC_64_H */
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
new file mode 100644
index 000000000000..ad3c2ed75481
--- /dev/null
+++ b/arch/x86/include/asm/geode.h
@@ -0,0 +1,253 @@
+/*
+ * AMD Geode definitions
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_GEODE_H
+#define _ASM_X86_GEODE_H
+
+#include <asm/processor.h>
+#include <linux/io.h>
+
+/* Generic southbridge functions */
+
+#define GEODE_DEV_PMS 0
+#define GEODE_DEV_ACPI 1
+#define GEODE_DEV_GPIO 2
+#define GEODE_DEV_MFGPT 3
+
+extern int geode_get_dev_base(unsigned int dev);
+
+/* Useful macros */
+#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
+#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
+#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
+#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
+
+/* MSRS */
+
+#define MSR_GLIU_P2D_RO0 0x10000029
+
+#define MSR_LX_GLD_MSR_CONFIG 0x48002001
+#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
+ * sheet has the wrong value */
+#define MSR_GLCP_SYS_RSTPLL 0x4C000014
+#define MSR_GLCP_DOTPLL 0x4C000015
+
+#define MSR_LBAR_SMB 0x5140000B
+#define MSR_LBAR_GPIO 0x5140000C
+#define MSR_LBAR_MFGPT 0x5140000D
+#define MSR_LBAR_ACPI 0x5140000E
+#define MSR_LBAR_PMS 0x5140000F
+
+#define MSR_DIVIL_SOFT_RESET 0x51400017
+
+#define MSR_PIC_YSEL_LOW 0x51400020
+#define MSR_PIC_YSEL_HIGH 0x51400021
+#define MSR_PIC_ZSEL_LOW 0x51400022
+#define MSR_PIC_ZSEL_HIGH 0x51400023
+#define MSR_PIC_IRQM_LPC 0x51400025
+
+#define MSR_MFGPT_IRQ 0x51400028
+#define MSR_MFGPT_NR 0x51400029
+#define MSR_MFGPT_SETUP 0x5140002B
+
+#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
+
+#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
+#define MSR_GX_MSR_PADSEL 0xC0002011
+
+/* Resource Sizes */
+
+#define LBAR_GPIO_SIZE 0xFF
+#define LBAR_MFGPT_SIZE 0x40
+#define LBAR_ACPI_SIZE 0x40
+#define LBAR_PMS_SIZE 0x80
+
+/* ACPI registers (PMS block) */
+
+/*
+ * PM1_EN is only valid when VSA is enabled for 16 bit reads.
+ * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
+ * with a 32 bit read at offset 0x0
+ */
+
+#define PM1_STS 0x00
+#define PM1_EN 0x02
+#define PM1_CNT 0x08
+#define PM2_CNT 0x0C
+#define PM_TMR 0x10
+#define PM_GPE0_STS 0x18
+#define PM_GPE0_EN 0x1C
+
+/* PMC registers (PMS block) */
+
+#define PM_SSD 0x00
+#define PM_SCXA 0x04
+#define PM_SCYA 0x08
+#define PM_OUT_SLPCTL 0x0C
+#define PM_SCLK 0x10
+#define PM_SED 0x1
+#define PM_SCXD 0x18
+#define PM_SCYD 0x1C
+#define PM_IN_SLPCTL 0x20
+#define PM_WKD 0x30
+#define PM_WKXD 0x34
+#define PM_RD 0x38
+#define PM_WKXA 0x3C
+#define PM_FSD 0x40
+#define PM_TSD 0x44
+#define PM_PSD 0x48
+#define PM_NWKD 0x4C
+#define PM_AWKD 0x50
+#define PM_SSC 0x54
+
+/* VSA2 magic values */
+
+#define VSA_VRC_INDEX 0xAC1C
+#define VSA_VRC_DATA 0xAC1E
+#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
+#define VSA_VR_SIGNATURE 0x0003
+#define VSA_VR_MEM_SIZE 0x0200
+#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
+#define GSW_VSA_SIG 0x534d /* General Software signature */
+/* GPIO */
+
+#define GPIO_OUTPUT_VAL 0x00
+#define GPIO_OUTPUT_ENABLE 0x04
+#define GPIO_OUTPUT_OPEN_DRAIN 0x08
+#define GPIO_OUTPUT_INVERT 0x0C
+#define GPIO_OUTPUT_AUX1 0x10
+#define GPIO_OUTPUT_AUX2 0x14
+#define GPIO_PULL_UP 0x18
+#define GPIO_PULL_DOWN 0x1C
+#define GPIO_INPUT_ENABLE 0x20
+#define GPIO_INPUT_INVERT 0x24
+#define GPIO_INPUT_FILTER 0x28
+#define GPIO_INPUT_EVENT_COUNT 0x2C
+#define GPIO_READ_BACK 0x30
+#define GPIO_INPUT_AUX1 0x34
+#define GPIO_EVENTS_ENABLE 0x38
+#define GPIO_LOCK_ENABLE 0x3C
+#define GPIO_POSITIVE_EDGE_EN 0x40
+#define GPIO_NEGATIVE_EDGE_EN 0x44
+#define GPIO_POSITIVE_EDGE_STS 0x48
+#define GPIO_NEGATIVE_EDGE_STS 0x4C
+
+#define GPIO_MAP_X 0xE0
+#define GPIO_MAP_Y 0xE4
+#define GPIO_MAP_Z 0xE8
+#define GPIO_MAP_W 0xEC
+
+static inline u32 geode_gpio(unsigned int nr)
+{
+ BUG_ON(nr > 28);
+ return 1 << nr;
+}
+
+extern void geode_gpio_set(u32, unsigned int);
+extern void geode_gpio_clear(u32, unsigned int);
+extern int geode_gpio_isset(u32, unsigned int);
+extern void geode_gpio_setup_event(unsigned int, int, int);
+extern void geode_gpio_set_irq(unsigned int, unsigned int);
+
+static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
+{
+ geode_gpio_setup_event(gpio, pair, 0);
+}
+
+static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
+{
+ geode_gpio_setup_event(gpio, pair, 1);
+}
+
+/* Specific geode tests */
+
+static inline int is_geode_gx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 5));
+}
+
+static inline int is_geode_lx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 10));
+}
+
+static inline int is_geode(void)
+{
+ return (is_geode_gx() || is_geode_lx());
+}
+
+#ifdef CONFIG_MGEODE_LX
+extern int geode_has_vsa2(void);
+#else
+static inline int geode_has_vsa2(void)
+{
+ return 0;
+}
+#endif
+
+/* MFGPTs */
+
+#define MFGPT_MAX_TIMERS 8
+#define MFGPT_TIMER_ANY (-1)
+
+#define MFGPT_DOMAIN_WORKING 1
+#define MFGPT_DOMAIN_STANDBY 2
+#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
+
+#define MFGPT_CMP1 0
+#define MFGPT_CMP2 1
+
+#define MFGPT_EVENT_IRQ 0
+#define MFGPT_EVENT_NMI 1
+#define MFGPT_EVENT_RESET 3
+
+#define MFGPT_REG_CMP1 0
+#define MFGPT_REG_CMP2 2
+#define MFGPT_REG_COUNTER 4
+#define MFGPT_REG_SETUP 6
+
+#define MFGPT_SETUP_CNTEN (1 << 15)
+#define MFGPT_SETUP_CMP2 (1 << 14)
+#define MFGPT_SETUP_CMP1 (1 << 13)
+#define MFGPT_SETUP_SETUP (1 << 12)
+#define MFGPT_SETUP_STOPEN (1 << 11)
+#define MFGPT_SETUP_EXTEN (1 << 10)
+#define MFGPT_SETUP_REVEN (1 << 5)
+#define MFGPT_SETUP_CLKSEL (1 << 4)
+
+static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
+ outw(value, base + reg + (timer * 8));
+}
+
+static inline u16 geode_mfgpt_read(int timer, u16 reg)
+{
+ u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
+ return inw(base + reg + (timer * 8));
+}
+
+extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
+extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
+extern int geode_mfgpt_alloc_timer(int timer, int domain);
+
+#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
+#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
+
+#ifdef CONFIG_GEODE_MFGPT_TIMER
+extern int __init mfgpt_timer_setup(void);
+#else
+static inline int mfgpt_timer_setup(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
new file mode 100644
index 000000000000..49dbfdfa50f9
--- /dev/null
+++ b/arch/x86/include/asm/gpio.h
@@ -0,0 +1,56 @@
+/*
+ * Generic GPIO API implementation for x86.
+ *
+ * Derived from the generic GPIO API for powerpc:
+ *
+ * Copyright (c) 2007-2008 MontaVista Software, Inc.
+ *
+ * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_GPIO_H
+#define _ASM_X86_GPIO_H
+
+#include <asm-generic/gpio.h>
+
+#ifdef CONFIG_GPIOLIB
+
+/*
+ * Just call gpiolib.
+ */
+static inline int gpio_get_value(unsigned int gpio)
+{
+ return __gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned int gpio, int value)
+{
+ __gpio_set_value(gpio, value);
+}
+
+static inline int gpio_cansleep(unsigned int gpio)
+{
+ return __gpio_cansleep(gpio);
+}
+
+/*
+ * Not implemented, yet.
+ */
+static inline int gpio_to_irq(unsigned int gpio)
+{
+ return -ENOSYS;
+}
+
+static inline int irq_to_gpio(unsigned int irq)
+{
+ return -EINVAL;
+}
+
+#endif /* CONFIG_GPIOLIB */
+
+#endif /* _ASM_X86_GPIO_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
new file mode 100644
index 000000000000..000787df66e6
--- /dev/null
+++ b/arch/x86/include/asm/hardirq.h
@@ -0,0 +1,11 @@
+#ifdef CONFIG_X86_32
+# include "hardirq_32.h"
+#else
+# include "hardirq_64.h"
+#endif
+
+extern u64 arch_irq_stat_cpu(unsigned int cpu);
+#define arch_irq_stat_cpu arch_irq_stat_cpu
+
+extern u64 arch_irq_stat(void);
+#define arch_irq_stat arch_irq_stat
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
new file mode 100644
index 000000000000..cf7954d1405f
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -0,0 +1,30 @@
+#ifndef _ASM_X86_HARDIRQ_32_H
+#define _ASM_X86_HARDIRQ_32_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+typedef struct {
+ unsigned int __softirq_pending;
+ unsigned long idle_timestamp;
+ unsigned int __nmi_count; /* arch dependent */
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq0_irqs;
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+ unsigned int irq_tlb_count;
+ unsigned int irq_thermal_count;
+ unsigned int irq_spurious_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
+
+#define __ARCH_IRQ_STAT
+#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
+
+#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
+
+void ack_bad_irq(unsigned int irq);
+#include <linux/irq_cpustat.h>
+
+#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
new file mode 100644
index 000000000000..b5a6b5d56704
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_HARDIRQ_64_H
+#define _ASM_X86_HARDIRQ_64_H
+
+#include <linux/threads.h>
+#include <linux/irq.h>
+#include <asm/pda.h>
+#include <asm/apic.h>
+
+/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
+#define MAX_HARDIRQS_PER_CPU NR_VECTORS
+
+#define __ARCH_IRQ_STAT 1
+
+#define inc_irq_stat(member) add_pda(member, 1)
+
+#define local_softirq_pending() read_pda(__softirq_pending)
+
+#define __ARCH_SET_SOFTIRQ_PENDING 1
+
+#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
+#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+
+extern void ack_bad_irq(unsigned int irq);
+
+#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
new file mode 100644
index 000000000000..bf9276bea660
--- /dev/null
+++ b/arch/x86/include/asm/highmem.h
@@ -0,0 +1,79 @@
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_X86_HIGHMEM_H
+#define _ASM_X86_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#include <asm/fixmap.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ * fixed_addresses
+ * FIXADDR_START
+ * temp fixed addresses
+ * FIXADDR_BOOT_START
+ * Persistent kmap area
+ * PKMAP_BASE
+ * VMALLOC_END
+ * Vmalloc area
+ * VMALLOC_START
+ * high_memory
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
+void *kmap_atomic(struct page *page, enum km_type type);
+void kunmap_atomic(void *kvaddr, enum km_type type);
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *kmap_atomic_to_page(void *ptr);
+
+#ifndef CONFIG_PARAVIRT
+#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
+#endif
+
+#define flush_cache_kmaps() do { } while (0)
+
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_HIGHMEM_H */
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
new file mode 100644
index 000000000000..1c22cb05ad6a
--- /dev/null
+++ b/arch/x86/include/asm/hpet.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_HPET_H
+#define _ASM_X86_HPET_H
+
+#include <linux/msi.h>
+
+#ifdef CONFIG_HPET_TIMER
+
+#define HPET_MMAP_SIZE 1024
+
+#define HPET_ID 0x000
+#define HPET_PERIOD 0x004
+#define HPET_CFG 0x010
+#define HPET_STATUS 0x020
+#define HPET_COUNTER 0x0f0
+
+#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
+#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
+#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
+
+#define HPET_T0_CFG 0x100
+#define HPET_T0_CMP 0x108
+#define HPET_T0_ROUTE 0x110
+#define HPET_T1_CFG 0x120
+#define HPET_T1_CMP 0x128
+#define HPET_T1_ROUTE 0x130
+#define HPET_T2_CFG 0x140
+#define HPET_T2_CMP 0x148
+#define HPET_T2_ROUTE 0x150
+
+#define HPET_ID_REV 0x000000ff
+#define HPET_ID_NUMBER 0x00001f00
+#define HPET_ID_64BIT 0x00002000
+#define HPET_ID_LEGSUP 0x00008000
+#define HPET_ID_VENDOR 0xffff0000
+#define HPET_ID_NUMBER_SHIFT 8
+#define HPET_ID_VENDOR_SHIFT 16
+
+#define HPET_ID_VENDOR_8086 0x8086
+
+#define HPET_CFG_ENABLE 0x001
+#define HPET_CFG_LEGACY 0x002
+#define HPET_LEGACY_8254 2
+#define HPET_LEGACY_RTC 8
+
+#define HPET_TN_LEVEL 0x0002
+#define HPET_TN_ENABLE 0x0004
+#define HPET_TN_PERIODIC 0x0008
+#define HPET_TN_PERIODIC_CAP 0x0010
+#define HPET_TN_64BIT_CAP 0x0020
+#define HPET_TN_SETVAL 0x0040
+#define HPET_TN_32BIT 0x0100
+#define HPET_TN_ROUTE 0x3e00
+#define HPET_TN_FSB 0x4000
+#define HPET_TN_FSB_CAP 0x8000
+#define HPET_TN_ROUTE_SHIFT 9
+
+/* Max HPET Period is 10^8 femto sec as in HPET spec */
+#define HPET_MAX_PERIOD 100000000UL
+/*
+ * Min HPET period is 10^5 femto sec just for safety. If it is less than this,
+ * then 32 bit HPET counter wrapsaround in less than 0.5 sec.
+ */
+#define HPET_MIN_PERIOD 100000UL
+
+/* hpet memory map physical address */
+extern unsigned long hpet_address;
+extern unsigned long force_hpet_address;
+extern int hpet_force_user;
+extern int is_hpet_enabled(void);
+extern int hpet_enable(void);
+extern void hpet_disable(void);
+extern unsigned long hpet_readl(unsigned long a);
+extern void force_hpet_resume(void);
+
+extern void hpet_msi_unmask(unsigned int irq);
+extern void hpet_msi_mask(unsigned int irq);
+extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
+extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+
+#ifdef CONFIG_PCI_MSI
+extern int arch_setup_hpet_msi(unsigned int irq);
+#else
+static inline int arch_setup_hpet_msi(unsigned int irq)
+{
+ return -EINVAL;
+}
+#endif
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+#include <linux/interrupt.h>
+
+typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
+extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec);
+extern int hpet_set_periodic_freq(unsigned long freq);
+extern int hpet_rtc_dropped_irq(void);
+extern int hpet_rtc_timer_init(void);
+extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+extern int hpet_register_irq_handler(rtc_irq_handler handler);
+extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+
+#endif /* CONFIG_HPET_EMULATE_RTC */
+
+#else /* CONFIG_HPET_TIMER */
+
+static inline int hpet_enable(void) { return 0; }
+static inline int is_hpet_enabled(void) { return 0; }
+#define hpet_readl(a) 0
+
+#endif
+#endif /* _ASM_X86_HPET_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 000000000000..439a9acc132d
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
+#ifndef _ASM_X86_HUGETLB_H
+#define _ASM_X86_HUGETLB_H
+
+#include <asm/page.h>
+
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len) {
+ return 0;
+}
+
+/*
+ * If the arch doesn't supply something else, assume that hugepage
+ * size aligned regions are ok without further preparation.
+ */
+static inline int prepare_hugepage_range(struct file *file,
+ unsigned long addr, unsigned long len)
+{
+ struct hstate *h = hstate_file(file);
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (addr & ~huge_page_mask(h))
+ return -EINVAL;
+ return 0;
+}
+
+static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor,
+ unsigned long ceiling)
+{
+ free_pgd_range(tlb, addr, end, floor, ceiling);
+}
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ return ptep_get_and_clear(mm, addr, ptep);
+}
+
+static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+
+static inline int huge_pte_none(pte_t pte)
+{
+ return pte_none(pte);
+}
+
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_set_wrprotect(mm, addr, ptep);
+}
+
+static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+}
+
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ return *ptep;
+}
+
+static inline int arch_prepare_hugepage(struct page *page)
+{
+ return 0;
+}
+
+static inline void arch_release_hugepage(struct page *page)
+{
+}
+
+#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
new file mode 100644
index 000000000000..8de644b6b959
--- /dev/null
+++ b/arch/x86/include/asm/hw_irq.h
@@ -0,0 +1,129 @@
+#ifndef _ASM_X86_HW_IRQ_H
+#define _ASM_X86_HW_IRQ_H
+
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * moved some of the old arch/i386/kernel/irq.h to here. VY
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ *
+ * hacked by Andi Kleen for x86-64.
+ * unified by tglx
+ */
+
+#include <asm/irq_vectors.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/smp.h>
+
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/sections.h>
+
+#define platform_legacy_irq(irq) ((irq) < 16)
+
+/* Interrupt handlers registered during init_IRQ */
+extern void apic_timer_interrupt(void);
+extern void error_interrupt(void);
+extern void spurious_interrupt(void);
+extern void thermal_interrupt(void);
+extern void reschedule_interrupt(void);
+
+extern void invalidate_interrupt(void);
+extern void invalidate_interrupt0(void);
+extern void invalidate_interrupt1(void);
+extern void invalidate_interrupt2(void);
+extern void invalidate_interrupt3(void);
+extern void invalidate_interrupt4(void);
+extern void invalidate_interrupt5(void);
+extern void invalidate_interrupt6(void);
+extern void invalidate_interrupt7(void);
+
+extern void irq_move_cleanup_interrupt(void);
+extern void threshold_interrupt(void);
+
+extern void call_function_interrupt(void);
+extern void call_function_single_interrupt(void);
+
+/* PIC specific functions */
+extern void disable_8259A_irq(unsigned int irq);
+extern void enable_8259A_irq(unsigned int irq);
+extern int i8259A_irq_pending(unsigned int irq);
+extern void make_8259A_irq(unsigned int irq);
+extern void init_8259A(int aeoi);
+
+/* IOAPIC */
+#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
+extern unsigned long io_apic_irqs;
+
+extern void init_VISWS_APIC_irqs(void);
+extern void setup_IO_APIC(void);
+extern void disable_IO_APIC(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern void setup_ioapic_dest(void);
+
+#ifdef CONFIG_X86_64
+extern void enable_IO_APIC(void);
+#endif
+
+/* IPI functions */
+#ifdef CONFIG_X86_32
+extern void send_IPI_self(int vector);
+#endif
+extern void send_IPI(int dest, int vector);
+
+/* Statistics */
+extern atomic_t irq_err_count;
+extern atomic_t irq_mis_count;
+
+/* EISA */
+extern void eisa_set_level_irq(unsigned int irq);
+
+/* Voyager functions */
+extern asmlinkage void vic_cpi_interrupt(void);
+extern asmlinkage void vic_sys_interrupt(void);
+extern asmlinkage void vic_cmn_interrupt(void);
+extern asmlinkage void qic_timer_interrupt(void);
+extern asmlinkage void qic_invalidate_interrupt(void);
+extern asmlinkage void qic_reschedule_interrupt(void);
+extern asmlinkage void qic_enable_irq_interrupt(void);
+extern asmlinkage void qic_call_function_interrupt(void);
+
+/* SMP */
+extern void smp_apic_timer_interrupt(struct pt_regs *);
+extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_SMP
+extern void smp_reschedule_interrupt(struct pt_regs *);
+extern void smp_call_function_interrupt(struct pt_regs *);
+extern void smp_call_function_single_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_32
+extern void smp_invalidate_interrupt(struct pt_regs *);
+#else
+extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
+#endif
+#endif
+
+extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+
+typedef int vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+
+#ifdef CONFIG_X86_IO_APIC
+extern void lock_vector_lock(void);
+extern void unlock_vector_lock(void);
+extern void __setup_vector_irq(int cpu);
+#else
+static inline void lock_vector_lock(void) {}
+static inline void unlock_vector_lock(void) {}
+static inline void __setup_vector_irq(int cpu) {}
+#endif
+
+#endif /* !ASSEMBLY_ */
+
+#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
new file mode 100644
index 000000000000..334b1a885d9c
--- /dev/null
+++ b/arch/x86/include/asm/hypertransport.h
@@ -0,0 +1,45 @@
+#ifndef _ASM_X86_HYPERTRANSPORT_H
+#define _ASM_X86_HYPERTRANSPORT_H
+
+/*
+ * Constants for x86 Hypertransport Interrupts.
+ */
+
+#define HT_IRQ_LOW_BASE 0xf8000000
+
+#define HT_IRQ_LOW_VECTOR_SHIFT 16
+#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
+#define HT_IRQ_LOW_VECTOR(v) \
+ (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
+
+#define HT_IRQ_LOW_DEST_ID_SHIFT 8
+#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
+#define HT_IRQ_LOW_DEST_ID(v) \
+ (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
+
+#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
+#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
+
+#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
+#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
+
+
+#define HT_IRQ_LOW_MT_FIXED 0x0000000
+#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
+#define HT_IRQ_LOW_MT_SMI 0x0000008
+#define HT_IRQ_LOW_MT_NMI 0x000000c
+#define HT_IRQ_LOW_MT_INIT 0x0000010
+#define HT_IRQ_LOW_MT_STARTUP 0x0000014
+#define HT_IRQ_LOW_MT_EXTINT 0x0000018
+#define HT_IRQ_LOW_MT_LINT1 0x000008c
+#define HT_IRQ_LOW_MT_LINT0 0x0000098
+
+#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
+
+
+#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
+#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
+#define HT_IRQ_HIGH_DEST_ID(v) \
+ ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
+
+#endif /* _ASM_X86_HYPERTRANSPORT_H */
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
new file mode 100644
index 000000000000..369f5c5d09a1
--- /dev/null
+++ b/arch/x86/include/asm/hypervisor.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2008, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef ASM_X86__HYPERVISOR_H
+#define ASM_X86__HYPERVISOR_H
+
+extern unsigned long get_hypervisor_tsc_freq(void);
+extern void init_hypervisor(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
new file mode 100644
index 000000000000..48f0004db8c9
--- /dev/null
+++ b/arch/x86/include/asm/i387.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_I387_H
+#define _ASM_X86_I387_H
+
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/regset.h>
+#include <linux/hardirq.h>
+#include <asm/asm.h>
+#include <asm/processor.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/uaccess.h>
+#include <asm/xsave.h>
+
+extern unsigned int sig_xstate_size;
+extern void fpu_init(void);
+extern void mxcsr_feature_mask_init(void);
+extern int init_fpu(struct task_struct *child);
+extern asmlinkage void math_state_restore(void);
+extern void init_thread_xstate(void);
+extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
+
+extern user_regset_active_fn fpregs_active, xfpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
+
+extern struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+extern unsigned int sig_xstate_ia32_size;
+extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
+struct _fpstate_ia32;
+struct _xstate_ia32;
+extern int save_i387_xstate_ia32(void __user *buf);
+extern int restore_i387_xstate_ia32(void __user *buf);
+#endif
+
+#define X87_FSW_ES (1 << 7) /* Exception Summary */
+
+#ifdef CONFIG_X86_64
+
+/* Ignore delayed exceptions from user space */
+static inline void tolerant_fwait(void)
+{
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+}
+
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+{
+ int err;
+
+ asm volatile("1: rex64/fxrstor (%[fx])\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+#if 0 /* See comment in __save_init_fpu() below. */
+ : [fx] "r" (fx), "m" (*fx), "0" (0));
+#else
+ : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
+#endif
+ return err;
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ return xrstor_checking(&tsk->thread.xstate->xsave);
+ else
+ return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
+/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
+ is pending. Clear the x87 state here by setting it to fixed
+ values. The kernel data segment can be sometimes 0 and sometimes
+ new user value. Both should be ok.
+ Use the PDA as safe address because it should be already in L1. */
+static inline void clear_fpu_state(struct task_struct *tsk)
+{
+ struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+ /*
+ * xsave header may indicate the init state of the FP.
+ */
+ if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+ !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+ return;
+
+ if (unlikely(fx->swd & X87_FSW_ES))
+ asm volatile("fnclex");
+ alternative_input(ASM_NOP8 ASM_NOP2,
+ " emms\n" /* clear stack tags */
+ " fildl %%gs:0", /* load to clear state */
+ X86_FEATURE_FXSAVE_LEAK);
+}
+
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
+{
+ int err;
+
+ asm volatile("1: rex64/fxsave (%[fx])\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err), "=m" (*fx)
+#if 0 /* See comment in __fxsave_clear() below. */
+ : [fx] "r" (fx), "0" (0));
+#else
+ : [fx] "cdaSDb" (fx), "0" (0));
+#endif
+ if (unlikely(err) &&
+ __clear_user(fx, sizeof(struct i387_fxsave_struct)))
+ err = -EFAULT;
+ /* No need to clear here because the caller clears USED_MATH */
+ return err;
+}
+
+static inline void fxsave(struct task_struct *tsk)
+{
+ /* Using "rex64; fxsave %0" is broken because, if the memory operand
+ uses any extended registers for addressing, a second REX prefix
+ will be generated (to the assembler, rex64 followed by semicolon
+ is a separate instruction), and hence the 64-bitness is lost. */
+#if 0
+ /* Using "fxsaveq %0" would be the ideal choice, but is only supported
+ starting with gas 2.16. */
+ __asm__ __volatile__("fxsaveq %0"
+ : "=m" (tsk->thread.xstate->fxsave));
+#elif 0
+ /* Using, as a workaround, the properly prefixed form below isn't
+ accepted by any binutils version so far released, complaining that
+ the same type of prefix is used twice if an extended register is
+ needed for addressing (fix submitted to mainline 2005-11-21). */
+ __asm__ __volatile__("rex64/fxsave %0"
+ : "=m" (tsk->thread.xstate->fxsave));
+#else
+ /* This, however, we can work around by forcing the compiler to select
+ an addressing mode that doesn't require extended registers. */
+ __asm__ __volatile__("rex64/fxsave (%1)"
+ : "=m" (tsk->thread.xstate->fxsave)
+ : "cdaSDb" (&tsk->thread.xstate->fxsave));
+#endif
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ xsave(tsk);
+ else
+ fxsave(tsk);
+
+ clear_fpu_state(tsk);
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+#else /* CONFIG_X86_32 */
+
+extern void finit(void);
+
+static inline void tolerant_fwait(void)
+{
+ asm volatile("fnclex ; fwait");
+}
+
+static inline void restore_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE) {
+ xrstor_checking(&tsk->thread.xstate->xsave);
+ return;
+ }
+ /*
+ * The "nop" is needed to make the instructions the same
+ * length.
+ */
+ alternative_input(
+ "nop ; frstor %1",
+ "fxrstor %1",
+ X86_FEATURE_FXSR,
+ "m" (tsk->thread.xstate->fxsave));
+}
+
+/* We need a safe address that is cheap to find and that is already
+ in L1 during context switch. The best choices are unfortunately
+ different for UP and SMP */
+#ifdef CONFIG_SMP
+#define safe_address (__per_cpu_offset[0])
+#else
+#define safe_address (kstat_cpu(0).cpustat.user)
+#endif
+
+/*
+ * These must be called with preempt disabled
+ */
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE) {
+ struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+ xsave(tsk);
+
+ /*
+ * xsave header may indicate the init state of the FP.
+ */
+ if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
+ goto end;
+
+ if (unlikely(fx->swd & X87_FSW_ES))
+ asm volatile("fnclex");
+
+ /*
+ * we can do a simple return here or be paranoid :)
+ */
+ goto clear_state;
+ }
+
+ /* Use more nops than strictly needed in case the compiler
+ varies code */
+ alternative_input(
+ "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
+ "fxsave %[fx]\n"
+ "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
+ X86_FEATURE_FXSR,
+ [fx] "m" (tsk->thread.xstate->fxsave),
+ [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+clear_state:
+ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+ is pending. Clear the x87 state here by setting it to fixed
+ values. safe_address is a random variable that should be in L1 */
+ alternative_input(
+ GENERIC_NOP8 GENERIC_NOP2,
+ "emms\n\t" /* clear stack tags */
+ "fildl %[addr]", /* set F?P to defined value */
+ X86_FEATURE_FXSAVE_LEAK,
+ [addr] "m" (safe_address));
+end:
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Signal frame handlers...
+ */
+extern int save_i387_xstate(void __user *buf);
+extern int restore_i387_xstate(void __user *buf);
+
+static inline void __unlazy_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_USEDFPU) {
+ __save_init_fpu(tsk);
+ stts();
+ } else
+ tsk->fpu_counter = 0;
+}
+
+static inline void __clear_fpu(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_USEDFPU) {
+ tolerant_fwait();
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+ stts();
+ }
+}
+
+static inline void kernel_fpu_begin(void)
+{
+ struct thread_info *me = current_thread_info();
+ preempt_disable();
+ if (me->status & TS_USEDFPU)
+ __save_init_fpu(me->task);
+ else
+ clts();
+}
+
+static inline void kernel_fpu_end(void)
+{
+ stts();
+ preempt_enable();
+}
+
+/*
+ * Some instructions like VIA's padlock instructions generate a spurious
+ * DNA fault but don't modify SSE registers. And these instructions
+ * get used from interrupt context aswell. To prevent these kernel instructions
+ * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * should use them only in the context of irq_ts_save/restore()
+ */
+static inline int irq_ts_save(void)
+{
+ /*
+ * If we are in process context, we are ok to take a spurious DNA fault.
+ * Otherwise, doing clts() in process context require pre-emption to
+ * be disabled or some heavy lifting like kernel_fpu_begin()
+ */
+ if (!in_interrupt())
+ return 0;
+
+ if (read_cr0() & X86_CR0_TS) {
+ clts();
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline void irq_ts_restore(int TS_state)
+{
+ if (TS_state)
+ stts();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+ __save_init_fpu(tsk);
+ stts();
+}
+
+#define unlazy_fpu __unlazy_fpu
+#define clear_fpu __clear_fpu
+
+#else /* CONFIG_X86_32 */
+
+/*
+ * These disable preemption on their own and are safe
+ */
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __save_init_fpu(tsk);
+ stts();
+ preempt_enable();
+}
+
+static inline void unlazy_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __unlazy_fpu(tsk);
+ preempt_enable();
+}
+
+static inline void clear_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ __clear_fpu(tsk);
+ preempt_enable();
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * i387 state interaction
+ */
+static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
+{
+ if (cpu_has_fxsr) {
+ return tsk->thread.xstate->fxsave.cwd;
+ } else {
+ return (unsigned short)tsk->thread.xstate->fsave.cwd;
+ }
+}
+
+static inline unsigned short get_fpu_swd(struct task_struct *tsk)
+{
+ if (cpu_has_fxsr) {
+ return tsk->thread.xstate->fxsave.swd;
+ } else {
+ return (unsigned short)tsk->thread.xstate->fsave.swd;
+ }
+}
+
+static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
+{
+ if (cpu_has_xmm) {
+ return tsk->thread.xstate->fxsave.mxcsr;
+ } else {
+ return MXCSR_DEFAULT;
+ }
+}
+
+#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
new file mode 100644
index 000000000000..1edbf89680fd
--- /dev/null
+++ b/arch/x86/include/asm/i8253.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_I8253_H
+#define _ASM_X86_I8253_H
+
+/* i8253A PIT registers */
+#define PIT_MODE 0x43
+#define PIT_CH0 0x40
+#define PIT_CH2 0x42
+
+extern spinlock_t i8253_lock;
+
+extern struct clock_event_device *global_clock_event;
+
+extern void setup_pit_timer(void);
+
+#define inb_pit inb_p
+#define outb_pit outb_p
+
+#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
new file mode 100644
index 000000000000..58d7091eeb1f
--- /dev/null
+++ b/arch/x86/include/asm/i8259.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_I8259_H
+#define _ASM_X86_I8259_H
+
+#include <linux/delay.h>
+
+extern unsigned int cached_irq_mask;
+
+#define __byte(x, y) (((unsigned char *)&(y))[x])
+#define cached_master_mask (__byte(0, cached_irq_mask))
+#define cached_slave_mask (__byte(1, cached_irq_mask))
+
+/* i8259A PIC registers */
+#define PIC_MASTER_CMD 0x20
+#define PIC_MASTER_IMR 0x21
+#define PIC_MASTER_ISR PIC_MASTER_CMD
+#define PIC_MASTER_POLL PIC_MASTER_ISR
+#define PIC_MASTER_OCW3 PIC_MASTER_ISR
+#define PIC_SLAVE_CMD 0xa0
+#define PIC_SLAVE_IMR 0xa1
+
+/* i8259A PIC related value */
+#define PIC_CASCADE_IR 2
+#define MASTER_ICW4_DEFAULT 0x01
+#define SLAVE_ICW4_DEFAULT 0x01
+#define PIC_ICW4_AEOI 2
+
+extern spinlock_t i8259A_lock;
+
+extern void init_8259A(int auto_eoi);
+extern void enable_8259A_irq(unsigned int irq);
+extern void disable_8259A_irq(unsigned int irq);
+extern unsigned int startup_8259A_irq(unsigned int irq);
+
+/* the PIC may need a careful delay on some platforms, hence specific calls */
+static inline unsigned char inb_pic(unsigned int port)
+{
+ unsigned char value = inb(port);
+
+ /*
+ * delay for some accesses to PIC on motherboard or in chipset
+ * must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+
+ return value;
+}
+
+static inline void outb_pic(unsigned char value, unsigned int port)
+{
+ outb(value, port);
+ /*
+ * delay for some accesses to PIC on motherboard or in chipset
+ * must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+}
+
+extern struct irq_chip i8259A_chip;
+
+extern void mask_8259A(void);
+extern void unmask_8259A(void);
+
+#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
new file mode 100644
index 000000000000..50ca486fd88c
--- /dev/null
+++ b/arch/x86/include/asm/ia32.h
@@ -0,0 +1,152 @@
+#ifndef _ASM_X86_IA32_H
+#define _ASM_X86_IA32_H
+
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+
+/*
+ * 32 bit structures for IA32 support.
+ */
+
+#include <asm/sigcontext32.h>
+
+/* signal.h */
+struct sigaction32 {
+ unsigned int sa_handler; /* Really a pointer, but need to deal
+ with 32 bits */
+ unsigned int sa_flags;
+ unsigned int sa_restorer; /* Another 32 bit pointer */
+ compat_sigset_t sa_mask; /* A 32 bit mask */
+};
+
+struct old_sigaction32 {
+ unsigned int sa_handler; /* Really a pointer, but need to deal
+ with 32 bits */
+ compat_old_sigset_t sa_mask; /* A 32 bit mask */
+ unsigned int sa_flags;
+ unsigned int sa_restorer; /* Another 32 bit pointer */
+};
+
+typedef struct sigaltstack_ia32 {
+ unsigned int ss_sp;
+ int ss_flags;
+ unsigned int ss_size;
+} stack_ia32_t;
+
+struct ucontext_ia32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ stack_ia32_t uc_stack;
+ struct sigcontext_ia32 uc_mcontext;
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+/* This matches struct stat64 in glibc2.2, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+ unsigned int __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned int st_uid;
+ unsigned int st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned int st_blksize;
+
+ long long st_blocks;/* Number 512-byte blocks allocated */
+
+ unsigned st_atime;
+ unsigned st_atime_nsec;
+ unsigned st_mtime;
+ unsigned st_mtime_nsec;
+ unsigned st_ctime;
+ unsigned st_ctime_nsec;
+
+ unsigned long long st_ino;
+} __attribute__((packed));
+
+typedef struct compat_siginfo {
+ int si_signo;
+ int si_errno;
+ int si_code;
+
+ union {
+ int _pad[((128 / sizeof(int)) - 3)];
+
+ /* kill() */
+ struct {
+ unsigned int _pid; /* sender's pid */
+ unsigned int _uid; /* sender's uid */
+ } _kill;
+
+ /* POSIX.1b timers */
+ struct {
+ compat_timer_t _tid; /* timer id */
+ int _overrun; /* overrun count */
+ compat_sigval_t _sigval; /* same as below */
+ int _sys_private; /* not to be passed to user */
+ int _overrun_incr; /* amount to add to overrun */
+ } _timer;
+
+ /* POSIX.1b signals */
+ struct {
+ unsigned int _pid; /* sender's pid */
+ unsigned int _uid; /* sender's uid */
+ compat_sigval_t _sigval;
+ } _rt;
+
+ /* SIGCHLD */
+ struct {
+ unsigned int _pid; /* which child */
+ unsigned int _uid; /* sender's uid */
+ int _status; /* exit code */
+ compat_clock_t _utime;
+ compat_clock_t _stime;
+ } _sigchld;
+
+ /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
+ struct {
+ unsigned int _addr; /* faulting insn/memory ref. */
+ } _sigfault;
+
+ /* SIGPOLL */
+ struct {
+ int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
+ int _fd;
+ } _sigpoll;
+ } _sifields;
+} compat_siginfo_t;
+
+struct ustat32 {
+ __u32 f_tfree;
+ compat_ino_t f_tinode;
+ char f_fname[6];
+ char f_fpack[6];
+};
+
+#define IA32_STACK_TOP IA32_PAGE_OFFSET
+
+#ifdef __KERNEL__
+struct linux_binprm;
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+ unsigned long stack_top, int exec_stack);
+struct mm_struct;
+extern void ia32_pick_mmap_layout(struct mm_struct *mm);
+
+#endif
+
+#endif /* !CONFIG_IA32_SUPPORT */
+
+#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
new file mode 100644
index 000000000000..976f6ecd2ce6
--- /dev/null
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_IA32_UNISTD_H
+#define _ASM_X86_IA32_UNISTD_H
+
+/*
+ * This file contains the system call numbers of the ia32 port,
+ * this is for the kernel only.
+ * Only add syscalls here where some part of the kernel needs to know
+ * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
+ */
+
+#define __NR_ia32_restart_syscall 0
+#define __NR_ia32_exit 1
+#define __NR_ia32_read 3
+#define __NR_ia32_write 4
+#define __NR_ia32_sigreturn 119
+#define __NR_ia32_rt_sigreturn 173
+
+#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
new file mode 100644
index 000000000000..38d87379e270
--- /dev/null
+++ b/arch/x86/include/asm/idle.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_IDLE_H
+#define _ASM_X86_IDLE_H
+
+#define IDLE_START 1
+#define IDLE_END 2
+
+struct notifier_block;
+void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
+
+#ifdef CONFIG_X86_64
+void enter_idle(void);
+void exit_idle(void);
+#else /* !CONFIG_X86_64 */
+static inline void enter_idle(void) { }
+static inline void exit_idle(void) { }
+#endif /* CONFIG_X86_64 */
+
+void c1e_remove_cpu(int cpu);
+
+#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
new file mode 100644
index 000000000000..fa0fd068bc2e
--- /dev/null
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
+#define _ASM_X86_INTEL_ARCH_PERFMON_H
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
new file mode 100644
index 000000000000..05cfed4485fa
--- /dev/null
+++ b/arch/x86/include/asm/io.h
@@ -0,0 +1,110 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+#define ARCH_HAS_IOREMAP_WC
+
+#include <linux/compiler.h>
+#include <asm-generic/int-ll64.h>
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+
+#else
+
+static inline __u64 readq(const volatile void __iomem *addr)
+{
+ const volatile u32 __iomem *p = addr;
+ u32 low, high;
+
+ low = readl(p);
+ high = readl(p + 1);
+
+ return low + ((u64)high << 32);
+}
+
+static inline void writeq(__u64 val, volatile void __iomem *addr)
+{
+ writel(val, addr);
+ writel(val >> 32, addr+4);
+}
+
+#endif
+
+#define readq_relaxed(a) readq(a)
+
+#define __raw_readq(a) readq(a)
+#define __raw_writeq(val, addr) writeq(val, addr)
+
+/* Let people know that we have them */
+#define readq readq
+#define writeq writeq
+
+#ifdef CONFIG_X86_32
+# include "io_32.h"
+#else
+# include "io_64.h"
+#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val);
+extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
+extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+
+#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
new file mode 100644
index 000000000000..d8e242e1b396
--- /dev/null
+++ b/arch/x86/include/asm/io_32.h
@@ -0,0 +1,284 @@
+#ifndef _ASM_X86_IO_32_H
+#define _ASM_X86_IO_32_H
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#define IO_SPACE_LIMIT 0xffff
+
+#define XQUAD_PORTIO_BASE 0xfe400000
+#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus virt_to_phys
+#define isa_page_to_bus page_to_phys
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, int count)
+{
+ memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
+{
+ __memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, int count)
+{
+ __memcpy((void __force *)dst, src, count);
+}
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+/*
+ * Cache management
+ *
+ * This needed for two cases
+ * 1. Out of order aware processors
+ * 2. Accidentally out of order processors (PPro errata #51)
+ */
+
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+
+static inline void flush_write_buffers(void)
+{
+ asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+}
+
+#else
+
+#define flush_write_buffers() do { } while (0)
+
+#endif
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+
+#endif
+
+#define __BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ return in##bwl##_local(port); \
+}
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl##_local(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl##_local(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_local_p(unsigned type value, int port) \
+{ \
+ out##bwl##_local(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_local_p(int port) \
+{ \
+ unsigned type value = in##bwl##_local(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+__BUILDIO(bwl, bw, type) \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) : "d"(port)); \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) : "d"(port)); \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
+
+#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
new file mode 100644
index 000000000000..563c16270ba6
--- /dev/null
+++ b/arch/x86/include/asm/io_64.h
@@ -0,0 +1,242 @@
+#ifndef _ASM_X86_IO_64_H
+#define _ASM_X86_IO_64_H
+
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+#endif
+
+/*
+ * Talk about misusing macros..
+ */
+#define __OUT1(s, x) \
+static inline void out##s(unsigned x value, unsigned short port) {
+
+#define __OUT2(s, s1, s2) \
+asm volatile ("out" #s " %" s1 "0,%" s2 "1"
+
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
+#define __OUT(s, s1, x) \
+ __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ } \
+ __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+ slow_down_io(); \
+}
+
+#define __IN1(s) \
+static inline RETURN_TYPE in##s(unsigned short port) \
+{ \
+ RETURN_TYPE _v;
+
+#define __IN2(s, s1, s2) \
+ asm volatile ("in" #s " %" s2 "1,%" s1 "0"
+
+#define __IN(s, s1, i...) \
+ __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ return _v; \
+ } \
+ __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
+ slow_down_io(); \
+ return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
+
+#define __INS(s) \
+static inline void ins##s(unsigned short port, void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; ins" #s \
+ : "=D" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define __OUTS(s) \
+static inline void outs##s(unsigned short port, const void *addr, \
+ unsigned long count) \
+{ \
+ asm volatile ("rep ; outs" #s \
+ : "=S" (addr), "=c" (count) \
+ : "d" (port), "0" (addr), "1" (count)); \
+}
+
+#define RETURN_TYPE unsigned char
+__IN(b, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned short
+__IN(w, "")
+#undef RETURN_TYPE
+#define RETURN_TYPE unsigned int
+__IN(l, "")
+#undef RETURN_TYPE
+
+__OUT(b, "b", char)
+__OUT(w, "w", short)
+__OUT(l, , int)
+
+__INS(b)
+__INS(w)
+__INS(l)
+
+__OUTS(b)
+__OUTS(w)
+__OUTS(l)
+
+#define IO_SPACE_LIMIT 0xffff
+
+#if defined(__KERNEL__) && defined(__x86_64__)
+
+#include <linux/vmalloc.h>
+
+#ifndef __i386__
+/*
+ * Change virtual addresses to physical addresses and vv.
+ * These are pretty trivial
+ */
+static inline unsigned long virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+
+static inline void *phys_to_virt(unsigned long address)
+{
+ return __va(address);
+}
+#endif
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+#include <asm-generic/iomap.h>
+
+/*
+ * This one maps high address device memory and turns off caching for that area.
+ * it's useful if some control registers are in such an area and write combining
+ * or read caching is not desirable:
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+ unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ */
+#define isa_virt_to_bus virt_to_phys
+#define isa_page_to_bus page_to_phys
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+void __memcpy_fromio(void *, unsigned long, unsigned);
+void __memcpy_toio(unsigned long, const void *, unsigned);
+
+static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
+ unsigned len)
+{
+ __memcpy_fromio(to, (unsigned long)from, len);
+}
+
+static inline void memcpy_toio(volatile void __iomem *to, const void *from,
+ unsigned len)
+{
+ __memcpy_toio((unsigned long)to, from, len);
+}
+
+void memset_io(volatile void __iomem *a, int b, size_t c);
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+#define flush_write_buffers()
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p) p
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
new file mode 100644
index 000000000000..7a1f44ac1f17
--- /dev/null
+++ b/arch/x86/include/asm/io_apic.h
@@ -0,0 +1,211 @@
+#ifndef _ASM_X86_IO_APIC_H
+#define _ASM_X86_IO_APIC_H
+
+#include <linux/types.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+
+/*
+ * Intel IO-APIC support for SMP and UP systems.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
+ */
+
+/* I/O Unit Redirection Table */
+#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
+#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
+#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
+#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
+#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
+#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
+#define IO_APIC_REDIR_MASKED (1 << 16)
+
+/*
+ * The structure of the IO-APIC:
+ */
+union IO_APIC_reg_00 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 14,
+ LTS : 1,
+ delivery_type : 1,
+ __reserved_1 : 8,
+ ID : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_01 {
+ u32 raw;
+ struct {
+ u32 version : 8,
+ __reserved_2 : 7,
+ PRQ : 1,
+ entries : 8,
+ __reserved_1 : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_02 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 24,
+ arbitration : 4,
+ __reserved_1 : 4;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_03 {
+ u32 raw;
+ struct {
+ u32 boot_DT : 1,
+ __reserved_1 : 31;
+ } __attribute__ ((packed)) bits;
+};
+
+enum ioapic_irq_destination_types {
+ dest_Fixed = 0,
+ dest_LowestPrio = 1,
+ dest_SMI = 2,
+ dest__reserved_1 = 3,
+ dest_NMI = 4,
+ dest_INIT = 5,
+ dest__reserved_2 = 6,
+ dest_ExtINT = 7
+};
+
+struct IO_APIC_route_entry {
+ __u32 vector : 8,
+ delivery_mode : 3, /* 000: FIXED
+ * 001: lowest prio
+ * 111: ExtINT
+ */
+ dest_mode : 1, /* 0: physical, 1: logical */
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1, /* 0: edge, 1: level */
+ mask : 1, /* 0: enabled, 1: disabled */
+ __reserved_2 : 15;
+
+ __u32 __reserved_3 : 24,
+ dest : 8;
+} __attribute__ ((packed));
+
+struct IR_IO_APIC_route_entry {
+ __u64 vector : 8,
+ zero : 3,
+ index2 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1,
+ mask : 1,
+ reserved : 31,
+ format : 1,
+ index : 15;
+} __attribute__ ((packed));
+
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * # of IO-APICs and # of IRQ routing registers
+ */
+extern int nr_ioapics;
+extern int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * MP-BIOS irq configuration table structures:
+ */
+
+#define MP_MAX_IOAPIC_PIN 127
+
+struct mp_config_ioapic {
+ unsigned long mp_apicaddr;
+ unsigned int mp_apicid;
+ unsigned char mp_type;
+ unsigned char mp_apicver;
+ unsigned char mp_flags;
+};
+
+struct mp_config_intsrc {
+ unsigned int mp_dstapic;
+ unsigned char mp_type;
+ unsigned char mp_irqtype;
+ unsigned short mp_irqflag;
+ unsigned char mp_srcbus;
+ unsigned char mp_srcbusirq;
+ unsigned char mp_dstirq;
+};
+
+/* I/O APIC entries */
+extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+extern int mp_irq_entries;
+
+/* MP IRQ source entries */
+extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* non-0 if default (table-less) MP configuration */
+extern int mpc_default_type;
+
+/* Older SiS APIC requires we rewrite the index register */
+extern int sis_apic_bug;
+
+/* 1 if "noapic" boot option passed */
+extern int skip_ioapic_setup;
+
+/* 1 if "noapic" boot option passed */
+extern int noioapicquirk;
+
+/* -1 if "noapic" boot option passed */
+extern int noioapicreroute;
+
+/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
+extern int timer_through_8259;
+
+static inline void disable_ioapic_setup(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
+/*
+ * If we use the IO-APIC for IRQ routing, disable automatic
+ * assignment of PCI IRQ's.
+ */
+#define io_apic_assign_pci_irqs \
+ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
+
+#ifdef CONFIG_ACPI
+extern int io_apic_get_unique_id(int ioapic, int apic_id);
+extern int io_apic_get_version(int ioapic);
+extern int io_apic_get_redir_entries(int ioapic);
+extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
+ int edge_level, int active_high_low);
+#endif /* CONFIG_ACPI */
+
+extern int (*ioapic_renumber_irq)(int ioapic, int irq);
+extern void ioapic_init_mappings(void);
+
+#ifdef CONFIG_X86_64
+extern int save_mask_IO_APIC_setup(void);
+extern void restore_IO_APIC_setup(void);
+extern void reinit_intr_remapped_IO_APIC(int);
+#endif
+
+extern void probe_nr_irqs_gsi(void);
+
+#else /* !CONFIG_X86_IO_APIC */
+#define io_apic_assign_pci_irqs 0
+static const int timer_through_8259 = 0;
+static inline void ioapic_init_mappings(void) { }
+
+static inline void probe_nr_irqs_gsi(void) { }
+#endif
+
+#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h
new file mode 100644
index 000000000000..b279fe06dfe5
--- /dev/null
+++ b/arch/x86/include/asm/ioctl.h
@@ -0,0 +1 @@
+#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
new file mode 100644
index 000000000000..0d5b23b7b06e
--- /dev/null
+++ b/arch/x86/include/asm/ioctls.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_X86_IOCTLS_H
+#define _ASM_X86_IOCTLS_H
+
+#include <asm/ioctl.h>
+
+/* 0x54 is just a magic number to make these relatively unique ('T') */
+
+#define TCGETS 0x5401
+#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
+#define TCSETSW 0x5403
+#define TCSETSF 0x5404
+#define TCGETA 0x5405
+#define TCSETA 0x5406
+#define TCSETAW 0x5407
+#define TCSETAF 0x5408
+#define TCSBRK 0x5409
+#define TCXONC 0x540A
+#define TCFLSH 0x540B
+#define TIOCEXCL 0x540C
+#define TIOCNXCL 0x540D
+#define TIOCSCTTY 0x540E
+#define TIOCGPGRP 0x540F
+#define TIOCSPGRP 0x5410
+#define TIOCOUTQ 0x5411
+#define TIOCSTI 0x5412
+#define TIOCGWINSZ 0x5413
+#define TIOCSWINSZ 0x5414
+#define TIOCMGET 0x5415
+#define TIOCMBIS 0x5416
+#define TIOCMBIC 0x5417
+#define TIOCMSET 0x5418
+#define TIOCGSOFTCAR 0x5419
+#define TIOCSSOFTCAR 0x541A
+#define FIONREAD 0x541B
+#define TIOCINQ FIONREAD
+#define TIOCLINUX 0x541C
+#define TIOCCONS 0x541D
+#define TIOCGSERIAL 0x541E
+#define TIOCSSERIAL 0x541F
+#define TIOCPKT 0x5420
+#define FIONBIO 0x5421
+#define TIOCNOTTY 0x5422
+#define TIOCSETD 0x5423
+#define TIOCGETD 0x5424
+#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
+/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
+#define TIOCSBRK 0x5427 /* BSD compatibility */
+#define TIOCCBRK 0x5428 /* BSD compatibility */
+#define TIOCGSID 0x5429 /* Return the session ID of FD */
+#define TCGETS2 _IOR('T', 0x2A, struct termios2)
+#define TCSETS2 _IOW('T', 0x2B, struct termios2)
+#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
+#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
+#define TIOCGRS485 0x542E
+#define TIOCSRS485 0x542F
+#define TIOCGPTN _IOR('T', 0x30, unsigned int)
+ /* Get Pty Number (of pty-mux device) */
+#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
+#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
+#define TCSETX 0x5433
+#define TCSETXF 0x5434
+#define TCSETXW 0x5435
+
+#define FIONCLEX 0x5450
+#define FIOCLEX 0x5451
+#define FIOASYNC 0x5452
+#define TIOCSERCONFIG 0x5453
+#define TIOCSERGWILD 0x5454
+#define TIOCSERSWILD 0x5455
+#define TIOCGLCKTRMIOS 0x5456
+#define TIOCSLCKTRMIOS 0x5457
+#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
+#define TIOCSERGETLSR 0x5459 /* Get line status register */
+#define TIOCSERGETMULTI 0x545A /* Get multiport config */
+#define TIOCSERSETMULTI 0x545B /* Set multiport config */
+
+#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
+#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
+#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
+#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
+#define FIOQSIZE 0x5460
+
+/* Used for packet mode */
+#define TIOCPKT_DATA 0
+#define TIOCPKT_FLUSHREAD 1
+#define TIOCPKT_FLUSHWRITE 2
+#define TIOCPKT_STOP 4
+#define TIOCPKT_START 8
+#define TIOCPKT_NOSTOP 16
+#define TIOCPKT_DOSTOP 32
+
+#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
+
+#endif /* _ASM_X86_IOCTLS_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
new file mode 100644
index 000000000000..c1f06289b14b
--- /dev/null
+++ b/arch/x86/include/asm/iomap.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+
+void
+iounmap_atomic(void *kvaddr, enum km_type type);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
new file mode 100644
index 000000000000..a6ee9e6f530f
--- /dev/null
+++ b/arch/x86/include/asm/iommu.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_IOMMU_H
+#define _ASM_X86_IOMMU_H
+
+extern void pci_iommu_shutdown(void);
+extern void no_iommu_init(void);
+extern struct dma_mapping_ops nommu_dma_ops;
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+
+#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
new file mode 100644
index 000000000000..ee678fd51594
--- /dev/null
+++ b/arch/x86/include/asm/ipcbuf.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_IPCBUF_H
+#define _ASM_X86_IPCBUF_H
+
+/*
+ * The ipc64_perm structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 32-bit mode_t and seq
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct ipc64_perm {
+ __kernel_key_t key;
+ __kernel_uid32_t uid;
+ __kernel_gid32_t gid;
+ __kernel_uid32_t cuid;
+ __kernel_gid32_t cgid;
+ __kernel_mode_t mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ unsigned long __unused1;
+ unsigned long __unused2;
+};
+
+#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
new file mode 100644
index 000000000000..f89dffb28aa9
--- /dev/null
+++ b/arch/x86/include/asm/ipi.h
@@ -0,0 +1,138 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC InterProcessor Interrupt code.
+ *
+ * Moved to include file by James Cleverdon from
+ * arch/x86-64/kernel/smp.c
+ *
+ * Copyrights from kernel/smp.c:
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ */
+
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+
+static inline void __xapic_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ __xapic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned int mask, int vector,
+ unsigned int dest)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ native_apic_mem_write(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu_mask_nr(query_cpu, mask) {
+ __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
new file mode 100644
index 000000000000..28e409fc73f3
--- /dev/null
+++ b/arch/x86/include/asm/irq.h
@@ -0,0 +1,46 @@
+#ifndef _ASM_X86_IRQ_H
+#define _ASM_X86_IRQ_H
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ */
+
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+
+static inline int irq_canonicalize(int irq)
+{
+ return ((irq == 2) ? 9 : irq);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+# define ARCH_HAS_NMI_WATCHDOG
+#endif
+
+#ifdef CONFIG_4KSTACKS
+ extern void irq_ctx_init(int cpu);
+ extern void irq_ctx_exit(int cpu);
+# define __ARCH_HAS_DO_SOFTIRQ
+#else
+# define irq_ctx_init(cpu) do { } while (0)
+# define irq_ctx_exit(cpu) do { } while (0)
+# ifdef CONFIG_X86_64
+# define __ARCH_HAS_DO_SOFTIRQ
+# endif
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <linux/cpumask.h>
+extern void fixup_irqs(cpumask_t map);
+#endif
+
+extern unsigned int do_IRQ(struct pt_regs *regs);
+extern void init_IRQ(void);
+extern void native_init_IRQ(void);
+
+/* Interrupt vector management */
+extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+
+#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
new file mode 100644
index 000000000000..89c898ab298b
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "irq_regs_32.h"
+#else
+# include "irq_regs_64.h"
+#endif
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
new file mode 100644
index 000000000000..86afd7473457
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -0,0 +1,31 @@
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the per-cpu area.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_X86_IRQ_REGS_32_H
+#define _ASM_X86_IRQ_REGS_32_H
+
+#include <asm/percpu.h>
+
+#define ARCH_HAS_OWN_IRQ_REGS
+
+DECLARE_PER_CPU(struct pt_regs *, irq_regs);
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+ return x86_read_percpu(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+ struct pt_regs *old_regs;
+
+ old_regs = get_irq_regs();
+ x86_write_percpu(irq_regs, new_regs);
+
+ return old_regs;
+}
+
+#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
new file mode 100644
index 000000000000..3dd9c0b70270
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_64.h
@@ -0,0 +1 @@
+#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 000000000000..20e1fd588dbf
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_IRQ_REMAPPING_H
+#define _ASM_X86_IRQ_REMAPPING_H
+
+extern int x2apic;
+
+#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+
+#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
new file mode 100644
index 000000000000..f7ff65032b9d
--- /dev/null
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -0,0 +1,167 @@
+#ifndef _ASM_X86_IRQ_VECTORS_H
+#define _ASM_X86_IRQ_VECTORS_H
+
+#include <linux/threads.h>
+
+#define NMI_VECTOR 0x02
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR 0x80
+#else
+# define IA32_SYSCALL_VECTOR 0x80
+#endif
+
+/*
+ * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
+ * cleanup after irq migration.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+
+/*
+ * Vectors 0x30-0x3f are used for ISA interrupts.
+ */
+#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
+#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
+#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
+#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
+#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
+#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
+#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
+#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
+#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
+#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
+#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
+#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
+#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
+#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
+#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
+#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ * Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#ifdef CONFIG_X86_32
+
+# define SPURIOUS_APIC_VECTOR 0xff
+# define ERROR_APIC_VECTOR 0xfe
+# define INVALIDATE_TLB_VECTOR 0xfd
+# define RESCHEDULE_VECTOR 0xfc
+# define CALL_FUNCTION_VECTOR 0xfb
+# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
+# define THERMAL_APIC_VECTOR 0xf0
+
+#else
+
+#define SPURIOUS_APIC_VECTOR 0xff
+#define ERROR_APIC_VECTOR 0xfe
+#define RESCHEDULE_VECTOR 0xfd
+#define CALL_FUNCTION_VECTOR 0xfc
+#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
+#define THERMAL_APIC_VECTOR 0xfa
+#define THRESHOLD_APIC_VECTOR 0xf9
+#define UV_BAU_MESSAGE 0xf8
+#define INVALIDATE_TLB_VECTOR_END 0xf7
+#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
+
+#define NUM_INVALIDATE_TLB_VECTORS 8
+
+#endif
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee) we
+ * start at 0x31(0x41) to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
+
+#define NR_VECTORS 256
+
+#define FPU_IRQ 13
+
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
+
+#define NR_IRQS_LEGACY 16
+
+#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
+
+#ifndef CONFIG_SPARSE_IRQ
+# if NR_CPUS < MAX_IO_APICS
+# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
+# else
+# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+#else
+# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
+# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
+# else
+# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+#endif
+
+#elif defined(CONFIG_X86_VOYAGER)
+
+# define NR_IRQS 224
+
+#else /* IO_APIC || VOYAGER */
+
+# define NR_IRQS 16
+
+#endif
+
+/* Voyager specific defines */
+/* These define the CPIs we use in linux */
+#define VIC_CPI_LEVEL0 0
+#define VIC_CPI_LEVEL1 1
+/* now the fake CPIs */
+#define VIC_TIMER_CPI 2
+#define VIC_INVALIDATE_CPI 3
+#define VIC_RESCHEDULE_CPI 4
+#define VIC_ENABLE_IRQ_CPI 5
+#define VIC_CALL_FUNCTION_CPI 6
+#define VIC_CALL_FUNCTION_SINGLE_CPI 7
+
+/* Now the QIC CPIs: Since we don't need the two initial levels,
+ * these are 2 less than the VIC CPIs */
+#define QIC_CPI_OFFSET 1
+#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
+#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
+#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
+#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
+#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
+
+#define VIC_START_FAKE_CPI VIC_TIMER_CPI
+#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
+
+/* this is the SYS_INT CPI. */
+#define VIC_SYS_INT 8
+#define VIC_CMN_INT 15
+
+/* This is the boot CPI for alternate processors. It gets overwritten
+ * by the above once the system has activated all available processors */
+#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
+#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
+
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
new file mode 100644
index 000000000000..2bdab21f0898
--- /dev/null
+++ b/arch/x86/include/asm/irqflags.h
@@ -0,0 +1,211 @@
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * Interrupt control:
+ */
+
+static inline unsigned long native_save_fl(void)
+{
+ unsigned long flags;
+
+ asm volatile("# __raw_save_flags\n\t"
+ "pushf ; pop %0"
+ : "=g" (flags)
+ : /* no input */
+ : "memory");
+
+ return flags;
+}
+
+static inline void native_restore_fl(unsigned long flags)
+{
+ asm volatile("push %0 ; popf"
+ : /* no output */
+ :"g" (flags)
+ :"memory", "cc");
+}
+
+static inline void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static inline void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static inline void native_safe_halt(void)
+{
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static inline void native_halt(void)
+{
+ asm volatile("hlt": : :"memory");
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#ifndef __ASSEMBLY__
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return native_save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ native_restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ native_irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+ native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline void halt(void)
+{
+ native_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ raw_local_irq_disable();
+
+ return flags;
+}
+#else
+
+#define ENABLE_INTERRUPTS(x) sti
+#define DISABLE_INTERRUPTS(x) cli
+
+#ifdef CONFIG_X86_64
+#define SWAPGS swapgs
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack). So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK swapgs
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
+
+#define INTERRUPT_RETURN iretq
+#define USERGS_SYSRET64 \
+ swapgs; \
+ sysretq;
+#define USERGS_SYSRET32 \
+ swapgs; \
+ sysretl
+#define ENABLE_INTERRUPTS_SYSEXIT32 \
+ swapgs; \
+ sti; \
+ sysexit
+
+#else
+#define INTERRUPT_RETURN iret
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
+#define raw_local_irq_save(flags) \
+ do { (flags) = __raw_local_irq_save(); } while (0)
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & X86_EFLAGS_IF);
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
+
+#else
+
+#ifdef CONFIG_X86_64
+#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+ TRACE_IRQS_ON; \
+ sti; \
+ SAVE_REST; \
+ LOCKDEP_SYS_EXIT; \
+ RESTORE_REST; \
+ cli; \
+ TRACE_IRQS_OFF;
+
+#else
+#define ARCH_LOCKDEP_SYS_EXIT \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call lockdep_sys_exit; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
+#else
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+# define LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
new file mode 100644
index 000000000000..7e5dff1de0e9
--- /dev/null
+++ b/arch/x86/include/asm/ist.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_IST_H
+#define _ASM_X86_IST_H
+
+/*
+ * Include file for the interface to IST BIOS
+ * Copyright 2002 Andy Grover <andrew.grover@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#include <linux/types.h>
+
+struct ist_info {
+ __u32 signature;
+ __u32 command;
+ __u32 event;
+ __u32 perf_level;
+};
+
+#ifdef __KERNEL__
+
+extern struct ist_info ist_info;
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
new file mode 100644
index 000000000000..54c8cc53b24d
--- /dev/null
+++ b/arch/x86/include/asm/k8.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_K8_H
+#define _ASM_X86_K8_H
+
+#include <linux/pci.h>
+
+extern struct pci_device_id k8_nb_ids[];
+
+extern int early_is_k8_nb(u32 value);
+extern struct pci_dev **k8_northbridges;
+extern int num_k8_northbridges;
+extern int cache_k8_northbridges(void);
+extern void k8_flush_garts(void);
+extern int k8_scan_nodes(unsigned long start, unsigned long end);
+
+#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
new file mode 100644
index 000000000000..fa7c0b974761
--- /dev/null
+++ b/arch/x86/include/asm/kdebug.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_KDEBUG_H
+#define _ASM_X86_KDEBUG_H
+
+#include <linux/notifier.h>
+
+struct pt_regs;
+
+/* Grossly misnamed. */
+enum die_val {
+ DIE_OOPS = 1,
+ DIE_INT3,
+ DIE_DEBUG,
+ DIE_PANIC,
+ DIE_NMI,
+ DIE_DIE,
+ DIE_NMIWATCHDOG,
+ DIE_KERNELDEBUG,
+ DIE_TRAP,
+ DIE_GPF,
+ DIE_CALL,
+ DIE_NMI_IPI,
+ DIE_PAGE_FAULT,
+ DIE_NMIUNKNOWN,
+};
+
+extern void printk_address(unsigned long address, int reliable);
+extern void die(const char *, struct pt_regs *,long);
+extern int __must_check __die(const char *, struct pt_regs *, long);
+extern void show_registers(struct pt_regs *regs);
+extern void show_trace(struct task_struct *t, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp);
+extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_regs(struct pt_regs *regs);
+extern unsigned long oops_begin(void);
+extern void oops_end(unsigned long, struct pt_regs *, int signr);
+
+#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
new file mode 100644
index 000000000000..c61d8b2ab8b9
--- /dev/null
+++ b/arch/x86/include/asm/kexec.h
@@ -0,0 +1,176 @@
+#ifndef _ASM_X86_KEXEC_H
+#define _ASM_X86_KEXEC_H
+
+#ifdef CONFIG_X86_32
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_PGD 2
+# define PA_SWAP_PAGE 3
+# define PAGES_NR 4
+#else
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_PGD 2
+# define VA_PGD 3
+# define PA_PUD_0 4
+# define VA_PUD_0 5
+# define PA_PMD_0 6
+# define VA_PMD_0 7
+# define PA_PTE_0 8
+# define VA_PTE_0 9
+# define PA_PUD_1 10
+# define VA_PUD_1 11
+# define PA_PMD_1 12
+# define VA_PMD_1 13
+# define PA_PTE_1 14
+# define VA_PTE_1 15
+# define PA_TABLE_PAGE 16
+# define PAGES_NR 17
+#endif
+
+#ifdef CONFIG_X86_32
+# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+#ifdef CONFIG_X86_32
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+# define KEXEC_CONTROL_PAGE_SIZE 4096
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_386
+
+/* We can also handle crash dumps from 64 bit kernel. */
+# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
+#else
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_X86_64
+#endif
+
+/*
+ * CPU does not save ss and sp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+#ifdef CONFIG_X86_32
+ newregs->sp = (unsigned long)&(oldregs->sp);
+ asm volatile("xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->ss));
+#endif
+}
+
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and sp if coming via kernel
+ * mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ if (oldregs) {
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ crash_fixup_ss_esp(newregs, oldregs);
+ } else {
+#ifdef CONFIG_X86_32
+ asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
+ asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
+ asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
+ asm volatile("movl %%esi,%0" : "=m"(newregs->si));
+ asm volatile("movl %%edi,%0" : "=m"(newregs->di));
+ asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
+ asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
+ asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
+ asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
+ asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
+#else
+ asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
+ asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
+ asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
+ asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
+ asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
+ asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
+ asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
+ asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
+ asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
+ asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
+ asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
+ asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
+ asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
+ asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
+ asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
+ asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
+#endif
+ newregs->ip = (unsigned long)current_text_addr();
+ }
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage unsigned long
+relocate_kernel(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+#else
+NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address) ATTRIB_NORET;
+#endif
+
+#ifdef CONFIG_X86_32
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+ pgd_t *pgd;
+#ifdef CONFIG_X86_PAE
+ pmd_t *pmd0;
+ pmd_t *pmd1;
+#endif
+ pte_t *pte0;
+ pte_t *pte1;
+};
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
new file mode 100644
index 000000000000..e6c6c808489f
--- /dev/null
+++ b/arch/x86/include/asm/kgdb.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_KGDB_H
+#define _ASM_X86_KGDB_H
+
+/*
+ * Copyright (C) 2001-2004 Amit S. Kale
+ * Copyright (C) 2008 Wind River Systems, Inc.
+ */
+
+/*
+ * BUFMAX defines the maximum number of characters in inbound/outbound
+ * buffers at least NUMREGBYTES*2 are needed for register packets
+ * Longer buffer is needed to list all threads
+ */
+#define BUFMAX 1024
+
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
+ */
+#ifdef CONFIG_X86_32
+enum regnames {
+ GDB_AX, /* 0 */
+ GDB_CX, /* 1 */
+ GDB_DX, /* 2 */
+ GDB_BX, /* 3 */
+ GDB_SP, /* 4 */
+ GDB_BP, /* 5 */
+ GDB_SI, /* 6 */
+ GDB_DI, /* 7 */
+ GDB_PC, /* 8 also known as eip */
+ GDB_PS, /* 9 also known as eflags */
+ GDB_CS, /* 10 */
+ GDB_SS, /* 11 */
+ GDB_DS, /* 12 */
+ GDB_ES, /* 13 */
+ GDB_FS, /* 14 */
+ GDB_GS, /* 15 */
+};
+#define NUMREGBYTES ((GDB_GS+1)*4)
+#else /* ! CONFIG_X86_32 */
+enum regnames64 {
+ GDB_AX, /* 0 */
+ GDB_BX, /* 1 */
+ GDB_CX, /* 2 */
+ GDB_DX, /* 3 */
+ GDB_SI, /* 4 */
+ GDB_DI, /* 5 */
+ GDB_BP, /* 6 */
+ GDB_SP, /* 7 */
+ GDB_R8, /* 8 */
+ GDB_R9, /* 9 */
+ GDB_R10, /* 10 */
+ GDB_R11, /* 11 */
+ GDB_R12, /* 12 */
+ GDB_R13, /* 13 */
+ GDB_R14, /* 14 */
+ GDB_R15, /* 15 */
+ GDB_PC, /* 16 */
+};
+
+enum regnames32 {
+ GDB_PS = 34,
+ GDB_CS,
+ GDB_SS,
+};
+#define NUMREGBYTES ((GDB_SS+1)*4)
+#endif /* CONFIG_X86_32 */
+
+static inline void arch_kgdb_breakpoint(void)
+{
+ asm(" int $3");
+}
+#define BREAK_INSTR_SIZE 1
+#define CACHE_FLUSH_IS_SAFE 1
+
+#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
new file mode 100644
index 000000000000..5759c165a5cf
--- /dev/null
+++ b/arch/x86/include/asm/kmap_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_KMAP_TYPES_H
+#define _ASM_X86_KMAP_TYPES_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
+# define D(n) __KM_FENCE_##n ,
+#else
+# define D(n)
+#endif
+
+enum km_type {
+D(0) KM_BOUNCE_READ,
+D(1) KM_SKB_SUNRPC_DATA,
+D(2) KM_SKB_DATA_SOFTIRQ,
+D(3) KM_USER0,
+D(4) KM_USER1,
+D(5) KM_BIO_SRC_IRQ,
+D(6) KM_BIO_DST_IRQ,
+D(7) KM_PTE0,
+D(8) KM_PTE1,
+D(9) KM_IRQ0,
+D(10) KM_IRQ1,
+D(11) KM_SOFTIRQ0,
+D(12) KM_SOFTIRQ1,
+D(13) KM_TYPE_NR
+};
+
+#undef D
+
+#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
new file mode 100644
index 000000000000..4fe681de1e76
--- /dev/null
+++ b/arch/x86/include/asm/kprobes.h
@@ -0,0 +1,88 @@
+#ifndef _ASM_X86_KPROBES_H
+#define _ASM_X86_KPROBES_H
+/*
+ * Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * See arch/x86/kernel/kprobes.c for x86 kprobes history.
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+
+struct pt_regs;
+struct kprobe;
+
+typedef u8 kprobe_opcode_t;
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define MAX_INSN_SIZE 16
+#define MAX_STACK_SIZE 64
+#define MIN_STACK_SIZE(ADDR) \
+ (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
+ THREAD_SIZE - (unsigned long)(ADDR))) \
+ ? (MAX_STACK_SIZE) \
+ : (((unsigned long)current_thread_info()) + \
+ THREAD_SIZE - (unsigned long)(ADDR)))
+
+#define flush_insn_slot(p) do { } while (0)
+
+extern const int kretprobe_blacklist_size;
+
+void arch_remove_kprobe(struct kprobe *p);
+void kretprobe_trampoline(void);
+
+/* Architecture specific copy of original instruction*/
+struct arch_specific_insn {
+ /* copy of the original instruction */
+ kprobe_opcode_t *insn;
+ /*
+ * boostable = -1: This instruction type is not boostable.
+ * boostable = 0: This instruction type is boostable.
+ * boostable = 1: This instruction has been boosted: we have
+ * added a relative jump after the instruction copy in insn,
+ * so no single-step and fixup are needed (unless there's
+ * a post_handler or break_handler).
+ */
+ int boostable;
+};
+
+struct prev_kprobe {
+ struct kprobe *kp;
+ unsigned long status;
+ unsigned long old_flags;
+ unsigned long saved_flags;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+ unsigned long kprobe_status;
+ unsigned long kprobe_old_flags;
+ unsigned long kprobe_saved_flags;
+ unsigned long *jprobe_saved_sp;
+ struct pt_regs jprobe_saved_regs;
+ kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+ struct prev_kprobe prev_kprobe;
+};
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data);
+#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
new file mode 100644
index 000000000000..b95162af0bf6
--- /dev/null
+++ b/arch/x86/include/asm/kvm.h
@@ -0,0 +1,211 @@
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
new file mode 100644
index 000000000000..8346be87cfa1
--- /dev/null
+++ b/arch/x86/include/asm/kvm_host.h
@@ -0,0 +1,755 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _ASM_X86_KVM_HOST_H
+#define _ASM_X86_KVM_HOST_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pvclock-abi.h>
+#include <asm/desc.h>
+
+#define KVM_MAX_VCPUS 16
+#define KVM_MEMORY_SLOTS 32
+/* memory slots that does not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
+#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
+ 0xFFFFFF0000000000ULL)
+
+#define KVM_GUEST_CR0_MASK \
+ (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
+ | X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON \
+ (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
+ | X86_CR0_MP)
+#define KVM_GUEST_CR4_MASK \
+ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define MC_VECTOR 18
+
+#define SELECTOR_TI_MASK (1 << 2)
+#define SELECTOR_RPL_MASK 0x03
+
+#define IOPL_SHIFT 12
+
+#define KVM_ALIAS_SLOTS 4
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MMU_HASH_SHIFT 10
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_VAR_MTRR 8
+
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+enum kvm_reg {
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
+#endif
+ VCPU_REGS_RIP,
+ NR_VCPU_REGS
+};
+
+enum {
+ VCPU_SREG_ES,
+ VCPU_SREG_CS,
+ VCPU_SREG_SS,
+ VCPU_SREG_DS,
+ VCPU_SREG_FS,
+ VCPU_SREG_GS,
+ VCPU_SREG_TR,
+ VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_x86_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+struct kvm_guest_debug {
+ int enabled;
+ unsigned long bp[4];
+ int singlestep;
+};
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+ u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ * bits 17:19 - common access permissions for all ptes in this shadow page
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels:4;
+ unsigned level:4;
+ unsigned quadrant:2;
+ unsigned pad_for_nice_hex_output:6;
+ unsigned metaphysical:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ };
+};
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ unsigned long slot_bitmap; /* One bit set per slot which has memory
+ * in this shadow page.
+ */
+ int multimapped; /* More than one parent_pte? */
+ int root_count; /* Currently serving as active root */
+ bool unsync;
+ bool unsync_children;
+ union {
+ u64 *parent_pte; /* !multimapped */
+ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+};
+
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
+/*
+ * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
+ * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
+ * mode.
+ */
+struct kvm_mmu {
+ void (*new_cr3)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ void (*free)(struct kvm_vcpu *vcpu);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+ void (*prefetch_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+ hpa_t root_hpa;
+ int root_level;
+ int shadow_root_level;
+
+ u64 *pae_root;
+};
+
+struct kvm_vcpu_arch {
+ u64 host_tsc;
+ int interrupt_window_open;
+ unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
+ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
+ /*
+ * rip and regs accesses must go through
+ * kvm_{register,rip}_{read,write} functions.
+ */
+ unsigned long regs[NR_VCPU_REGS];
+ u32 regs_avail;
+ u32 regs_dirty;
+
+ unsigned long cr0;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr8;
+ u64 pdptrs[4]; /* pae */
+ u64 shadow_efer;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ int mp_state;
+ int sipi_vector;
+ u64 ia32_misc_enable_msr;
+ bool tpr_access_reporting;
+
+ struct kvm_mmu mmu;
+ /* only needed in kvm_pv_mmu_op() path, but it's hot so
+ * put it here to avoid allocation */
+ struct kvm_pv_mmu_op_buffer mmu_op_buffer;
+
+ struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+ struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+ gfn_t last_pt_write_gfn;
+ int last_pt_write_count;
+ u64 *last_pte_updated;
+ gfn_t last_pte_gfn;
+
+ struct {
+ gfn_t gfn; /* presumed gfn during guest pte update */
+ pfn_t pfn; /* pfn corresponding to that gfn */
+ int largepage;
+ unsigned long mmu_seq;
+ } update_pte;
+
+ struct i387_fxsave_struct host_fx_image;
+ struct i387_fxsave_struct guest_fx_image;
+
+ gva_t mmio_fault_cr2;
+ struct kvm_pio_request pio;
+ void *pio_data;
+
+ struct kvm_queued_exception {
+ bool pending;
+ bool has_error_code;
+ u8 nr;
+ u32 error_code;
+ } exception;
+
+ struct kvm_queued_interrupt {
+ bool pending;
+ u8 nr;
+ } interrupt;
+
+ struct {
+ int active;
+ u8 save_iopl;
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } tr, es, ds, fs, gs;
+ } rmode;
+ int halt_request; /* real mode on Intel only */
+
+ int cpuid_nent;
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
+
+ gpa_t time;
+ struct pvclock_vcpu_time_info hv_clock;
+ unsigned int hv_clock_tsc_khz;
+ unsigned int time_offset;
+ struct page *time_page;
+
+ bool nmi_pending;
+ bool nmi_injected;
+
+ u64 mtrr[0x100];
+};
+
+struct kvm_mem_alias {
+ gfn_t base_gfn;
+ unsigned long npages;
+ gfn_t target_gfn;
+};
+
+struct kvm_arch{
+ int naliases;
+ struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
+
+ unsigned int n_free_mmu_pages;
+ unsigned int n_requested_mmu_pages;
+ unsigned int n_alloc_mmu_pages;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
+ struct list_head active_mmu_pages;
+ struct list_head assigned_dev_head;
+ struct dmar_domain *intel_iommu_domain;
+ struct kvm_pic *vpic;
+ struct kvm_ioapic *vioapic;
+ struct kvm_pit *vpit;
+ struct hlist_head irq_ack_notifier_list;
+
+ int round_robin_prev_vcpu;
+ unsigned int tss_addr;
+ struct page *apic_access_page;
+
+ gpa_t wall_clock;
+
+ struct page *ept_identity_pagetable;
+ bool ept_identity_pagetable_done;
+
+ unsigned long irq_sources_bitmap;
+ unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
+};
+
+struct kvm_vm_stat {
+ u32 mmu_shadow_zapped;
+ u32 mmu_pte_write;
+ u32 mmu_pte_updated;
+ u32 mmu_pde_zapped;
+ u32 mmu_flooded;
+ u32 mmu_recycled;
+ u32 mmu_cache_miss;
+ u32 mmu_unsync;
+ u32 remote_tlb_flush;
+ u32 lpages;
+};
+
+struct kvm_vcpu_stat {
+ u32 pf_fixed;
+ u32 pf_guest;
+ u32 tlb_flush;
+ u32 invlpg;
+
+ u32 exits;
+ u32 io_exits;
+ u32 mmio_exits;
+ u32 signal_exits;
+ u32 irq_window_exits;
+ u32 nmi_window_exits;
+ u32 halt_exits;
+ u32 halt_wakeup;
+ u32 request_irq_exits;
+ u32 irq_exits;
+ u32 host_state_reload;
+ u32 efer_reload;
+ u32 fpu_reload;
+ u32 insn_emulation;
+ u32 insn_emulation_fail;
+ u32 hypercalls;
+ u32 irq_injections;
+};
+
+struct descriptor_table {
+ u16 limit;
+ unsigned long base;
+} __attribute__((packed));
+
+struct kvm_x86_ops {
+ int (*cpu_has_kvm_support)(void); /* __init */
+ int (*disabled_by_bios)(void); /* __init */
+ void (*hardware_enable)(void *dummy); /* __init */
+ void (*hardware_disable)(void *dummy);
+ void (*check_processor_compatibility)(void *rtn);
+ int (*hardware_setup)(void); /* __init */
+ void (*hardware_unsetup)(void); /* __exit */
+ bool (*cpu_has_accelerated_tpr)(void);
+
+ /* Create, but do not attach this VCPU */
+ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ void (*vcpu_free)(struct kvm_vcpu *vcpu);
+ int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+
+ void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
+ int (*set_guest_debug)(struct kvm_vcpu *vcpu,
+ struct kvm_debug_guest *dbg);
+ void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+ void (*get_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ int (*get_cpl)(struct kvm_vcpu *vcpu);
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+ unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
+ void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+ int *exception);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+ void (*tlb_flush)(struct kvm_vcpu *vcpu);
+
+ void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+ int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
+ int (*get_irq)(struct kvm_vcpu *vcpu);
+ void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+ void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code);
+ bool (*exception_injected)(struct kvm_vcpu *vcpu);
+ void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+ void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
+ struct kvm_run *run);
+
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*get_tdp_level)(void);
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_mmu_set_base_ptes(u64 base_pte);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask);
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_zap_all(struct kvm *kvm);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret);
+
+extern bool tdp_enabled;
+
+enum emulation_result {
+ EMULATE_DONE, /* no further processing */
+ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
+ EMULATE_FAIL, /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ unsigned long cr2, u16 error_code, int emulation_type);
+void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+ unsigned long *rflags);
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
+ unsigned long *rflags);
+void kvm_enable_efer_bits(u64);
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+
+struct x86_emulate_ctxt;
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned port);
+int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+ int size, unsigned long count, int down,
+ gva_t address, int rep, unsigned port);
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
+int emulate_clts(struct kvm_vcpu *vcpu);
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest);
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value);
+
+void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ int type_bits, int seg);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+ u32 error_code);
+
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+
+void fx_init(struct kvm_vcpu *vcpu);
+
+int emulator_read_std(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+int emulator_write_emulated(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+unsigned long segment_base(u16 selector);
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+
+void kvm_enable_tdp(void);
+void kvm_disable_tdp(void);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 kvm_read_fs(void)
+{
+ u16 seg;
+ asm("mov %%fs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline u16 kvm_read_gs(void)
+{
+ u16 seg;
+ asm("mov %%gs, %0" : "=g"(seg));
+ return seg;
+}
+
+static inline u16 kvm_read_ldt(void)
+{
+ u16 ldt;
+ asm("sldt %0" : "=g"(ldt));
+ return ldt;
+}
+
+static inline void kvm_load_fs(u16 sel)
+{
+ asm("mov %0, %%fs" : : "rm"(sel));
+}
+
+static inline void kvm_load_gs(u16 sel)
+{
+ asm("mov %0, %%gs" : : "rm"(sel));
+}
+
+static inline void kvm_load_ldt(u16 sel)
+{
+ asm("lldt %0" : : "rm"(sel));
+}
+
+static inline void kvm_get_idt(struct descriptor_table *table)
+{
+ asm("sidt %0" : "=m"(*table));
+}
+
+static inline void kvm_get_gdt(struct descriptor_table *table)
+{
+ asm("sgdt %0" : "=m"(*table));
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+ u64 value;
+
+ rdmsrl(msr, value);
+ return value;
+}
+#endif
+
+static inline void kvm_fx_save(struct i387_fxsave_struct *image)
+{
+ asm("fxsave (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
+{
+ asm("fxrstor (%0)":: "r" (image));
+}
+
+static inline void kvm_fx_finit(void)
+{
+ asm("finit");
+}
+
+static inline u32 get_rdx_init_val(void)
+{
+ return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+#define MSR_IA32_TIME_STAMP_COUNTER 0x010
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+enum {
+ TASK_SWITCH_CALL = 0,
+ TASK_SWITCH_IRET = 1,
+ TASK_SWITCH_JMP = 2,
+ TASK_SWITCH_GATE = 3,
+};
+
+/*
+ * Hardware virtualization extension instructions may fault if a
+ * reboot turns off virtualization while processes are running.
+ * Trap the fault and ignore the instruction if that happens.
+ */
+asmlinkage void kvm_handle_fault_on_reboot(void);
+
+#define __kvm_handle_fault_on_reboot(insn) \
+ "666: " insn "\n\t" \
+ ".pushsection .fixup, \"ax\" \n" \
+ "667: \n\t" \
+ __ASM_SIZE(push) " $666b \n\t" \
+ "jmp kvm_handle_fault_on_reboot \n\t" \
+ ".popsection \n\t" \
+ ".pushsection __ex_table, \"a\" \n\t" \
+ _ASM_PTR " 666b, 667b \n\t" \
+ ".popsection"
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+
+#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
new file mode 100644
index 000000000000..b8a3305ae093
--- /dev/null
+++ b/arch/x86/include/asm/kvm_para.h
@@ -0,0 +1,147 @@
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns a feature bitmap in eax. Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+extern void kvmclock_init(void);
+
+
+/* This instruction is vmcall. On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun
+ * instruction. The hypervisor may replace it with something else but only the
+ * instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicited
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+ : "memory");
+ return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ char signature[13];
+
+ cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+ memcpy(signature + 0, &ebx, 4);
+ memcpy(signature + 4, &ecx, 4);
+ memcpy(signature + 8, &edx, 4);
+ signature[12] = 0;
+
+ if (strcmp(signature, "KVMKVMKVM") == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
new file mode 100644
index 000000000000..25179a29f208
--- /dev/null
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+struct x86_emulate_ctxt;
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
+struct x86_emulate_ops {
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch, stack operations, and others.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * write_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu);
+
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+ u8 data[15];
+ unsigned long start;
+ unsigned long end;
+};
+
+struct decode_cache {
+ u8 twobyte;
+ u8 b;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ u8 rex_prefix;
+ struct operand src;
+ struct operand dst;
+ bool has_seg_override;
+ u8 seg_override;
+ unsigned int d;
+ unsigned long regs[NR_VCPU_REGS];
+ unsigned long eip;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 use_modrm_ea;
+ bool rip_relative;
+ unsigned long modrm_ea;
+ void *modrm_ptr;
+ unsigned long modrm_val;
+ struct fetch_cache fetch;
+};
+
+struct x86_emulate_ctxt {
+ /* Register state before/after emulation. */
+ struct kvm_vcpu *vcpu;
+
+ /* Linear faulting address (if emulating a page-faulting instruction) */
+ unsigned long eflags;
+
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+
+ u32 cs_base;
+
+ /* decode cache */
+
+ struct decode_cache decode;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
+
+/* Execution mode, passed to the emulator. */
+#define X86EMUL_MODE_REAL 0 /* Real mode. */
+#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
+#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
+#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+
+/* Host execution mode. */
+#if defined(__i386__)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops);
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
new file mode 100644
index 000000000000..46727eb37bfe
--- /dev/null
+++ b/arch/x86/include/asm/ldt.h
@@ -0,0 +1,40 @@
+/*
+ * ldt.h
+ *
+ * Definitions of structures used with the modify_ldt system call.
+ */
+#ifndef _ASM_X86_LDT_H
+#define _ASM_X86_LDT_H
+
+/* Maximum number of LDT entries supported. */
+#define LDT_ENTRIES 8192
+/* The size of each LDT entry. */
+#define LDT_ENTRY_SIZE 8
+
+#ifndef __ASSEMBLY__
+/*
+ * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
+ * not to the default values if you still want to do syscalls. This
+ * call is more for 32bit mode therefore.
+ */
+struct user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+#ifdef __x86_64__
+ unsigned int lm:1;
+#endif
+};
+
+#define MODIFY_LDT_CONTENTS_DATA 0
+#define MODIFY_LDT_CONTENTS_STACK 1
+#define MODIFY_LDT_CONTENTS_CODE 2
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
new file mode 100644
index 000000000000..d28a507cef39
--- /dev/null
+++ b/arch/x86/include/asm/lguest.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_X86_LGUEST_H
+#define _ASM_X86_LGUEST_H
+
+#define GDT_ENTRY_LGUEST_CS 10
+#define GDT_ENTRY_LGUEST_DS 11
+#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <asm/desc.h>
+
+#define GUEST_PL 1
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -4M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFC00000
+
+/* Found in switcher.S */
+extern unsigned long default_idt_entries[];
+
+/* Declarations for definitions in lguest_guest.S */
+extern char lguest_noirq_start[], lguest_noirq_end[];
+extern const char lgstart_cli[], lgend_cli[];
+extern const char lgstart_sti[], lgend_sti[];
+extern const char lgstart_popf[], lgend_popf[];
+extern const char lgstart_pushf[], lgend_pushf[];
+extern const char lgstart_iret[], lgend_iret[];
+
+extern void lguest_iret(void);
+extern void lguest_init(void);
+
+struct lguest_regs {
+ /* Manually saved part. */
+ unsigned long eax, ebx, ecx, edx;
+ unsigned long esi, edi, ebp;
+ unsigned long gs;
+ unsigned long fs, ds, es;
+ unsigned long trapnum, errcode;
+ /* Trap pushed part */
+ unsigned long eip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long esp;
+ unsigned long ss;
+};
+
+/* This is a guest-specific page (mapped ro) into the guest. */
+struct lguest_ro_state {
+ /* Host information we need to restore when we switch back. */
+ u32 host_cr3;
+ struct desc_ptr host_idt_desc;
+ struct desc_ptr host_gdt_desc;
+ u32 host_sp;
+
+ /* Fields which are used when guest is running. */
+ struct desc_ptr guest_idt_desc;
+ struct desc_ptr guest_gdt_desc;
+ struct x86_hw_tss guest_tss;
+ struct desc_struct guest_idt[IDT_ENTRIES];
+ struct desc_struct guest_gdt[GDT_ENTRIES];
+};
+
+struct lg_cpu_arch {
+ /* The GDT entries copied into lguest_ro_state when running. */
+ struct desc_struct gdt[GDT_ENTRIES];
+
+ /* The IDT entries: some copied into lguest_ro_state when running. */
+ struct desc_struct idt[IDT_ENTRIES];
+
+ /* The address of the last guest-visible pagefault (ie. cr2). */
+ unsigned long last_pagefault;
+};
+
+static inline void lguest_set_ts(void)
+{
+ u32 cr0;
+
+ cr0 = read_cr0();
+ if (!(cr0 & 8))
+ write_cr0(cr0 | 8);
+}
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
+#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
new file mode 100644
index 000000000000..43894428c3c2
--- /dev/null
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -0,0 +1,71 @@
+/* Architecture specific portion of the lguest hypercalls */
+#ifndef _ASM_X86_LGUEST_HCALL_H
+#define _ASM_X86_LGUEST_HCALL_H
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_SHUTDOWN 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_SET_CLOCKEVENT 9
+#define LHCALL_HALT 10
+#define LHCALL_SET_PTE 14
+#define LHCALL_SET_PMD 15
+#define LHCALL_LOAD_TLS 16
+#define LHCALL_NOTIFY 17
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
+#define LGUEST_SHUTDOWN_POWEROFF 1
+#define LGUEST_SHUTDOWN_RESTART 2
+
+#ifndef __ASSEMBLY__
+#include <asm/hw_irq.h>
+
+/*G:031 But first, how does our Guest contact the Host to ask for privileged
+ * operations? There are two ways: the direct way is to make a "hypercall",
+ * to make requests of the Host Itself.
+ *
+ * Our hypercall mechanism uses the highest unused trap code (traps 32 and
+ * above are used by real hardware interrupts). Fifteen hypercalls are
+ * available: the hypercall number is put in the %eax register, and the
+ * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
+ * value makes sense, it's returned in %eax.
+ *
+ * Grossly invalid calls result in Sudden Death at the hands of the vengeful
+ * Host, rather than returning failure. This reflects Winston Churchill's
+ * definition of a gentleman: "someone who is only rude intentionally". */
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* "int" is the Intel instruction to trigger a trap. */
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ /* The call in %eax (aka "a") might be overwritten */
+ : "=a"(call)
+ /* The arguments are in %eax, %edx, %ebx & %ecx */
+ : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+ /* "memory" means this might write somewhere in memory.
+ * This isn't true for all calls, but it's safe to tell
+ * gcc that it might happen so it doesn't get clever. */
+ : "memory");
+ return call;
+}
+/*:*/
+
+/* Can't use our min() macro here: needs to be a constant */
+#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
+
+#define LHCALL_RING_SIZE 64
+struct hcall_args {
+ /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
+ unsigned long arg0, arg2, arg3, arg1;
+};
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
new file mode 100644
index 000000000000..5d98d0b68ffc
--- /dev/null
+++ b/arch/x86/include/asm/linkage.h
@@ -0,0 +1,121 @@
+#ifndef _ASM_X86_LINKAGE_H
+#define _ASM_X86_LINKAGE_H
+
+#undef notrace
+#define notrace __attribute__((no_instrument_function))
+
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
+#ifdef CONFIG_X86_32
+#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+/*
+ * For 32-bit UML - mark functions implemented in assembly that use
+ * regparm input parameters:
+ */
+#define asmregparm __attribute__((regparm(3)))
+
+/*
+ * Make sure the compiler doesn't do anything stupid with the
+ * arguments on the stack - they are owned by the *caller*, not
+ * the callee. This just fools gcc into not spilling into them,
+ * and keeps it from doing tailcall recursion and/or using the
+ * stack slots for temporaries, since they are live and "used"
+ * all the way to the end of the function.
+ *
+ * NOTE! On x86-64, all the arguments are in registers, so this
+ * only matters on a 32-bit kernel.
+ */
+#define asmlinkage_protect(n, ret, args...) \
+ __asmlinkage_protect##n(ret, ##args)
+#define __asmlinkage_protect_n(ret, args...) \
+ __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
+#define __asmlinkage_protect0(ret) \
+ __asmlinkage_protect_n(ret)
+#define __asmlinkage_protect1(ret, arg1) \
+ __asmlinkage_protect_n(ret, "g" (arg1))
+#define __asmlinkage_protect2(ret, arg1, arg2) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
+#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
+#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4))
+#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4), "g" (arg5))
+#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
+ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
+ "g" (arg4), "g" (arg5), "g" (arg6))
+
+#endif
+
+#ifdef CONFIG_X86_ALIGNMENT_16
+#define __ALIGN .align 16,0x90
+#define __ALIGN_STR ".align 16,0x90"
+#endif
+
+/*
+ * to check ENTRY_X86/END_X86 and
+ * KPROBE_ENTRY_X86/KPROBE_END_X86
+ * unbalanced-missed-mixed appearance
+ */
+#define __set_entry_x86 .set ENTRY_X86_IN, 0
+#define __unset_entry_x86 .set ENTRY_X86_IN, 1
+#define __set_kprobe_x86 .set KPROBE_X86_IN, 0
+#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1
+
+#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed"
+
+#define __check_entry_x86 \
+ .ifdef ENTRY_X86_IN; \
+ .ifeq ENTRY_X86_IN; \
+ __macro_err_x86; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_kprobe_x86 \
+ .ifdef KPROBE_X86_IN; \
+ .ifeq KPROBE_X86_IN; \
+ __macro_err_x86; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_entry_kprobe_x86 \
+ __check_entry_x86; \
+ __check_kprobe_x86
+
+#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86
+
+#define ENTRY_X86(name) \
+ __check_entry_kprobe_x86; \
+ __set_entry_x86; \
+ .globl name; \
+ __ALIGN; \
+ name:
+
+#define END_X86(name) \
+ __unset_entry_x86; \
+ __check_entry_kprobe_x86; \
+ .size name, .-name
+
+#define KPROBE_ENTRY_X86(name) \
+ __check_entry_kprobe_x86; \
+ __set_kprobe_x86; \
+ .pushsection .kprobes.text, "ax"; \
+ .globl name; \
+ __ALIGN; \
+ name:
+
+#define KPROBE_END_X86(name) \
+ __unset_kprobe_x86; \
+ __check_entry_kprobe_x86; \
+ .size name, .-name; \
+ .popsection
+
+#endif /* _ASM_X86_LINKAGE_H */
+
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
new file mode 100644
index 000000000000..47b9b6f19057
--- /dev/null
+++ b/arch/x86/include/asm/local.h
@@ -0,0 +1,235 @@
+#ifndef _ASM_X86_LOCAL_H
+#define _ASM_X86_LOCAL_H
+
+#include <linux/percpu.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+#include <asm/asm.h>
+
+typedef struct {
+ atomic_long_t a;
+} local_t;
+
+#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
+
+#define local_read(l) atomic_long_read(&(l)->a)
+#define local_set(l, i) atomic_long_set(&(l)->a, (i))
+
+static inline void local_inc(local_t *l)
+{
+ asm volatile(_ASM_INC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_dec(local_t *l)
+{
+ asm volatile(_ASM_DEC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_add(long i, local_t *l)
+{
+ asm volatile(_ASM_ADD "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+static inline void local_sub(long i, local_t *l)
+{
+ asm volatile(_ASM_SUB "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+/**
+ * local_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @l: pointer to type local_t
+ *
+ * Atomically subtracts @i from @l and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_sub_and_test(long i, local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_SUB "%2,%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * local_dec_and_test - decrement and test
+ * @l: pointer to type local_t
+ *
+ * Atomically decrements @l by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int local_dec_and_test(local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_DEC "%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * local_inc_and_test - increment and test
+ * @l: pointer to type local_t
+ *
+ * Atomically increments @l by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_inc_and_test(local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_INC "%0; sete %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+/**
+ * local_add_negative - add and test if negative
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int local_add_negative(long i, local_t *l)
+{
+ unsigned char c;
+
+ asm volatile(_ASM_ADD "%2,%0; sets %1"
+ : "+m" (l->a.counter), "=qm" (c)
+ : "ir" (i) : "memory");
+ return c;
+}
+
+/**
+ * local_add_return - add and return
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns @i + @l
+ */
+static inline long local_add_return(long i, local_t *l)
+{
+ long __i;
+#ifdef CONFIG_M386
+ unsigned long flags;
+ if (unlikely(boot_cpu_data.x86 <= 3))
+ goto no_xadd;
+#endif
+ /* Modern 486+ processor */
+ __i = i;
+ asm volatile(_ASM_XADD "%0, %1;"
+ : "+r" (i), "+m" (l->a.counter)
+ : : "memory");
+ return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+ local_irq_save(flags);
+ __i = local_read(l);
+ local_set(l, i + __i);
+ local_irq_restore(flags);
+ return i + __i;
+#endif
+}
+
+static inline long local_sub_return(long i, local_t *l)
+{
+ return local_add_return(-i, l);
+}
+
+#define local_inc_return(l) (local_add_return(1, l))
+#define local_dec_return(l) (local_sub_return(1, l))
+
+#define local_cmpxchg(l, o, n) \
+ (cmpxchg_local(&((l)->a.counter), (o), (n)))
+/* Always has a lock prefix */
+#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+
+/**
+ * local_add_unless - add unless the number is a given value
+ * @l: pointer of type local_t
+ * @a: the amount to add to l...
+ * @u: ...unless l is equal to u.
+ *
+ * Atomically adds @a to @l, so long as it was not @u.
+ * Returns non-zero if @l was not @u, and zero otherwise.
+ */
+#define local_add_unless(l, a, u) \
+({ \
+ long c, old; \
+ c = local_read((l)); \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = local_cmpxchg((l), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
+ c = old; \
+ } \
+ c != (u); \
+})
+#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+
+/* On x86_32, these are no better than the atomic variants.
+ * On x86-64 these are better than the atomic variants on SMP kernels
+ * because they dont use a lock prefix.
+ */
+#define __local_inc(l) local_inc(l)
+#define __local_dec(l) local_dec(l)
+#define __local_add(i, l) local_add((i), (l))
+#define __local_sub(i, l) local_sub((i), (l))
+
+/* Use these for per-cpu local_t variables: on some archs they are
+ * much more efficient than these naive implementations. Note they take
+ * a variable, not an address.
+ *
+ * X86_64: This could be done better if we moved the per cpu data directly
+ * after GS.
+ */
+
+/* Need to disable preemption for the cpu local counters otherwise we could
+ still access a variable of a previous CPU in a non atomic way. */
+#define cpu_local_wrap_v(l) \
+({ \
+ local_t res__; \
+ preempt_disable(); \
+ res__ = (l); \
+ preempt_enable(); \
+ res__; \
+})
+#define cpu_local_wrap(l) \
+({ \
+ preempt_disable(); \
+ (l); \
+ preempt_enable(); \
+}) \
+
+#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
+#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
+#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
+#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
+#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
+#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
+
+#define __cpu_local_inc(l) cpu_local_inc((l))
+#define __cpu_local_dec(l) cpu_local_dec((l))
+#define __cpu_local_add(i, l) cpu_local_add((i), (l))
+#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
+
+#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/mach-default/apm.h
new file mode 100644
index 000000000000..20370c6db74b
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/apm.h
@@ -0,0 +1,73 @@
+/*
+ * Machine specific APM BIOS functions for generic.
+ * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+
+#ifndef _ASM_X86_MACH_DEFAULT_APM_H
+#define _ASM_X86_MACH_DEFAULT_APM_H
+
+#ifdef APM_ZERO_SEGS
+# define APM_DO_ZERO_SEGS \
+ "pushl %%ds\n\t" \
+ "pushl %%es\n\t" \
+ "xorl %%edx, %%edx\n\t" \
+ "mov %%dx, %%ds\n\t" \
+ "mov %%dx, %%es\n\t" \
+ "mov %%dx, %%fs\n\t" \
+ "mov %%dx, %%gs\n\t"
+# define APM_DO_POP_SEGS \
+ "popl %%es\n\t" \
+ "popl %%ds\n\t"
+#else
+# define APM_DO_ZERO_SEGS
+# define APM_DO_POP_SEGS
+#endif
+
+static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
+ u32 *eax, u32 *ebx, u32 *ecx,
+ u32 *edx, u32 *esi)
+{
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%al\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx),
+ "=S" (*esi)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+}
+
+static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
+ u32 ecx_in, u32 *eax)
+{
+ int cx, dx, si;
+ u8 error;
+
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%bl\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx),
+ "=S" (si)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+ return error;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_APM_H */
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/mach-default/do_timer.h
new file mode 100644
index 000000000000..23ecda0b28a0
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/do_timer.h
@@ -0,0 +1,16 @@
+/* defines for inline arch setup functions */
+#include <linux/clockchips.h>
+
+#include <asm/i8259.h>
+#include <asm/i8253.h>
+
+/**
+ * do_timer_interrupt_hook - hook into timer tick
+ *
+ * Call the pit clock event handler. see asm/i8253.h
+ **/
+
+static inline void do_timer_interrupt_hook(void)
+{
+ global_clock_event->event_handler(global_clock_event);
+}
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
new file mode 100644
index 000000000000..6b1add8e31dd
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -0,0 +1,36 @@
+/*
+ * This file is designed to contain the BUILD_INTERRUPT specifications for
+ * all of the extra named interrupt vectors used by the architecture.
+ * Usually this is the Inter Process Interrupts (IPIs)
+ */
+
+/*
+ * The following vectors are part of the Linux architecture, there
+ * is no hardware IRQ pin equivalent for them, they are triggered
+ * through the ICC by us (IPIs)
+ */
+#ifdef CONFIG_X86_SMP
+BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
+BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
+BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+#endif
+
+/*
+ * every pentium local APIC has two 'local interrupts', with a
+ * soft-definable vector attached to both interrupts, one of
+ * which is a timer interrupt, the other one is error counter
+ * overflow. Linux uses the local APIC timer interrupt to get
+ * a much simpler SMP time architecture:
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
+BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
+BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+
+#ifdef CONFIG_X86_MCE_P4THERMAL
+BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
new file mode 100644
index 000000000000..6cb3a467e067
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -0,0 +1,158 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
+#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#include <mach_apicdef.h>
+#include <asm/smp.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+static inline cpumask_t target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_map;
+#else
+ return cpumask_of_cpu(0);
+#endif
+}
+
+#define NO_BALANCE_IRQ (0)
+#define esr_disable (0)
+
+#ifdef CONFIG_X86_64
+#include <asm/genapic.h>
+#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
+#define INT_DEST_MODE (genapic->int_dest_mode)
+#define TARGET_CPUS (genapic->target_cpus())
+#define apic_id_registered (genapic->apic_id_registered)
+#define init_apic_ldr (genapic->init_apic_ldr)
+#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define phys_pkg_id (genapic->phys_pkg_id)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
+#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
+#define send_IPI_self (genapic->send_IPI_self)
+#define wakeup_secondary_cpu (genapic->wakeup_cpu)
+extern void setup_apic_routing(void);
+#else
+#define INT_DELIVERY_MODE dest_LowestPrio
+#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
+#define TARGET_CPUS (target_cpus())
+#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static inline void init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+static inline int apic_id_registered(void)
+{
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return cpus_addr(cpumask)[0];
+}
+
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void setup_apic_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "Flat", nr_ioapics);
+#endif
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+static inline cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+#endif
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ return phys_map;
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ return 1 << cpu;
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
+{
+ return physid_mask_of_physid(phys_apicid);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
new file mode 100644
index 000000000000..53179936d6c6
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apicdef.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
+#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
+
+#include <asm/apic.h>
+
+#ifdef CONFIG_X86_64
+#define APIC_ID_MASK (genapic->apic_id_mask)
+#define GET_APIC_ID(x) (genapic->get_apic_id(x))
+#define SET_APIC_ID(x) (genapic->set_apic_id(x))
+#else
+#define APIC_ID_MASK (0xF<<24)
+static inline unsigned get_apic_id(unsigned long x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ if (APIC_XAPIC(ver))
+ return (((x)>>24)&0xFF);
+ else
+ return (((x)>>24)&0xF);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
new file mode 100644
index 000000000000..fabca01ebacf
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -0,0 +1,64 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
+#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
+
+/* Avoid include hell */
+#define NMI_VECTOR 0x02
+
+void send_IPI_mask_bitmask(cpumask_t mask, int vector);
+void __send_IPI_shortcut(unsigned int shortcut, int vector);
+
+extern int no_broadcast;
+
+#ifdef CONFIG_X86_64
+#include <asm/genapic.h>
+#define send_IPI_mask (genapic->send_IPI_mask)
+#else
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_bitmask(mask, vector);
+}
+#endif
+
+static inline void __local_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR) {
+ cpumask_t mask = cpu_online_map;
+
+ cpu_clear(smp_processor_id(), mask);
+ send_IPI_mask(mask, vector);
+ } else
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
+}
+
+static inline void __local_send_IPI_all(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR)
+ send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
+}
+
+#ifdef CONFIG_X86_64
+#define send_IPI_allbutself (genapic->send_IPI_allbutself)
+#define send_IPI_all (genapic->send_IPI_all)
+#else
+static inline void send_IPI_allbutself(int vector)
+{
+ /*
+ * if there are no other CPUs in the system then we get an APIC send
+ * error if we try to broadcast, thus avoid sending IPIs in this case.
+ */
+ if (!(num_online_cpus() > 1))
+ return;
+
+ __local_send_IPI_allbutself(vector);
+ return;
+}
+
+static inline void send_IPI_all(int vector)
+{
+ __local_send_IPI_all(vector);
+}
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
new file mode 100644
index 000000000000..8c1ea21238a7
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpparse.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
+#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
+
+static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
new file mode 100644
index 000000000000..e85ede686be8
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpspec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
+#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
+
+#define MAX_IRQ_SOURCES 256
+
+#if CONFIG_BASE_SMALL == 0
+#define MAX_MP_BUSSES 256
+#else
+#define MAX_MP_BUSSES 32
+#endif
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach-default/mach_timer.h
new file mode 100644
index 000000000000..853728519ae9
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_timer.h
@@ -0,0 +1,48 @@
+/*
+ * Machine specific calibrate_tsc() for generic.
+ * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+/* ------ Calibrate the TSC -------
+ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
+ * Too much 64-bit arithmetic here to do this cleanly in C, and for
+ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
+ * output busy loop as low as possible. We avoid reading the CTC registers
+ * directly because of the awkward 8-bit access mechanism of the 82C54
+ * device.
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+
+#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
+#define CALIBRATE_LATCH \
+ ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+
+static inline void mach_prepare_counter(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Now let's take care of CTC channel 2
+ *
+ * Set the Gate high, program CTC channel 2 for mode 0,
+ * (interrupt on terminal count mode), binary count,
+ * load 5 * LATCH count, (LSB and MSB) to begin countdown.
+ *
+ * Some devices need a delay here.
+ */
+ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
+ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
+ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
+}
+
+static inline void mach_countup(unsigned long *count_p)
+{
+ unsigned long count = 0;
+ do {
+ count++;
+ } while ((inb_p(0x61) & 0x20) == 0);
+ *count_p = count;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach-default/mach_traps.h
new file mode 100644
index 000000000000..f7920601e472
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_traps.h
@@ -0,0 +1,33 @@
+/*
+ * Machine specific NMI handling for generic.
+ * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+
+#include <asm/mc146818rtc.h>
+
+static inline unsigned char get_nmi_reason(void)
+{
+ return inb(0x61);
+}
+
+static inline void reassert_nmi(void)
+{
+ int old_reg = -1;
+
+ if (do_i_have_lock_cmos())
+ old_reg = current_lock_cmos_reg();
+ else
+ lock_cmos(0); /* register doesn't matter here */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ if (old_reg >= 0)
+ outb(old_reg, 0x70);
+ else
+ unlock_cmos();
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
new file mode 100644
index 000000000000..ceb013660146
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
+#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
+
+#define TRAMPOLINE_PHYS_LOW (0x467)
+#define TRAMPOLINE_PHYS_HIGH (0x469)
+
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+ while (!atomic_read(deassert))
+ cpu_relax();
+ return;
+}
+
+/* Nothing to do for most platforms, since cleared by the INIT cycle */
+static inline void smp_callin_clear_local_apic(void)
+{
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+}
+
+extern void __inquire_remote_apic(int apicid);
+
+static inline void inquire_remote_apic(int apicid)
+{
+ if (apic_verbosity >= APIC_DEBUG)
+ __inquire_remote_apic(apicid);
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/mach-default/pci-functions.h
new file mode 100644
index 000000000000..ed0bab427354
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/pci-functions.h
@@ -0,0 +1,19 @@
+/*
+ * PCI BIOS function numbering for conventional PCI BIOS
+ * systems
+ */
+
+#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX
+#define PCIBIOS_PCI_BIOS_PRESENT 0xb101
+#define PCIBIOS_FIND_PCI_DEVICE 0xb102
+#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103
+#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106
+#define PCIBIOS_READ_CONFIG_BYTE 0xb108
+#define PCIBIOS_READ_CONFIG_WORD 0xb109
+#define PCIBIOS_READ_CONFIG_DWORD 0xb10a
+#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
+#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
+#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
+#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
+#define PCIBIOS_SET_PCI_HW_INT 0xb10f
+
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/mach-default/setup_arch.h
new file mode 100644
index 000000000000..38846208b548
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/setup_arch.h
@@ -0,0 +1,3 @@
+/* Hook to call BIOS initialisation function */
+
+/* no action for generic */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
new file mode 100644
index 000000000000..23bf52103b89
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -0,0 +1,61 @@
+/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
+ * which needs to alter them. */
+
+static inline void smpboot_clear_io_apic_irqs(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ io_apic_irqs = 0;
+#endif
+}
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ CMOS_WRITE(0xa, 0xf);
+ local_flush_tlb();
+ pr_debug("1.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
+ pr_debug("2.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
+ pr_debug("3.\n");
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ CMOS_WRITE(0, 0xf);
+
+ *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+}
+
+static inline void __init smpboot_setup_io_apic(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Here we can be sure that there is an IO-APIC in the system. Let's
+ * go and set it up:
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+ else {
+ nr_ioapics = 0;
+ localise_nmi_watchdog();
+ }
+#endif
+}
+
+static inline void smpboot_clear_io_apic(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ nr_ioapics = 0;
+#endif
+}
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
new file mode 100644
index 000000000000..995c45efdb33
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/gpio.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
+#define _ASM_X86_MACH_GENERIC_GPIO_H
+
+int gpio_request(unsigned gpio, const char *label);
+void gpio_free(unsigned gpio);
+int gpio_direction_input(unsigned gpio);
+int gpio_direction_output(unsigned gpio, int value);
+int gpio_get_value(unsigned gpio);
+void gpio_set_value(unsigned gpio, int value);
+int gpio_to_irq(unsigned gpio);
+int irq_to_gpio(unsigned irq);
+
+#include <asm-generic/gpio.h> /* cansleep wrappers */
+
+#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
new file mode 100644
index 000000000000..e430f47df667
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
+#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
+
+#include <asm/genapic.h>
+
+#define esr_disable (genapic->ESR_DISABLE)
+#define NO_BALANCE_IRQ (genapic->no_balance_irq)
+#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
+#define INT_DEST_MODE (genapic->int_dest_mode)
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
+#define TARGET_CPUS (genapic->target_cpus())
+#define apic_id_registered (genapic->apic_id_registered)
+#define init_apic_ldr (genapic->init_apic_ldr)
+#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
+#define setup_apic_routing (genapic->setup_apic_routing)
+#define multi_timer_check (genapic->multi_timer_check)
+#define apicid_to_node (genapic->apicid_to_node)
+#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
+#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
+#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
+#define setup_portio_remap (genapic->setup_portio_remap)
+#define check_apicid_present (genapic->check_apicid_present)
+#define check_phys_apicid_present (genapic->check_phys_apicid_present)
+#define check_apicid_used (genapic->check_apicid_used)
+#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define vector_allocation_domain (genapic->vector_allocation_domain)
+#define enable_apic_mode (genapic->enable_apic_mode)
+#define phys_pkg_id (genapic->phys_pkg_id)
+#define wakeup_secondary_cpu (genapic->wakeup_cpu)
+
+extern void generic_bigsmp_probe(void);
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
new file mode 100644
index 000000000000..68041f3802f4
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apicdef.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
+#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
+
+#ifndef APIC_DEFINITION
+#include <asm/genapic.h>
+
+#define GET_APIC_ID (genapic->get_apic_id)
+#define APIC_ID_MASK (genapic->apic_id_mask)
+#endif
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
new file mode 100644
index 000000000000..ffd637e3c3d9
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_ipi.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
+#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
+
+#include <asm/genapic.h>
+
+#define send_IPI_mask (genapic->send_IPI_mask)
+#define send_IPI_allbutself (genapic->send_IPI_allbutself)
+#define send_IPI_all (genapic->send_IPI_all)
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
new file mode 100644
index 000000000000..048f1d468535
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpparse.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
+#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
+
+
+extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+
+extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
new file mode 100644
index 000000000000..bbab5ccfd4fe
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpspec.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
+#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
+
+#define MAX_IRQ_SOURCES 256
+
+/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
+/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
+#define MAX_MP_BUSSES 260
+
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
new file mode 100644
index 000000000000..1ab16b168c8a
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
+#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
+
+#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low)
+#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high)
+#define wait_for_init_deassert (genapic->wait_for_init_deassert)
+#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic)
+#define store_NMI_vector (genapic->store_NMI_vector)
+#define restore_NMI_vector (genapic->restore_NMI_vector)
+#define inquire_remote_apic (genapic->inquire_remote_apic)
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
new file mode 100644
index 000000000000..c210ab5788b0
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/gpio.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
+#define _ASM_X86_MACH_RDC321X_GPIO_H
+
+#include <linux/kernel.h>
+
+extern int rdc_gpio_get_value(unsigned gpio);
+extern void rdc_gpio_set_value(unsigned gpio, int value);
+extern int rdc_gpio_direction_input(unsigned gpio);
+extern int rdc_gpio_direction_output(unsigned gpio, int value);
+extern int rdc_gpio_request(unsigned gpio, const char *label);
+extern void rdc_gpio_free(unsigned gpio);
+extern void __init rdc321x_gpio_setup(void);
+
+/* Wrappers for the arch-neutral GPIO API */
+
+static inline int gpio_request(unsigned gpio, const char *label)
+{
+ return rdc_gpio_request(gpio, label);
+}
+
+static inline void gpio_free(unsigned gpio)
+{
+ might_sleep();
+ rdc_gpio_free(gpio);
+}
+
+static inline int gpio_direction_input(unsigned gpio)
+{
+ return rdc_gpio_direction_input(gpio);
+}
+
+static inline int gpio_direction_output(unsigned gpio, int value)
+{
+ return rdc_gpio_direction_output(gpio, value);
+}
+
+static inline int gpio_get_value(unsigned gpio)
+{
+ return rdc_gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned gpio, int value)
+{
+ rdc_gpio_set_value(gpio, value);
+}
+
+static inline int gpio_to_irq(unsigned gpio)
+{
+ return gpio;
+}
+
+static inline int irq_to_gpio(unsigned irq)
+{
+ return irq;
+}
+
+/* For cansleep */
+#include <asm-generic/gpio.h>
+
+#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
new file mode 100644
index 000000000000..c8e9c8bed3d0
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
@@ -0,0 +1,12 @@
+#define PFX "rdc321x: "
+
+/* General purpose configuration and data registers */
+#define RDC3210_CFGREG_ADDR 0x0CF8
+#define RDC3210_CFGREG_DATA 0x0CFC
+
+#define RDC321X_GPIO_CTRL_REG1 0x48
+#define RDC321X_GPIO_CTRL_REG2 0x84
+#define RDC321X_GPIO_DATA_REG1 0x4c
+#define RDC321X_GPIO_DATA_REG2 0x88
+
+#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h
new file mode 100644
index 000000000000..9e5a459fd15b
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/do_timer.h
@@ -0,0 +1,17 @@
+/* defines for inline arch setup functions */
+#include <linux/clockchips.h>
+
+#include <asm/voyager.h>
+#include <asm/i8253.h>
+
+/**
+ * do_timer_interrupt_hook - hook into timer tick
+ *
+ * Call the pit clock event handler. see asm/i8253.h
+ **/
+static inline void do_timer_interrupt_hook(void)
+{
+ global_clock_event->event_handler(global_clock_event);
+ voyager_timer_interrupt();
+}
+
diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h
new file mode 100644
index 000000000000..ae52624b5937
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/entry_arch.h
@@ -0,0 +1,26 @@
+/* -*- mode: c; c-basic-offset: 8 -*- */
+
+/* Copyright (C) 2002
+ *
+ * Author: James.Bottomley@HansenPartnership.com
+ *
+ * linux/arch/i386/voyager/entry_arch.h
+ *
+ * This file builds the VIC and QIC CPI gates
+ */
+
+/* initialise the voyager interrupt gates
+ *
+ * This uses the macros in irq.h to set up assembly jump gates. The
+ * calls are then redirected to the same routine with smp_ prefixed */
+BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT)
+BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT)
+BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0);
+
+/* do all the QIC interrupts */
+BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI);
+BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI);
+BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI);
+BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI);
+BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI);
+BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI);
diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h
new file mode 100644
index 000000000000..71729ca05cd7
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/setup_arch.h
@@ -0,0 +1,12 @@
+#include <asm/voyager.h>
+#include <asm/setup.h>
+#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \
+ (&boot_params.apm_bios_info))
+
+/* Hook to call BIOS initialisation function */
+
+/* for voyager, pass the voyager BIOS/SUS info area to the detection
+ * routines */
+
+#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO);
+
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
new file mode 100644
index 000000000000..5a65b107ad58
--- /dev/null
+++ b/arch/x86/include/asm/math_emu.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_MATH_EMU_H
+#define _ASM_X86_MATH_EMU_H
+
+/* This structure matches the layout of the data saved to the stack
+ following a device-not-present interrupt, part of it saved
+ automatically by the 80386/80486.
+ */
+struct info {
+ long ___orig_eip;
+ long ___ebx;
+ long ___ecx;
+ long ___edx;
+ long ___esi;
+ long ___edi;
+ long ___ebp;
+ long ___eax;
+ long ___ds;
+ long ___es;
+ long ___fs;
+ long ___orig_eax;
+ long ___eip;
+ long ___cs;
+ long ___eflags;
+ long ___esp;
+ long ___ss;
+ long ___vm86_es; /* This and the following only in vm86 mode */
+ long ___vm86_ds;
+ long ___vm86_fs;
+ long ___vm86_gs;
+};
+#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
new file mode 100644
index 000000000000..01fdf5674e24
--- /dev/null
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -0,0 +1,104 @@
+/*
+ * Machine dependent access functions for RTC registers.
+ */
+#ifndef _ASM_X86_MC146818RTC_H
+#define _ASM_X86_MC146818RTC_H
+
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <linux/mc146818rtc.h>
+
+#ifndef RTC_PORT
+#define RTC_PORT(x) (0x70 + (x))
+#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
+#endif
+
+#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+/*
+ * This lock provides nmi access to the CMOS/RTC registers. It has some
+ * special properties. It is owned by a CPU and stores the index register
+ * currently being accessed (if owned). The idea here is that it works
+ * like a normal lock (normally). However, in an NMI, the NMI code will
+ * first check to see if its CPU owns the lock, meaning that the NMI
+ * interrupted during the read/write of the device. If it does, it goes ahead
+ * and performs the access and then restores the index register. If it does
+ * not, it locks normally.
+ *
+ * Note that since we are working with NMIs, we need this lock even in
+ * a non-SMP machine just to mark that the lock is owned.
+ *
+ * This only works with compare-and-swap. There is no other way to
+ * atomically claim the lock and set the owner.
+ */
+#include <linux/smp.h>
+extern volatile unsigned long cmos_lock;
+
+/*
+ * All of these below must be called with interrupts off, preempt
+ * disabled, etc.
+ */
+
+static inline void lock_cmos(unsigned char reg)
+{
+ unsigned long new;
+ new = ((smp_processor_id() + 1) << 8) | reg;
+ for (;;) {
+ if (cmos_lock) {
+ cpu_relax();
+ continue;
+ }
+ if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
+ return;
+ }
+}
+
+static inline void unlock_cmos(void)
+{
+ cmos_lock = 0;
+}
+
+static inline int do_i_have_lock_cmos(void)
+{
+ return (cmos_lock >> 8) == (smp_processor_id() + 1);
+}
+
+static inline unsigned char current_lock_cmos_reg(void)
+{
+ return cmos_lock & 0xff;
+}
+
+#define lock_cmos_prefix(reg) \
+ do { \
+ unsigned long cmos_flags; \
+ local_irq_save(cmos_flags); \
+ lock_cmos(reg)
+
+#define lock_cmos_suffix(reg) \
+ unlock_cmos(); \
+ local_irq_restore(cmos_flags); \
+ } while (0)
+#else
+#define lock_cmos_prefix(reg) do {} while (0)
+#define lock_cmos_suffix(reg) do {} while (0)
+#define lock_cmos(reg)
+#define unlock_cmos()
+#define do_i_have_lock_cmos() 0
+#define current_lock_cmos_reg() 0
+#endif
+
+/*
+ * The yet supported machines all access the RTC index register via
+ * an ISA port access but the way to access the date register differs ...
+ */
+#define CMOS_READ(addr) rtc_cmos_read(addr)
+#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
+unsigned char rtc_cmos_read(unsigned char addr);
+void rtc_cmos_write(unsigned char val, unsigned char addr);
+
+extern int mach_set_rtc_mmss(unsigned long nowtime);
+extern unsigned long mach_get_cmos_time(void);
+
+#define RTC_IRQ 8
+
+#endif /* _ASM_X86_MC146818RTC_H */
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
new file mode 100644
index 000000000000..eedbb6cc1efb
--- /dev/null
+++ b/arch/x86/include/asm/mca.h
@@ -0,0 +1,43 @@
+/* -*- mode: c; c-basic-offset: 8 -*- */
+
+/* Platform specific MCA defines */
+#ifndef _ASM_X86_MCA_H
+#define _ASM_X86_MCA_H
+
+/* Maximal number of MCA slots - actually, some machines have less, but
+ * they all have sufficient number of POS registers to cover 8.
+ */
+#define MCA_MAX_SLOT_NR 8
+
+/* Most machines have only one MCA bus. The only multiple bus machines
+ * I know have at most two */
+#define MAX_MCA_BUSSES 2
+
+#define MCA_PRIMARY_BUS 0
+#define MCA_SECONDARY_BUS 1
+
+/* Dummy slot numbers on primary MCA for integrated functions */
+#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR)
+#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1)
+#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
+
+/* Dummy POS values for integrated functions */
+#define MCA_DUMMY_POS_START 0x10000
+#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1)
+#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2)
+#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3)
+
+/* MCA registers */
+
+#define MCA_MOTHERBOARD_SETUP_REG 0x94
+#define MCA_ADAPTER_SETUP_REG 0x96
+#define MCA_POS_REG(n) (0x100+(n))
+
+#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */
+
+/* Max number of adapters, including both slots and various integrated
+ * things.
+ */
+#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
+
+#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
new file mode 100644
index 000000000000..45271aef82dd
--- /dev/null
+++ b/arch/x86/include/asm/mca_dma.h
@@ -0,0 +1,201 @@
+#ifndef _ASM_X86_MCA_DMA_H
+#define _ASM_X86_MCA_DMA_H
+
+#include <asm/io.h>
+#include <linux/ioport.h>
+
+/*
+ * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to
+ * standard PC dma, but it certainly has its quirks. DMA register addresses
+ * are in a different place and there are some added functions. Most of this
+ * should be pretty obvious on inspection. Note that the user must divide
+ * count by 2 when using 16-bit dma; that is not handled by these functions.
+ *
+ * Ramen Noodles are yummy.
+ *
+ * 1998 Tymm Twillman <tymm@computer.org>
+ */
+
+/*
+ * Registers that are used by the DMA controller; FN is the function register
+ * (tell the controller what to do) and EXE is the execution register (how
+ * to do it)
+ */
+
+#define MCA_DMA_REG_FN 0x18
+#define MCA_DMA_REG_EXE 0x1A
+
+/*
+ * Functions that the DMA controller can do
+ */
+
+#define MCA_DMA_FN_SET_IO 0x00
+#define MCA_DMA_FN_SET_ADDR 0x20
+#define MCA_DMA_FN_GET_ADDR 0x30
+#define MCA_DMA_FN_SET_COUNT 0x40
+#define MCA_DMA_FN_GET_COUNT 0x50
+#define MCA_DMA_FN_GET_STATUS 0x60
+#define MCA_DMA_FN_SET_MODE 0x70
+#define MCA_DMA_FN_SET_ARBUS 0x80
+#define MCA_DMA_FN_MASK 0x90
+#define MCA_DMA_FN_RESET_MASK 0xA0
+#define MCA_DMA_FN_MASTER_CLEAR 0xD0
+
+/*
+ * Modes (used by setting MCA_DMA_FN_MODE in the function register)
+ *
+ * Note that the MODE_READ is read from memory (write to device), and
+ * MODE_WRITE is vice-versa.
+ */
+
+#define MCA_DMA_MODE_XFER 0x04 /* read by default */
+#define MCA_DMA_MODE_READ 0x04 /* same as XFER */
+#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */
+#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */
+#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */
+
+
+/**
+ * mca_enable_dma - channel to enable DMA on
+ * @dmanr: DMA channel
+ *
+ * Enable the MCA bus DMA on a channel. This can be called from
+ * IRQ context.
+ */
+
+static inline void mca_enable_dma(unsigned int dmanr)
+{
+ outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
+}
+
+/**
+ * mca_disble_dma - channel to disable DMA on
+ * @dmanr: DMA channel
+ *
+ * Enable the MCA bus DMA on a channel. This can be called from
+ * IRQ context.
+ */
+
+static inline void mca_disable_dma(unsigned int dmanr)
+{
+ outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
+}
+
+/**
+ * mca_set_dma_addr - load a 24bit DMA address
+ * @dmanr: DMA channel
+ * @a: 24bit bus address
+ *
+ * Load the address register in the DMA controller. This has a 24bit
+ * limitation (16Mb).
+ */
+
+static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+ outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
+ outb(a & 0xff, MCA_DMA_REG_EXE);
+ outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
+ outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_get_dma_addr - load a 24bit DMA address
+ * @dmanr: DMA channel
+ *
+ * Read the address register in the DMA controller. This has a 24bit
+ * limitation (16Mb). The return is a bus address.
+ */
+
+static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
+{
+ unsigned int addr;
+
+ outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
+ addr = inb(MCA_DMA_REG_EXE);
+ addr |= inb(MCA_DMA_REG_EXE) << 8;
+ addr |= inb(MCA_DMA_REG_EXE) << 16;
+
+ return addr;
+}
+
+/**
+ * mca_set_dma_count - load a 16bit transfer count
+ * @dmanr: DMA channel
+ * @count: count
+ *
+ * Set the DMA count for this channel. This can be up to 64Kbytes.
+ * Setting a count of zero will not do what you expect.
+ */
+
+static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
+{
+ count--; /* transfers one more than count -- correct for this */
+
+ outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
+ outb(count & 0xff, MCA_DMA_REG_EXE);
+ outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_get_dma_residue - get the remaining bytes to transfer
+ * @dmanr: DMA channel
+ *
+ * This function returns the number of bytes left to transfer
+ * on this DMA channel.
+ */
+
+static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
+{
+ unsigned short count;
+
+ outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
+ count = 1 + inb(MCA_DMA_REG_EXE);
+ count += inb(MCA_DMA_REG_EXE) << 8;
+
+ return count;
+}
+
+/**
+ * mca_set_dma_io - set the port for an I/O transfer
+ * @dmanr: DMA channel
+ * @io_addr: an I/O port number
+ *
+ * Unlike the ISA bus DMA controllers the DMA on MCA bus can transfer
+ * with an I/O port target.
+ */
+
+static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
+{
+ /*
+ * DMA from a port address -- set the io address
+ */
+
+ outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
+ outb(io_addr & 0xff, MCA_DMA_REG_EXE);
+ outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE);
+}
+
+/**
+ * mca_set_dma_mode - set the DMA mode
+ * @dmanr: DMA channel
+ * @mode: mode to set
+ *
+ * The DMA controller supports several modes. The mode values you can
+ * set are-
+ *
+ * %MCA_DMA_MODE_READ when reading from the DMA device.
+ *
+ * %MCA_DMA_MODE_WRITE to writing to the DMA device.
+ *
+ * %MCA_DMA_MODE_IO to do DMA to or from an I/O port.
+ *
+ * %MCA_DMA_MODE_16 to do 16bit transfers.
+ */
+
+static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
+{
+ outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
+ outb(mode, MCA_DMA_REG_EXE);
+}
+
+#endif /* _ASM_X86_MCA_DMA_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
new file mode 100644
index 000000000000..1d6e17c2f23a
--- /dev/null
+++ b/arch/x86/include/asm/mce.h
@@ -0,0 +1,130 @@
+#ifndef _ASM_X86_MCE_H
+#define _ASM_X86_MCE_H
+
+#ifdef __x86_64__
+
+#include <asm/ioctls.h>
+#include <asm/types.h>
+
+/*
+ * Machine Check support for x86
+ */
+
+#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
+
+#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
+#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
+
+#define MCI_STATUS_VAL (1UL<<63) /* valid error */
+#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
+#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
+#define MCI_STATUS_EN (1UL<<60) /* error enabled */
+#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */
+
+/* Fields are zero when not available */
+struct mce {
+ __u64 status;
+ __u64 misc;
+ __u64 addr;
+ __u64 mcgstatus;
+ __u64 ip;
+ __u64 tsc; /* cpu time stamp counter */
+ __u64 res1; /* for future extension */
+ __u64 res2; /* dito. */
+ __u8 cs; /* code segment */
+ __u8 bank; /* machine check bank */
+ __u8 cpu; /* cpu that raised the error */
+ __u8 finished; /* entry is valid */
+ __u32 pad;
+};
+
+/*
+ * This structure contains all data related to the MCE log. Also
+ * carries a signature to make it easier to find from external
+ * debugging tools. Each entry is only valid when its finished flag
+ * is set.
+ */
+
+#define MCE_LOG_LEN 32
+
+struct mce_log {
+ char signature[12]; /* "MACHINECHECK" */
+ unsigned len; /* = MCE_LOG_LEN */
+ unsigned next;
+ unsigned flags;
+ unsigned pad0;
+ struct mce entry[MCE_LOG_LEN];
+};
+
+#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
+
+#define MCE_LOG_SIGNATURE "MACHINECHECK"
+
+#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+
+/* Software defined banks */
+#define MCE_EXTENDED_BANK 128
+#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0
+
+#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
+#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9)
+#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9)
+#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9)
+#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9)
+#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9)
+#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
+#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
+
+#endif /* __x86_64__ */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_X86_32
+extern int mce_disabled;
+#else /* CONFIG_X86_32 */
+
+#include <asm/atomic.h>
+
+void mce_log(struct mce *m);
+DECLARE_PER_CPU(struct sys_device, device_mce);
+extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+#else
+static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_amd_feature_init(struct cpuinfo_x86 *c);
+#else
+static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
+#endif
+
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+
+extern atomic_t mce_entry;
+
+extern void do_machine_check(struct pt_regs *, long);
+extern int mce_notify_user(void);
+
+#endif /* !CONFIG_X86_32 */
+
+
+
+#ifdef CONFIG_X86_MCE
+extern void mcheck_init(struct cpuinfo_x86 *c);
+#else
+#define mcheck_init(c) do { } while (0)
+#endif
+extern void stop_mce(void);
+extern void restart_mce(void);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
new file mode 100644
index 000000000000..c882664716c1
--- /dev/null
+++ b/arch/x86/include/asm/microcode.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_MICROCODE_H
+#define _ASM_X86_MICROCODE_H
+
+struct cpu_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int rev;
+};
+
+struct device;
+
+struct microcode_ops {
+ int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
+ int (*request_microcode_fw) (int cpu, struct device *device);
+
+ void (*apply_microcode) (int cpu);
+
+ int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
+ void (*microcode_fini_cpu) (int cpu);
+};
+
+struct ucode_cpu_info {
+ struct cpu_signature cpu_sig;
+ int valid;
+ void *mc;
+};
+extern struct ucode_cpu_info ucode_cpu_info[];
+
+#ifdef CONFIG_MICROCODE_INTEL
+extern struct microcode_ops * __init init_intel_microcode(void);
+#else
+static inline struct microcode_ops * __init init_intel_microcode(void)
+{
+ return NULL;
+}
+#endif /* CONFIG_MICROCODE_INTEL */
+
+#ifdef CONFIG_MICROCODE_AMD
+extern struct microcode_ops * __init init_amd_microcode(void);
+#else
+static inline struct microcode_ops * __init init_amd_microcode(void)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 000000000000..90bc4108a4fd
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#include <asm-generic/mman.h>
+
+#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+
+#define MAP_GROWSDOWN 0x0100 /* stack-like segment */
+#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
+#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
+#define MAP_LOCKED 0x2000 /* pages are locked */
+#define MAP_NORESERVE 0x4000 /* don't check for reservations */
+#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
+#define MAP_NONBLOCK 0x10000 /* do not block on IO */
+#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
+
+#define MCL_CURRENT 1 /* lock all current mappings */
+#define MCL_FUTURE 2 /* lock all future mappings */
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
new file mode 100644
index 000000000000..9b119da1d105
--- /dev/null
+++ b/arch/x86/include/asm/mmconfig.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MMCONFIG_H
+#define _ASM_X86_MMCONFIG_H
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void __cpuinit fam10h_check_enable_mmcfg(void);
+extern void __cpuinit check_enable_amd_mmconf_dmi(void);
+#else
+static inline void fam10h_check_enable_mmcfg(void) { }
+static inline void check_enable_amd_mmconf_dmi(void) { }
+#endif
+
+#endif /* _ASM_X86_MMCONFIG_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
new file mode 100644
index 000000000000..80a1dee5bea5
--- /dev/null
+++ b/arch/x86/include/asm/mmu.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_MMU_H
+#define _ASM_X86_MMU_H
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+/*
+ * The x86 doesn't have a mmu context, but
+ * we put the segment information here.
+ */
+typedef struct {
+ void *ldt;
+ int size;
+ struct mutex lock;
+ void *vdso;
+} mm_context_t;
+
+#ifdef CONFIG_SMP
+void leave_mm(int cpu);
+#else
+static inline void leave_mm(int cpu)
+{
+}
+#endif
+
+#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
new file mode 100644
index 000000000000..8aeeb3fd73db
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_MMU_CONTEXT_H
+#define _ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#ifndef CONFIG_PARAVIRT
+#include <asm-generic/mm_hooks.h>
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+}
+#endif /* !CONFIG_PARAVIRT */
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+#ifdef CONFIG_X86_32
+# include "mmu_context_32.h"
+#else
+# include "mmu_context_64.h"
+#endif
+
+#define activate_mm(prev, next) \
+do { \
+ paravirt_activate_mm((prev), (next)); \
+ switch_mm((prev), (next), NULL); \
+} while (0);
+
+
+#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
new file mode 100644
index 000000000000..7e98ce1d2c0e
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_MMU_CONTEXT_32_H
+#define _ASM_X86_MMU_CONTEXT_32_H
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ int cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ x86_write_percpu(cpu_tlbstate.active_mm, next);
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+
+ /* Re-load page tables */
+ load_cr3(next->pgd);
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_LDT_nolock(&next->context);
+ }
+#ifdef CONFIG_SMP
+ else {
+ x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload %cr3.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+ asm("movl %0,%%gs": :"r" (0));
+
+#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
new file mode 100644
index 000000000000..677d36e9540a
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -0,0 +1,54 @@
+#ifndef _ASM_X86_MMU_CONTEXT_64_H
+#define _ASM_X86_MMU_CONTEXT_64_H
+
+#include <asm/pda.h>
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+#ifdef CONFIG_SMP
+ if (read_pda(mmu_state) == TLBSTATE_OK)
+ write_pda(mmu_state, TLBSTATE_LAZY);
+#endif
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ if (likely(prev != next)) {
+ /* stop flush ipis for the previous mm */
+ cpu_clear(cpu, prev->cpu_vm_mask);
+#ifdef CONFIG_SMP
+ write_pda(mmu_state, TLBSTATE_OK);
+ write_pda(active_mm, next);
+#endif
+ cpu_set(cpu, next->cpu_vm_mask);
+ load_cr3(next->pgd);
+
+ if (unlikely(next->context.ldt != prev->context.ldt))
+ load_LDT_nolock(&next->context);
+ }
+#ifdef CONFIG_SMP
+ else {
+ write_pda(mmu_state, TLBSTATE_OK);
+ if (read_pda(active_mm) != next)
+ BUG();
+ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ */
+ load_cr3(next->pgd);
+ load_LDT_nolock(&next->context);
+ }
+ }
+#endif
+}
+
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ asm volatile("movl %0,%%fs"::"r"(0)); \
+} while (0)
+
+#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
new file mode 100644
index 000000000000..5cbf3135b971
--- /dev/null
+++ b/arch/x86/include/asm/mmx.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_MMX_H
+#define _ASM_X86_MMX_H
+
+/*
+ * MMX 3Dnow! helper operations
+ */
+
+#include <linux/types.h>
+
+extern void *_mmx_memcpy(void *to, const void *from, size_t size);
+extern void mmx_clear_page(void *page);
+extern void mmx_copy_page(void *to, void *from);
+
+#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
new file mode 100644
index 000000000000..64217ea16a36
--- /dev/null
+++ b/arch/x86/include/asm/mmzone.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "mmzone_32.h"
+#else
+# include "mmzone_64.h"
+#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
new file mode 100644
index 000000000000..07f1af494ca5
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -0,0 +1,138 @@
+/*
+ * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
+ *
+ */
+
+#ifndef _ASM_X86_MMZONE_32_H
+#define _ASM_X86_MMZONE_32_H
+
+#include <asm/smp.h>
+
+#ifdef CONFIG_NUMA
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid) (node_data[nid])
+
+#include <asm/numaq.h>
+/* summit or generic arch */
+#include <asm/srat.h>
+
+extern int get_memcfg_numa_flat(void);
+/*
+ * This allows any one NUMA architecture to be compiled
+ * for, and still fall back to the flat function if it
+ * fails.
+ */
+static inline void get_memcfg_numa(void)
+{
+
+ if (get_memcfg_numaq())
+ return;
+ if (get_memcfg_from_srat())
+ return;
+ get_memcfg_numa_flat();
+}
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
+extern void resume_map_numa_kva(pgd_t *pgd);
+
+#else /* !CONFIG_NUMA */
+
+#define get_memcfg_numa get_memcfg_numa_flat
+
+static inline void resume_map_numa_kva(pgd_t *pgd) {}
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DISCONTIGMEM
+
+/*
+ * generic node memory support, the following assumptions apply:
+ *
+ * 1) memory comes in 64Mb contigious chunks which are either present or not
+ * 2) we will not have more than 64Gb in total
+ *
+ * for now assume that 64Gb is max amount of RAM for whole system
+ * 64Gb / 4096bytes/page = 16777216 pages
+ */
+#define MAX_NR_PAGES 16777216
+#define MAX_ELEMENTS 1024
+#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+
+extern s8 physnode_map[];
+
+static inline int pfn_to_nid(unsigned long pfn)
+{
+#ifdef CONFIG_NUMA
+ return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Following are macros that each numa implmentation must define.
+ */
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) \
+({ \
+ pg_data_t *__pgdat = NODE_DATA(nid); \
+ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
+})
+
+static inline int pfn_valid(int pfn)
+{
+ int nid = pfn_to_nid(pfn);
+
+ if (nid >= 0)
+ return (pfn < node_end_pfn(nid));
+ return 0;
+}
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/*
+ * Following are macros that are specific to this numa platform.
+ */
+#define reserve_bootmem(addr, size, flags) \
+ reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
+#define alloc_bootmem(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_nopanic(x) \
+ __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
+ __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_pages(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_nopanic(x) \
+ __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
+ __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low_pages(x) \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
+#define alloc_bootmem_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
+ __pa(MAX_DMA_ADDRESS)); \
+})
+#define alloc_bootmem_pages_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
+ __pa(MAX_DMA_ADDRESS)); \
+})
+#define alloc_bootmem_low_pages_node(pgdat, x) \
+({ \
+ struct pglist_data __maybe_unused \
+ *__alloc_bootmem_node__pgdat = (pgdat); \
+ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
+})
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
+#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
new file mode 100644
index 000000000000..a5b3817d4b9e
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -0,0 +1,51 @@
+/* K8 NUMA support */
+/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
+/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
+#ifndef _ASM_X86_MMZONE_64_H
+#define _ASM_X86_MMZONE_64_H
+
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmdebug.h>
+
+#include <asm/smp.h>
+
+/* Simple perfect hash to map physical addresses to node numbers */
+struct memnode {
+ int shift;
+ unsigned int mapsize;
+ s16 *map;
+ s16 embedded_map[64 - 8];
+} ____cacheline_aligned; /* total size = 128 bytes */
+extern struct memnode memnode;
+#define memnode_shift memnode.shift
+#define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
+
+extern struct pglist_data *node_data[];
+
+static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
+{
+ unsigned nid;
+ VIRTUAL_BUG_ON(!memnodemap);
+ nid = memnodemap[addr >> memnode_shift];
+ VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
+ return nid;
+}
+
+#define NODE_DATA(nid) (node_data[nid])
+
+#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+ NODE_DATA(nid)->node_spanned_pages)
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+#endif
+
+#endif
+#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
new file mode 100644
index 000000000000..47d62743c4d5
--- /dev/null
+++ b/arch/x86/include/asm/module.h
@@ -0,0 +1,80 @@
+#ifndef _ASM_X86_MODULE_H
+#define _ASM_X86_MODULE_H
+
+/* x86_32/64 are simple */
+struct mod_arch_specific {};
+
+#ifdef CONFIG_X86_32
+# define Elf_Shdr Elf32_Shdr
+# define Elf_Sym Elf32_Sym
+# define Elf_Ehdr Elf32_Ehdr
+#else
+# define Elf_Shdr Elf64_Shdr
+# define Elf_Sym Elf64_Sym
+# define Elf_Ehdr Elf64_Ehdr
+#endif
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M386
+#define MODULE_PROC_FAMILY "386 "
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MK8
+#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_X86_ELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_4KSTACKS
+# define MODULE_STACKSIZE "4KSTACKS "
+# else
+# define MODULE_STACKSIZE ""
+# endif
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+#endif
+
+#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
new file mode 100644
index 000000000000..91885c28f66b
--- /dev/null
+++ b/arch/x86/include/asm/mpspec.h
@@ -0,0 +1,145 @@
+#ifndef _ASM_X86_MPSPEC_H
+#define _ASM_X86_MPSPEC_H
+
+#include <linux/init.h>
+
+#include <asm/mpspec_def.h>
+
+extern int apic_version[MAX_APICS];
+
+#ifdef CONFIG_X86_32
+#include <mach_mpspec.h>
+
+extern unsigned int def_to_bigsmp;
+extern u8 apicid_2_node[];
+extern int pic_mode;
+
+#ifdef CONFIG_X86_NUMAQ
+extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+extern int mp_bus_id_to_local[MAX_MP_BUSSES];
+extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+#endif
+
+#define MAX_APICID 256
+
+#else
+
+#define MAX_MP_BUSSES 256
+/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
+
+#endif
+
+extern void early_find_smp_config(void);
+extern void early_get_smp_config(void);
+
+#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
+extern int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int max_physical_apicid;
+extern int smp_found_config;
+extern int mpc_default_type;
+extern unsigned long mp_lapic_addr;
+
+extern void find_smp_config(void);
+extern void get_smp_config(void);
+#ifdef CONFIG_X86_MPPARSE
+extern void early_reserve_e820_mpc_new(void);
+#else
+static inline void early_reserve_e820_mpc_new(void) { }
+#endif
+
+void __cpuinit generic_processor_info(int apicid, int version);
+#ifdef CONFIG_ACPI
+extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
+extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi);
+extern void mp_config_acpi_legacy_irqs(void);
+extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+#ifdef CONFIG_X86_IO_APIC
+extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity);
+#else
+static inline int
+mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity)
+{
+ return 0;
+}
+#endif
+#endif /* CONFIG_ACPI */
+
+#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
+
+struct physid_mask {
+ unsigned long mask[PHYSID_ARRAY_SIZE];
+};
+
+typedef struct physid_mask physid_mask_t;
+
+#define physid_set(physid, map) set_bit(physid, (map).mask)
+#define physid_clear(physid, map) clear_bit(physid, (map).mask)
+#define physid_isset(physid, map) test_bit(physid, (map).mask)
+#define physid_test_and_set(physid, map) \
+ test_and_set_bit(physid, (map).mask)
+
+#define physids_and(dst, src1, src2) \
+ bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_or(dst, src1, src2) \
+ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_clear(map) \
+ bitmap_zero((map).mask, MAX_APICS)
+
+#define physids_complement(dst, src) \
+ bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+
+#define physids_empty(map) \
+ bitmap_empty((map).mask, MAX_APICS)
+
+#define physids_equal(map1, map2) \
+ bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+
+#define physids_weight(map) \
+ bitmap_weight((map).mask, MAX_APICS)
+
+#define physids_shift_right(d, s, n) \
+ bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_shift_left(d, s, n) \
+ bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_coerce(map) ((map).mask[0])
+
+#define physids_promote(physids) \
+ ({ \
+ physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
+ __physid_mask.mask[0] = physids; \
+ __physid_mask; \
+ })
+
+/* Note: will create very large stack frames if physid_mask_t is big */
+#define physid_mask_of_physid(physid) \
+ ({ \
+ physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
+ physid_set(physid, __physid_mask); \
+ __physid_mask; \
+ })
+
+static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
+{
+ physids_clear(*map);
+ physid_set(physid, *map);
+}
+
+#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
+#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
+
+extern physid_mask_t phys_cpu_present_map;
+
+#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
new file mode 100644
index 000000000000..e3ace7d1d35d
--- /dev/null
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -0,0 +1,180 @@
+#ifndef _ASM_X86_MPSPEC_DEF_H
+#define _ASM_X86_MPSPEC_DEF_H
+
+/*
+ * Structure definitions for SMP machines following the
+ * Intel Multiprocessing Specification 1.1 and 1.4.
+ */
+
+/*
+ * This tag identifies where the SMP configuration
+ * information is.
+ */
+
+#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
+
+#ifdef CONFIG_X86_32
+# define MAX_MPC_ENTRY 1024
+# define MAX_APICS 256
+#else
+# if NR_CPUS <= 255
+# define MAX_APICS 255
+# else
+# define MAX_APICS 32768
+# endif
+#endif
+
+struct intel_mp_floating {
+ char mpf_signature[4]; /* "_MP_" */
+ unsigned int mpf_physptr; /* Configuration table address */
+ unsigned char mpf_length; /* Our length (paragraphs) */
+ unsigned char mpf_specification;/* Specification version */
+ unsigned char mpf_checksum; /* Checksum (makes sum 0) */
+ unsigned char mpf_feature1; /* Standard or configuration ? */
+ unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
+ unsigned char mpf_feature3; /* Unused (0) */
+ unsigned char mpf_feature4; /* Unused (0) */
+ unsigned char mpf_feature5; /* Unused (0) */
+};
+
+#define MPC_SIGNATURE "PCMP"
+
+struct mp_config_table {
+ char mpc_signature[4];
+ unsigned short mpc_length; /* Size of table */
+ char mpc_spec; /* 0x01 */
+ char mpc_checksum;
+ char mpc_oem[8];
+ char mpc_productid[12];
+ unsigned int mpc_oemptr; /* 0 if not present */
+ unsigned short mpc_oemsize; /* 0 if not present */
+ unsigned short mpc_oemcount;
+ unsigned int mpc_lapic; /* APIC address */
+ unsigned int reserved;
+};
+
+/* Followed by entries */
+
+#define MP_PROCESSOR 0
+#define MP_BUS 1
+#define MP_IOAPIC 2
+#define MP_INTSRC 3
+#define MP_LINTSRC 4
+/* Used by IBM NUMA-Q to describe node locality */
+#define MP_TRANSLATION 192
+
+#define CPU_ENABLED 1 /* Processor is available */
+#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
+
+#define CPU_STEPPING_MASK 0x000F
+#define CPU_MODEL_MASK 0x00F0
+#define CPU_FAMILY_MASK 0x0F00
+
+struct mpc_config_processor {
+ unsigned char mpc_type;
+ unsigned char mpc_apicid; /* Local APIC number */
+ unsigned char mpc_apicver; /* Its versions */
+ unsigned char mpc_cpuflag;
+ unsigned int mpc_cpufeature;
+ unsigned int mpc_featureflag; /* CPUID feature value */
+ unsigned int mpc_reserved[2];
+};
+
+struct mpc_config_bus {
+ unsigned char mpc_type;
+ unsigned char mpc_busid;
+ unsigned char mpc_bustype[6];
+};
+
+/* List of Bus Type string values, Intel MP Spec. */
+#define BUSTYPE_EISA "EISA"
+#define BUSTYPE_ISA "ISA"
+#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
+#define BUSTYPE_MCA "MCA"
+#define BUSTYPE_VL "VL" /* Local bus */
+#define BUSTYPE_PCI "PCI"
+#define BUSTYPE_PCMCIA "PCMCIA"
+#define BUSTYPE_CBUS "CBUS"
+#define BUSTYPE_CBUSII "CBUSII"
+#define BUSTYPE_FUTURE "FUTURE"
+#define BUSTYPE_MBI "MBI"
+#define BUSTYPE_MBII "MBII"
+#define BUSTYPE_MPI "MPI"
+#define BUSTYPE_MPSA "MPSA"
+#define BUSTYPE_NUBUS "NUBUS"
+#define BUSTYPE_TC "TC"
+#define BUSTYPE_VME "VME"
+#define BUSTYPE_XPRESS "XPRESS"
+
+#define MPC_APIC_USABLE 0x01
+
+struct mpc_config_ioapic {
+ unsigned char mpc_type;
+ unsigned char mpc_apicid;
+ unsigned char mpc_apicver;
+ unsigned char mpc_flags;
+ unsigned int mpc_apicaddr;
+};
+
+struct mpc_config_intsrc {
+ unsigned char mpc_type;
+ unsigned char mpc_irqtype;
+ unsigned short mpc_irqflag;
+ unsigned char mpc_srcbus;
+ unsigned char mpc_srcbusirq;
+ unsigned char mpc_dstapic;
+ unsigned char mpc_dstirq;
+};
+
+enum mp_irq_source_types {
+ mp_INT = 0,
+ mp_NMI = 1,
+ mp_SMI = 2,
+ mp_ExtINT = 3
+};
+
+#define MP_IRQDIR_DEFAULT 0
+#define MP_IRQDIR_HIGH 1
+#define MP_IRQDIR_LOW 3
+
+#define MP_APIC_ALL 0xFF
+
+struct mpc_config_lintsrc {
+ unsigned char mpc_type;
+ unsigned char mpc_irqtype;
+ unsigned short mpc_irqflag;
+ unsigned char mpc_srcbusid;
+ unsigned char mpc_srcbusirq;
+ unsigned char mpc_destapic;
+ unsigned char mpc_destapiclint;
+};
+
+#define MPC_OEM_SIGNATURE "_OEM"
+
+struct mp_config_oemtable {
+ char oem_signature[4];
+ unsigned short oem_length; /* Size of table */
+ char oem_rev; /* 0x01 */
+ char oem_checksum;
+ char mpc_oem[8];
+};
+
+/*
+ * Default configurations
+ *
+ * 1 2 CPU ISA 82489DX
+ * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
+ * 3 2 CPU EISA 82489DX
+ * 4 2 CPU MCA 82489DX
+ * 5 2 CPU ISA+PCI
+ * 6 2 CPU EISA+PCI
+ * 7 2 CPU MCA+PCI
+ */
+
+enum mp_bustype {
+ MP_BUS_ISA = 1,
+ MP_BUS_EISA,
+ MP_BUS_PCI,
+ MP_BUS_MCA,
+};
+#endif /* _ASM_X86_MPSPEC_DEF_H */
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
new file mode 100644
index 000000000000..7e4e9481f51c
--- /dev/null
+++ b/arch/x86/include/asm/msgbuf.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_MSGBUF_H
+#define _ASM_X86_MSGBUF_H
+
+/*
+ * The msqid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space on i386 is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ *
+ * Pad space on x8664 is left for:
+ * - 2 miscellaneous 64-bit values
+ */
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ __kernel_time_t msg_stime; /* last msgsnd time */
+#ifdef __i386__
+ unsigned long __unused1;
+#endif
+ __kernel_time_t msg_rtime; /* last msgrcv time */
+#ifdef __i386__
+ unsigned long __unused2;
+#endif
+ __kernel_time_t msg_ctime; /* last change time */
+#ifdef __i386__
+ unsigned long __unused3;
+#endif
+ unsigned long msg_cbytes; /* current number of bytes on queue */
+ unsigned long msg_qnum; /* number of messages in queue */
+ unsigned long msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
new file mode 100644
index 000000000000..6706b3006f13
--- /dev/null
+++ b/arch/x86/include/asm/msidef.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_MSIDEF_H
+#define _ASM_X86_MSIDEF_H
+
+/*
+ * Constants for Intel APIC based MSI messages.
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT 0
+#define MSI_DATA_VECTOR_MASK 0x000000ff
+#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
+ MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT 8
+#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT 14
+#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
+#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT 15
+#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
+#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI 0
+#define MSI_ADDR_BASE_LO 0xfee00000
+
+#define MSI_ADDR_DEST_MODE_SHIFT 2
+#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
+#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT 3
+#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* dedicated cpu */
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* lowest priority */
+
+#define MSI_ADDR_DEST_ID_SHIFT 12
+#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
+#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
+ MSI_ADDR_DEST_ID_MASK)
+
+#define MSI_ADDR_IR_EXT_INT (1 << 4)
+#define MSI_ADDR_IR_SHV (1 << 3)
+#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
+#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
+#endif /* _ASM_X86_MSIDEF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
new file mode 100644
index 000000000000..cb58643947b9
--- /dev/null
+++ b/arch/x86/include/asm/msr-index.h
@@ -0,0 +1,334 @@
+#ifndef _ASM_X86_MSR_INDEX_H
+#define _ASM_X86_MSR_INDEX_H
+
+/* CPU model specific register (MSR) numbers */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER 0xc0000080 /* extended feature register */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+
+/* EFER bits: */
+#define _EFER_SCE 0 /* SYSCALL/SYSRET */
+#define _EFER_LME 8 /* Long mode enable */
+#define _EFER_LMA 10 /* Long mode active (read-only) */
+#define _EFER_NX 11 /* No execute enable */
+
+#define EFER_SCE (1<<_EFER_SCE)
+#define EFER_LME (1<<_EFER_LME)
+#define EFER_LMA (1<<_EFER_LMA)
+#define EFER_NX (1<<_EFER_NX)
+
+/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_PERFCTR0 0x000000c1
+#define MSR_IA32_PERFCTR1 0x000000c2
+#define MSR_FSB_FREQ 0x000000cd
+
+#define MSR_MTRRcap 0x000000fe
+#define MSR_IA32_BBL_CR_CTL 0x00000119
+
+#define MSR_IA32_SYSENTER_CS 0x00000174
+#define MSR_IA32_SYSENTER_ESP 0x00000175
+#define MSR_IA32_SYSENTER_EIP 0x00000176
+
+#define MSR_IA32_MCG_CAP 0x00000179
+#define MSR_IA32_MCG_STATUS 0x0000017a
+#define MSR_IA32_MCG_CTL 0x0000017b
+
+#define MSR_IA32_PEBS_ENABLE 0x000003f1
+#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+
+#define MSR_MTRRfix64K_00000 0x00000250
+#define MSR_MTRRfix16K_80000 0x00000258
+#define MSR_MTRRfix16K_A0000 0x00000259
+#define MSR_MTRRfix4K_C0000 0x00000268
+#define MSR_MTRRfix4K_C8000 0x00000269
+#define MSR_MTRRfix4K_D0000 0x0000026a
+#define MSR_MTRRfix4K_D8000 0x0000026b
+#define MSR_MTRRfix4K_E0000 0x0000026c
+#define MSR_MTRRfix4K_E8000 0x0000026d
+#define MSR_MTRRfix4K_F0000 0x0000026e
+#define MSR_MTRRfix4K_F8000 0x0000026f
+#define MSR_MTRRdefType 0x000002ff
+
+#define MSR_IA32_CR_PAT 0x00000277
+
+#define MSR_IA32_DEBUGCTLMSR 0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
+#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
+#define MSR_IA32_LASTINTFROMIP 0x000001dd
+#define MSR_IA32_LASTINTTOIP 0x000001de
+
+/* DEBUGCTLMSR bits (others vary by model): */
+#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
+#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
+
+#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
+#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
+
+#define MSR_IA32_MC0_CTL 0x00000400
+#define MSR_IA32_MC0_STATUS 0x00000401
+#define MSR_IA32_MC0_ADDR 0x00000402
+#define MSR_IA32_MC0_MISC 0x00000403
+
+#define MSR_P6_PERFCTR0 0x000000c1
+#define MSR_P6_PERFCTR1 0x000000c2
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+/* AMD64 MSRs. Not complete. See the architecture manual for a more
+ complete list. */
+
+#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_PATCH_LOADER 0xc0010020
+#define MSR_AMD64_IBSFETCHCTL 0xc0011030
+#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
+#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSOPCTL 0xc0011033
+#define MSR_AMD64_IBSOPRIP 0xc0011034
+#define MSR_AMD64_IBSOPDATA 0xc0011035
+#define MSR_AMD64_IBSOPDATA2 0xc0011036
+#define MSR_AMD64_IBSOPDATA3 0xc0011037
+#define MSR_AMD64_IBSDCLINAD 0xc0011038
+#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSCTL 0xc001103a
+
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+
+/* K8 MSRs */
+#define MSR_K8_TOP_MEM1 0xc001001a
+#define MSR_K8_TOP_MEM2 0xc001001d
+#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_K8_HWCR 0xc0010015
+#define MSR_K8_INT_PENDING_MSG 0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+#define MSR_K8_TSEG_ADDR 0xc0010112
+#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
+#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
+#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
+
+/* K7 MSRs */
+#define MSR_K7_EVNTSEL0 0xc0010000
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_EVNTSEL1 0xc0010001
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_EVNTSEL2 0xc0010002
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_EVNTSEL3 0xc0010003
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_K7_CLK_CTL 0xc001001b
+#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_FID_VID_CTL 0xc0010041
+#define MSR_K7_FID_VID_STATUS 0xc0010042
+
+/* K6 MSRs */
+#define MSR_K6_EFER 0xc0000080
+#define MSR_K6_STAR 0xc0000081
+#define MSR_K6_WHCR 0xc0000082
+#define MSR_K6_UWCCR 0xc0000085
+#define MSR_K6_EPMR 0xc0000086
+#define MSR_K6_PSOR 0xc0000087
+#define MSR_K6_PFIR 0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1 0x00000107
+#define MSR_IDT_FCR2 0x00000108
+#define MSR_IDT_FCR3 0x00000109
+#define MSR_IDT_FCR4 0x0000010a
+
+#define MSR_IDT_MCR0 0x00000110
+#define MSR_IDT_MCR1 0x00000111
+#define MSR_IDT_MCR2 0x00000112
+#define MSR_IDT_MCR3 0x00000113
+#define MSR_IDT_MCR4 0x00000114
+#define MSR_IDT_MCR5 0x00000115
+#define MSR_IDT_MCR6 0x00000116
+#define MSR_IDT_MCR7 0x00000117
+#define MSR_IDT_MCR_CTRL 0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR 0x00001107
+#define MSR_VIA_LONGHAUL 0x0000110a
+#define MSR_VIA_RNG 0x0000110b
+#define MSR_VIA_BCR2 0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL 0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
+#define MSR_TMTA_LRTI_READOUT 0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR 0x00000000
+#define MSR_IA32_P5_MC_TYPE 0x00000001
+#define MSR_IA32_TSC 0x00000010
+#define MSR_IA32_PLATFORM_ID 0x00000017
+#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+
+#define FEATURE_CONTROL_LOCKED (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
+
+#define MSR_IA32_APICBASE 0x0000001b
+#define MSR_IA32_APICBASE_BSP (1<<8)
+#define MSR_IA32_APICBASE_ENABLE (1<<11)
+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
+
+#define MSR_IA32_UCODE_WRITE 0x00000079
+#define MSR_IA32_UCODE_REV 0x0000008b
+
+#define MSR_IA32_PERF_STATUS 0x00000198
+#define MSR_IA32_PERF_CTL 0x00000199
+
+#define MSR_IA32_MPERF 0x000000e7
+#define MSR_IA32_APERF 0x000000e8
+
+#define MSR_IA32_THERM_CONTROL 0x0000019a
+#define MSR_IA32_THERM_INTERRUPT 0x0000019b
+#define MSR_IA32_THERM_STATUS 0x0000019c
+#define MSR_IA32_MISC_ENABLE 0x000001a0
+
+/* Intel Model 6 */
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX 0x00000180
+#define MSR_IA32_MCG_EBX 0x00000181
+#define MSR_IA32_MCG_ECX 0x00000182
+#define MSR_IA32_MCG_EDX 0x00000183
+#define MSR_IA32_MCG_ESI 0x00000184
+#define MSR_IA32_MCG_EDI 0x00000185
+#define MSR_IA32_MCG_EBP 0x00000186
+#define MSR_IA32_MCG_ESP 0x00000187
+#define MSR_IA32_MCG_EFLAGS 0x00000188
+#define MSR_IA32_MCG_EIP 0x00000189
+#define MSR_IA32_MCG_RESERVED 0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0 0x00000300
+#define MSR_P4_BPU_PERFCTR1 0x00000301
+#define MSR_P4_BPU_PERFCTR2 0x00000302
+#define MSR_P4_BPU_PERFCTR3 0x00000303
+#define MSR_P4_MS_PERFCTR0 0x00000304
+#define MSR_P4_MS_PERFCTR1 0x00000305
+#define MSR_P4_MS_PERFCTR2 0x00000306
+#define MSR_P4_MS_PERFCTR3 0x00000307
+#define MSR_P4_FLAME_PERFCTR0 0x00000308
+#define MSR_P4_FLAME_PERFCTR1 0x00000309
+#define MSR_P4_FLAME_PERFCTR2 0x0000030a
+#define MSR_P4_FLAME_PERFCTR3 0x0000030b
+#define MSR_P4_IQ_PERFCTR0 0x0000030c
+#define MSR_P4_IQ_PERFCTR1 0x0000030d
+#define MSR_P4_IQ_PERFCTR2 0x0000030e
+#define MSR_P4_IQ_PERFCTR3 0x0000030f
+#define MSR_P4_IQ_PERFCTR4 0x00000310
+#define MSR_P4_IQ_PERFCTR5 0x00000311
+#define MSR_P4_BPU_CCCR0 0x00000360
+#define MSR_P4_BPU_CCCR1 0x00000361
+#define MSR_P4_BPU_CCCR2 0x00000362
+#define MSR_P4_BPU_CCCR3 0x00000363
+#define MSR_P4_MS_CCCR0 0x00000364
+#define MSR_P4_MS_CCCR1 0x00000365
+#define MSR_P4_MS_CCCR2 0x00000366
+#define MSR_P4_MS_CCCR3 0x00000367
+#define MSR_P4_FLAME_CCCR0 0x00000368
+#define MSR_P4_FLAME_CCCR1 0x00000369
+#define MSR_P4_FLAME_CCCR2 0x0000036a
+#define MSR_P4_FLAME_CCCR3 0x0000036b
+#define MSR_P4_IQ_CCCR0 0x0000036c
+#define MSR_P4_IQ_CCCR1 0x0000036d
+#define MSR_P4_IQ_CCCR2 0x0000036e
+#define MSR_P4_IQ_CCCR3 0x0000036f
+#define MSR_P4_IQ_CCCR4 0x00000370
+#define MSR_P4_IQ_CCCR5 0x00000371
+#define MSR_P4_ALF_ESCR0 0x000003ca
+#define MSR_P4_ALF_ESCR1 0x000003cb
+#define MSR_P4_BPU_ESCR0 0x000003b2
+#define MSR_P4_BPU_ESCR1 0x000003b3
+#define MSR_P4_BSU_ESCR0 0x000003a0
+#define MSR_P4_BSU_ESCR1 0x000003a1
+#define MSR_P4_CRU_ESCR0 0x000003b8
+#define MSR_P4_CRU_ESCR1 0x000003b9
+#define MSR_P4_CRU_ESCR2 0x000003cc
+#define MSR_P4_CRU_ESCR3 0x000003cd
+#define MSR_P4_CRU_ESCR4 0x000003e0
+#define MSR_P4_CRU_ESCR5 0x000003e1
+#define MSR_P4_DAC_ESCR0 0x000003a8
+#define MSR_P4_DAC_ESCR1 0x000003a9
+#define MSR_P4_FIRM_ESCR0 0x000003a4
+#define MSR_P4_FIRM_ESCR1 0x000003a5
+#define MSR_P4_FLAME_ESCR0 0x000003a6
+#define MSR_P4_FLAME_ESCR1 0x000003a7
+#define MSR_P4_FSB_ESCR0 0x000003a2
+#define MSR_P4_FSB_ESCR1 0x000003a3
+#define MSR_P4_IQ_ESCR0 0x000003ba
+#define MSR_P4_IQ_ESCR1 0x000003bb
+#define MSR_P4_IS_ESCR0 0x000003b4
+#define MSR_P4_IS_ESCR1 0x000003b5
+#define MSR_P4_ITLB_ESCR0 0x000003b6
+#define MSR_P4_ITLB_ESCR1 0x000003b7
+#define MSR_P4_IX_ESCR0 0x000003c8
+#define MSR_P4_IX_ESCR1 0x000003c9
+#define MSR_P4_MOB_ESCR0 0x000003aa
+#define MSR_P4_MOB_ESCR1 0x000003ab
+#define MSR_P4_MS_ESCR0 0x000003c0
+#define MSR_P4_MS_ESCR1 0x000003c1
+#define MSR_P4_PMH_ESCR0 0x000003ac
+#define MSR_P4_PMH_ESCR1 0x000003ad
+#define MSR_P4_RAT_ESCR0 0x000003bc
+#define MSR_P4_RAT_ESCR1 0x000003bd
+#define MSR_P4_SAAT_ESCR0 0x000003ae
+#define MSR_P4_SAAT_ESCR1 0x000003af
+#define MSR_P4_SSU_ESCR0 0x000003be
+#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0 0x000003c2
+#define MSR_P4_TBPU_ESCR1 0x000003c3
+#define MSR_P4_TC_ESCR0 0x000003c4
+#define MSR_P4_TC_ESCR1 0x000003c5
+#define MSR_P4_U2L_ESCR0 0x000003b0
+#define MSR_P4_U2L_ESCR1 0x000003b1
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0 0x00001900
+
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC 0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
+#define MSR_IA32_VMX_MISC 0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+
+#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
new file mode 100644
index 000000000000..638bf6241807
--- /dev/null
+++ b/arch/x86/include/asm/msr.h
@@ -0,0 +1,246 @@
+#ifndef _ASM_X86_MSR_H
+#define _ASM_X86_MSR_H
+
+#include <asm/msr-index.h>
+
+#ifndef __ASSEMBLY__
+# include <linux/types.h>
+#endif
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+
+static inline unsigned long long native_read_tscp(unsigned int *aux)
+{
+ unsigned long low, high;
+ asm volatile(".byte 0x0f,0x01,0xf9"
+ : "=a" (low), "=d" (high), "=c" (*aux));
+ return low | ((u64)high << 32);
+}
+
+/*
+ * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
+ * constraint has different meanings. For i386, "A" means exactly
+ * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
+ * it means rax *or* rdx.
+ */
+#ifdef CONFIG_X86_64
+#define DECLARE_ARGS(val, low, high) unsigned low, high
+#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
+#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
+#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
+#else
+#define DECLARE_ARGS(val, low, high) unsigned long long val
+#define EAX_EDX_VAL(val, low, high) (val)
+#define EAX_EDX_ARGS(val, low, high) "A" (val)
+#define EAX_EDX_RET(val, low, high) "=A" (val)
+#endif
+
+static inline unsigned long long native_read_msr(unsigned int msr)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_msr_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), [fault] "i" (-EFAULT));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("2: rdmsr ; xor %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline void native_write_msr(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+/* Can be uninlined because referenced by paravirt */
+notrace static inline int native_write_msr_safe(unsigned int msr,
+ unsigned low, unsigned high)
+{
+ int err;
+ asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=a" (err)
+ : "c" (msr), "0" (low), "d" (high),
+ [fault] "i" (-EFAULT)
+ : "memory");
+ return err;
+}
+
+extern unsigned long long native_read_tsc(void);
+
+static __always_inline unsigned long long __native_read_tsc(void)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static inline unsigned long long native_read_pmc(int counter)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ return EAX_EDX_VAL(val, low, high);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#include <linux/errno.h>
+/*
+ * Access to machine-specific registers (available on 586 and better only)
+ * Note: the rd* operations modify the parameters directly (without using
+ * pointer indirection), this allows gcc to optimize better
+ */
+
+#define rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (val1) = (u32)__val; \
+ (val2) = (u32)(__val >> 32); \
+} while (0)
+
+static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
+{
+ native_write_msr(msr, low, high);
+}
+
+#define rdmsrl(msr, val) \
+ ((val) = native_read_msr((msr)))
+
+#define wrmsrl(msr, val) \
+ native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+
+/* wrmsr with exception handling */
+static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
+{
+ return native_write_msr_safe(msr, low, high);
+}
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, p1, p2) \
+({ \
+ int __err; \
+ u64 __val = native_read_msr_safe((msr), &__err); \
+ (*p1) = (u32)__val; \
+ (*p2) = (u32)(__val >> 32); \
+ __err; \
+})
+
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = native_read_msr_safe(msr, &err);
+ return err;
+}
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = native_read_msr_amd_safe(msr, &err);
+ return err;
+}
+
+#define rdtscl(low) \
+ ((low) = (u32)__native_read_tsc())
+
+#define rdtscll(val) \
+ ((val) = __native_read_tsc())
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = native_read_pmc((counter)); \
+ (low) = (u32)_l; \
+ (high) = (u32)(_l >> 32); \
+} while (0)
+
+#define rdtscp(low, high, aux) \
+do { \
+ unsigned long long _val = native_read_tscp(&(aux)); \
+ (low) = (u32)_val; \
+ (high) = (u32)(_val >> 32); \
+} while (0)
+
+#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
+
+#endif /* !CONFIG_PARAVIRT */
+
+
+#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
+ (u32)((val) >> 32))
+
+#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
+
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
+#ifdef CONFIG_SMP
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+#else /* CONFIG_SMP */
+static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ rdmsr(msr_no, *l, *h);
+ return 0;
+}
+static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ wrmsr(msr_no, l, h);
+ return 0;
+}
+static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
+ u32 *l, u32 *h)
+{
+ return rdmsr_safe(msr_no, l, h);
+}
+static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ return wrmsr_safe(msr_no, l, h);
+}
+#endif /* CONFIG_SMP */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+
+#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
new file mode 100644
index 000000000000..7c1e4258b31e
--- /dev/null
+++ b/arch/x86/include/asm/mtrr.h
@@ -0,0 +1,173 @@
+/* Generic MTRR (Memory Type Range Register) ioctls.
+
+ Copyright (C) 1997-1999 Richard Gooch
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+#ifndef _ASM_X86_MTRR_H
+#define _ASM_X86_MTRR_H
+
+#include <linux/ioctl.h>
+#include <linux/errno.h>
+
+#define MTRR_IOCTL_BASE 'M'
+
+struct mtrr_sentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+/* Warning: this structure has a different order from i386
+ on x86-64. The 32bit emulation code takes care of that.
+ But you need to use this for 64bit, otherwise your X server
+ will break. */
+
+#ifdef __i386__
+struct mtrr_gentry {
+ unsigned int regnum; /* Register number */
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+#else /* __i386__ */
+
+struct mtrr_gentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int regnum; /* Register number */
+ unsigned int type; /* Type of region */
+};
+#endif /* !__i386__ */
+
+/* These are the various ioctls */
+#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
+#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
+#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
+#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
+#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
+#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
+#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
+#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
+#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
+#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
+
+/* These are the region types */
+#define MTRR_TYPE_UNCACHABLE 0
+#define MTRR_TYPE_WRCOMB 1
+/*#define MTRR_TYPE_ 2*/
+/*#define MTRR_TYPE_ 3*/
+#define MTRR_TYPE_WRTHROUGH 4
+#define MTRR_TYPE_WRPROT 5
+#define MTRR_TYPE_WRBACK 6
+#define MTRR_NUM_TYPES 7
+
+#ifdef __KERNEL__
+
+/* The following functions are for use by other drivers */
+# ifdef CONFIG_MTRR
+extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern void mtrr_save_fixed_ranges(void *);
+extern void mtrr_save_state(void);
+extern int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_del(int reg, unsigned long base, unsigned long size);
+extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
+extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
+extern void mtrr_ap_init(void);
+extern void mtrr_bp_init(void);
+extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
+extern int amd_special_default_mtrr(void);
+# else
+static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+{
+ /*
+ * Return no-MTRRs:
+ */
+ return 0xff;
+}
+#define mtrr_save_fixed_ranges(arg) do {} while (0)
+#define mtrr_save_state() do {} while (0)
+static inline int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ return 0;
+}
+static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
+{
+}
+
+#define mtrr_ap_init() do {} while (0)
+#define mtrr_bp_init() do {} while (0)
+# endif
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct mtrr_sentry32 {
+ compat_ulong_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+struct mtrr_gentry32 {
+ compat_ulong_t regnum; /* Register number */
+ compat_uint_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+#define MTRR_IOCTL_BASE 'M'
+
+#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
+#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
+#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
+#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
+#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
+#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_PAGE_ENTRY \
+ _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
+#endif /* CONFIG_COMPAT */
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_MTRR_H */
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
new file mode 100644
index 000000000000..a731b9c573a6
--- /dev/null
+++ b/arch/x86/include/asm/mutex.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "mutex_32.h"
+#else
+# include "mutex_64.h"
+#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
new file mode 100644
index 000000000000..03f90c8a5a7c
--- /dev/null
+++ b/arch/x86/include/asm/mutex_32.h
@@ -0,0 +1,125 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_X86_MUTEX_32_H
+#define _ASM_X86_MUTEX_32_H
+
+#include <asm/alternative.h>
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fn> if it
+ * wasn't 1 originally. This function MUST leave the value lower than 1
+ * even when the "1" assertion wasn't true.
+ */
+#define __mutex_fastpath_lock(count, fail_fn) \
+do { \
+ unsigned int dummy; \
+ \
+ typecheck(atomic_t *, count); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
+ " jns 1f \n" \
+ " call " #fail_fn "\n" \
+ "1:\n" \
+ : "=a" (dummy) \
+ : "a" (count) \
+ : "memory", "ecx", "edx"); \
+} while (0)
+
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
+ * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int __mutex_fastpath_lock_retval(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (unlikely(atomic_dec_return(count) < 0))
+ return fail_fn(count);
+ else
+ return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value
+ * to 1, or to set it to a value lower than 1.
+ *
+ * If the implementation sets it to a value of lower than 1, the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+#define __mutex_fastpath_unlock(count, fail_fn) \
+do { \
+ unsigned int dummy; \
+ \
+ typecheck(atomic_t *, count); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
+ " jg 1f\n" \
+ " call " #fail_fn "\n" \
+ "1:\n" \
+ : "=a" (dummy) \
+ : "a" (count) \
+ : "memory", "ecx", "edx"); \
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
+ */
+static inline int __mutex_fastpath_trylock(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ /*
+ * We have two variants here. The cmpxchg based one is the best one
+ * because it never induce a false contention state. It is included
+ * here because architectures using the inc/dec algorithms over the
+ * xchg ones are much more likely to support cmpxchg natively.
+ *
+ * If not we fall back to the spinlock based variant - that is
+ * just as efficient (and simpler) as a 'destructive' probing of
+ * the mutex state would be.
+ */
+#ifdef __HAVE_ARCH_CMPXCHG
+ if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ return 1;
+ return 0;
+#else
+ return fail_fn(count);
+#endif
+}
+
+#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
new file mode 100644
index 000000000000..68a87b0f8e29
--- /dev/null
+++ b/arch/x86/include/asm/mutex_64.h
@@ -0,0 +1,100 @@
+/*
+ * Assembly implementation of the mutex fastpath, based on atomic
+ * decrement/increment.
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_X86_MUTEX_64_H
+#define _ASM_X86_MUTEX_64_H
+
+/**
+ * __mutex_fastpath_lock - decrement and call function if negative
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is negative
+ *
+ * Atomically decrements @v and calls <fail_fn> if the result is negative.
+ */
+#define __mutex_fastpath_lock(v, fail_fn) \
+do { \
+ unsigned long dummy; \
+ \
+ typecheck(atomic_t *, v); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
+ " jns 1f \n" \
+ " call " #fail_fn "\n" \
+ "1:" \
+ : "=D" (dummy) \
+ : "D" (v) \
+ : "rax", "rsi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "memory"); \
+} while (0)
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns
+ */
+static inline int __mutex_fastpath_lock_retval(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (unlikely(atomic_dec_return(count) < 0))
+ return fail_fn(count);
+ else
+ return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - increment and call function if nonpositive
+ * @v: pointer of type atomic_t
+ * @fail_fn: function to call if the result is nonpositive
+ *
+ * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
+ */
+#define __mutex_fastpath_unlock(v, fail_fn) \
+do { \
+ unsigned long dummy; \
+ \
+ typecheck(atomic_t *, v); \
+ typecheck_fn(void (*)(atomic_t *), fail_fn); \
+ \
+ asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
+ " jg 1f\n" \
+ " call " #fail_fn "\n" \
+ "1:" \
+ : "=D" (dummy) \
+ : "D" (v) \
+ : "rax", "rsi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "memory"); \
+} while (0)
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
+ *
+ * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
+ * if it wasn't 1 originally. [the fallback function is never used on
+ * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
+ */
+static inline int __mutex_fastpath_trylock(atomic_t *count,
+ int (*fail_fn)(atomic_t *))
+{
+ if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ return 1;
+ else
+ return 0;
+}
+
+#endif /* _ASM_X86_MUTEX_64_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
new file mode 100644
index 000000000000..c45a0a568dff
--- /dev/null
+++ b/arch/x86/include/asm/nmi.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_X86_NMI_H
+#define _ASM_X86_NMI_H
+
+#include <linux/pm.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+
+/**
+ * do_nmi_callback
+ *
+ * Check to see if a callback exists and execute it. Return 1
+ * if the handler exists and was handled successfully.
+ */
+int do_nmi_callback(struct pt_regs *regs, int cpu);
+
+extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
+extern int check_nmi_watchdog(void);
+extern int nmi_watchdog_enabled;
+extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
+extern int avail_to_resrv_perfctr_nmi(unsigned int);
+extern int reserve_perfctr_nmi(unsigned int);
+extern void release_perfctr_nmi(unsigned int);
+extern int reserve_evntsel_nmi(unsigned int);
+extern void release_evntsel_nmi(unsigned int);
+
+extern void setup_apic_nmi_watchdog(void *);
+extern void stop_apic_nmi_watchdog(void *);
+extern void disable_timer_nmi_watchdog(void);
+extern void enable_timer_nmi_watchdog(void);
+extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
+extern void cpu_nmi_set_wd_enabled(void);
+
+extern atomic_t nmi_active;
+extern unsigned int nmi_watchdog;
+#define NMI_NONE 0
+#define NMI_IO_APIC 1
+#define NMI_LOCAL_APIC 2
+#define NMI_INVALID 3
+
+struct ctl_table;
+struct file;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+ void __user *, size_t *, loff_t *);
+extern int unknown_nmi_panic;
+
+void __trigger_all_cpu_backtrace(void);
+#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+
+static inline void localise_nmi_watchdog(void)
+{
+ if (nmi_watchdog == NMI_IO_APIC)
+ nmi_watchdog = NMI_LOCAL_APIC;
+}
+
+/* check if nmi_watchdog is active (ie was specified at boot) */
+static inline int nmi_watchdog_active(void)
+{
+ /*
+ * actually it should be:
+ * return (nmi_watchdog == NMI_LOCAL_APIC ||
+ * nmi_watchdog == NMI_IO_APIC)
+ * but since they are power of two we could use a
+ * cheaper way --cvg
+ */
+ return nmi_watchdog & 0x3;
+}
+#endif
+
+void lapic_watchdog_stop(void);
+int lapic_watchdog_init(unsigned nmi_hz);
+int lapic_wd_event(unsigned nmi_hz);
+unsigned lapic_adjust_nmi_hz(unsigned hz);
+int lapic_watchdog_ok(void);
+void disable_lapic_nmi_watchdog(void);
+void enable_lapic_nmi_watchdog(void);
+void stop_nmi(void);
+void restart_nmi(void);
+
+#endif /* _ASM_X86_NMI_H */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
new file mode 100644
index 000000000000..ad2668ee1aa7
--- /dev/null
+++ b/arch/x86/include/asm/nops.h
@@ -0,0 +1,118 @@
+#ifndef _ASM_X86_NOPS_H
+#define _ASM_X86_NOPS_H
+
+/* Define nops for use with alternative() */
+
+/* generic versions from gas
+ 1: nop
+ the following instructions are NOT nops in 64-bit mode,
+ for 64-bit mode use K8 or P6 nops instead
+ 2: movl %esi,%esi
+ 3: leal 0x00(%esi),%esi
+ 4: leal 0x00(,%esi,1),%esi
+ 6: leal 0x00000000(%esi),%esi
+ 7: leal 0x00000000(,%esi,1),%esi
+*/
+#define GENERIC_NOP1 ".byte 0x90\n"
+#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
+#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
+#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
+#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
+#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+
+/* Opteron 64bit nops
+ 1: nop
+ 2: osp nop
+ 3: osp osp nop
+ 4: osp osp osp nop
+*/
+#define K8_NOP1 GENERIC_NOP1
+#define K8_NOP2 ".byte 0x66,0x90\n"
+#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
+#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
+#define K8_NOP5 K8_NOP3 K8_NOP2
+#define K8_NOP6 K8_NOP3 K8_NOP3
+#define K8_NOP7 K8_NOP4 K8_NOP3
+#define K8_NOP8 K8_NOP4 K8_NOP4
+
+/* K7 nops
+ uses eax dependencies (arbitary choice)
+ 1: nop
+ 2: movl %eax,%eax
+ 3: leal (,%eax,1),%eax
+ 4: leal 0x00(,%eax,1),%eax
+ 6: leal 0x00000000(%eax),%eax
+ 7: leal 0x00000000(,%eax,1),%eax
+*/
+#define K7_NOP1 GENERIC_NOP1
+#define K7_NOP2 ".byte 0x8b,0xc0\n"
+#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
+#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
+#define K7_NOP5 K7_NOP4 ASM_NOP1
+#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
+#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
+#define K7_NOP8 K7_NOP7 ASM_NOP1
+
+/* P6 nops
+ uses eax dependencies (Intel-recommended choice)
+ 1: nop
+ 2: osp nop
+ 3: nopl (%eax)
+ 4: nopl 0x00(%eax)
+ 5: nopl 0x00(%eax,%eax,1)
+ 6: osp nopl 0x00(%eax,%eax,1)
+ 7: nopl 0x00000000(%eax)
+ 8: nopl 0x00000000(%eax,%eax,1)
+*/
+#define P6_NOP1 GENERIC_NOP1
+#define P6_NOP2 ".byte 0x66,0x90\n"
+#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
+#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
+#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
+#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+
+#if defined(CONFIG_MK7)
+#define ASM_NOP1 K7_NOP1
+#define ASM_NOP2 K7_NOP2
+#define ASM_NOP3 K7_NOP3
+#define ASM_NOP4 K7_NOP4
+#define ASM_NOP5 K7_NOP5
+#define ASM_NOP6 K7_NOP6
+#define ASM_NOP7 K7_NOP7
+#define ASM_NOP8 K7_NOP8
+#elif defined(CONFIG_X86_P6_NOP)
+#define ASM_NOP1 P6_NOP1
+#define ASM_NOP2 P6_NOP2
+#define ASM_NOP3 P6_NOP3
+#define ASM_NOP4 P6_NOP4
+#define ASM_NOP5 P6_NOP5
+#define ASM_NOP6 P6_NOP6
+#define ASM_NOP7 P6_NOP7
+#define ASM_NOP8 P6_NOP8
+#elif defined(CONFIG_X86_64)
+#define ASM_NOP1 K8_NOP1
+#define ASM_NOP2 K8_NOP2
+#define ASM_NOP3 K8_NOP3
+#define ASM_NOP4 K8_NOP4
+#define ASM_NOP5 K8_NOP5
+#define ASM_NOP6 K8_NOP6
+#define ASM_NOP7 K8_NOP7
+#define ASM_NOP8 K8_NOP8
+#else
+#define ASM_NOP1 GENERIC_NOP1
+#define ASM_NOP2 GENERIC_NOP2
+#define ASM_NOP3 GENERIC_NOP3
+#define ASM_NOP4 GENERIC_NOP4
+#define ASM_NOP5 GENERIC_NOP5
+#define ASM_NOP6 GENERIC_NOP6
+#define ASM_NOP7 GENERIC_NOP7
+#define ASM_NOP8 GENERIC_NOP8
+#endif
+
+#define ASM_NOP_MAX 8
+
+#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
new file mode 100644
index 000000000000..27da400d3138
--- /dev/null
+++ b/arch/x86/include/asm/numa.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "numa_32.h"
+#else
+# include "numa_64.h"
+#endif
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
new file mode 100644
index 000000000000..e9f5db796244
--- /dev/null
+++ b/arch/x86/include/asm/numa_32.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_NUMA_32_H
+#define _ASM_X86_NUMA_32_H
+
+extern int pxm_to_nid(int pxm);
+extern void numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA
+extern void set_highmem_pages_init(void);
+#endif
+
+#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
new file mode 100644
index 000000000000..064ed6df4cbe
--- /dev/null
+++ b/arch/x86/include/asm/numa_64.h
@@ -0,0 +1,43 @@
+#ifndef _ASM_X86_NUMA_64_H
+#define _ASM_X86_NUMA_64_H
+
+#include <linux/nodemask.h>
+#include <asm/apicdef.h>
+
+struct bootnode {
+ u64 start;
+ u64 end;
+};
+
+extern int compute_hash_shift(struct bootnode *nodes, int numblks,
+ int *nodeids);
+
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+
+extern void numa_init_array(void);
+extern int numa_off;
+
+extern void srat_reserve_add_area(int nodeid);
+extern int hotadd_percent;
+
+extern s16 apicid_to_node[MAX_LOCAL_APIC];
+
+extern unsigned long numa_free_all_bootmem(void);
+extern void setup_node_bootmem(int nodeid, unsigned long start,
+ unsigned long end);
+
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
+#else
+static inline void init_cpu_to_node(void) { }
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void numa_add_cpu(int cpu, int node) { }
+static inline void numa_remove_cpu(int cpu) { }
+#endif
+
+#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
new file mode 100644
index 000000000000..1e8bd30b4c16
--- /dev/null
+++ b/arch/x86/include/asm/numaq.h
@@ -0,0 +1,169 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_NUMAQ_H
+#define _ASM_X86_NUMAQ_H
+
+#ifdef CONFIG_X86_NUMAQ
+
+extern int found_numaq;
+extern int get_memcfg_numaq(void);
+
+/*
+ * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
+ */
+#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
+ quad space */
+
+/*
+ * Communication area for each processor on lynxer-processor tests.
+ *
+ * NOTE: If you change the size of this eachproc structure you need
+ * to change the definition for EACH_QUAD_SIZE.
+ */
+struct eachquadmem {
+ unsigned int priv_mem_start; /* Starting address of this */
+ /* quad's private memory. */
+ /* This is always 0. */
+ /* In MB. */
+ unsigned int priv_mem_size; /* Size of this quad's */
+ /* private memory. */
+ /* In MB. */
+ unsigned int low_shrd_mem_strp_start;/* Starting address of this */
+ /* quad's low shared block */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_start; /* Starting address of this */
+ /* quad's low shared memory */
+ /* (untranslated). */
+ /* In MB. */
+ unsigned int low_shrd_mem_size; /* Size of this quad's low */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int lmmio_copb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_copb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_start; /* Starting address of this */
+ /* quad's local memory */
+ /* mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB. */
+ unsigned int lmmio_nopb_size; /* Size of this quad's local */
+ /* memory mapped I/O in the */
+ /* non-compatibility OPB. */
+ /* In MB. */
+ unsigned int io_apic_0_start; /* Starting address of I/O */
+ /* APIC 0. */
+ unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
+ unsigned int io_apic_1_start; /* Starting address of I/O */
+ /* APIC 1. */
+ unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
+ unsigned int hi_shrd_mem_start; /* Starting address of this */
+ /* quad's high shared memory.*/
+ /* In MB. */
+ unsigned int hi_shrd_mem_size; /* Size of this quad's high */
+ /* shared memory. */
+ /* In MB. */
+ unsigned int mps_table_addr; /* Address of this quad's */
+ /* MPS tables from BIOS, */
+ /* in system space.*/
+ unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
+ /* local access of MDC. */
+ unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
+ /* remote access of MDC. */
+ unsigned int mm_port_io_start; /* Starting address of this */
+ /* quad's memory mapped Port */
+ /* I/O space. */
+ unsigned int mm_port_io_size; /* Size of this quad's memory*/
+ /* mapped Port I/O space. */
+ unsigned int mm_rmt_io_apic_start; /* Starting address of this */
+ /* quad's memory mapped */
+ /* remote I/O APIC space. */
+ unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
+ /* mapped remote I/O APIC */
+ /* space. */
+ unsigned int mm_isa_start; /* Starting address of this */
+ /* quad's memory mapped ISA */
+ /* space (contains MDC */
+ /* memory space). */
+ unsigned int mm_isa_size; /* Size of this quad's memory*/
+ /* mapped ISA space (contains*/
+ /* MDC memory space). */
+ unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
+ unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
+};
+
+/*
+ * Note: This structure must be NOT be changed unless the multiproc and
+ * OS are changed to reflect the new structure.
+ */
+struct sys_cfg_data {
+ unsigned int quad_id;
+ unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
+ unsigned int scd_version; /* Version number of this table. */
+ unsigned int first_quad_id;
+ unsigned int quads_present31_0; /* 1 bit for each quad */
+ unsigned int quads_present63_32; /* 1 bit for each quad */
+ unsigned int config_flags;
+ unsigned int boot_flags;
+ unsigned int csr_start_addr; /* Absolute value (not in MB) */
+ unsigned int csr_size; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
+ unsigned int lcl_apic_size; /* Absolute value (not in MB) */
+ unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
+ unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
+ /* may not be totally populated */
+ unsigned int split_mem_enbl; /* 0 for no low shared memory */
+ unsigned int mmio_sz; /* Size of total system memory mapped I/O */
+ /* (in MB). */
+ unsigned int quad_spin_lock; /* Spare location used for quad */
+ /* bringup. */
+ unsigned int nonzero55; /* For checksumming. */
+ unsigned int nonzeroaa; /* For checksumming. */
+ unsigned int scd_magic_number;
+ unsigned int system_type;
+ unsigned int checksum;
+ /*
+ * memory configuration area for each quad
+ */
+ struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
+};
+
+void numaq_tsc_disable(void);
+
+#else
+static inline int get_memcfg_numaq(void)
+{
+ return 0;
+}
+#endif /* CONFIG_X86_NUMAQ */
+#endif /* _ASM_X86_NUMAQ_H */
+
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
new file mode 100644
index 000000000000..0bf2a06b7a4e
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -0,0 +1,136 @@
+#ifndef __ASM_NUMAQ_APIC_H
+#define __ASM_NUMAQ_APIC_H
+
+#include <asm/io.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline cpumask_t target_cpus(void)
+{
+ return CPU_MASK_ALL;
+}
+
+#define NO_BALANCE_IRQ (1)
+#define esr_disable (1)
+
+#define INT_DELIVERY_MODE dest_LowestPrio
+#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return physid_isset(apicid, bitmap);
+}
+static inline unsigned long check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+#define apicid_cluster(apicid) (apicid & 0xF0)
+
+static inline int apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void init_apic_ldr(void)
+{
+ /* Already done in NUMA-Q firmware */
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ "NUMA-Q", nr_ioapics);
+}
+
+/*
+ * Skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum.
+ */
+static inline int multi_timer_check(int apic, int irq)
+{
+ return apic != 0 && irq == 0;
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
+{
+ /* We don't have a good way to do this yet - hack */
+ return physids_promote(0xFUL);
+}
+
+/* Mapping from cpu number to logical apicid */
+extern u8 cpu_2_logical_apicid[];
+static inline int cpu_to_logical_apicid(int cpu)
+{
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+}
+
+/*
+ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
+ * cpu to APIC ID relation to properly interact with the intelligent
+ * mode of the cluster controller.
+ */
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < 60)
+ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+ else
+ return BAD_APICID;
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+ return logical_apicid >> 4;
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
+{
+ int node = apicid_to_node(logical_apicid);
+ int cpu = __ffs(logical_apicid & 0xf);
+
+ return physid_mask_of_physid(cpu + 4*node);
+}
+
+extern void *xquad_portio;
+
+static inline void setup_portio_remap(void)
+{
+ int num_quads = num_online_nodes();
+
+ if (num_quads <= 1)
+ return;
+
+ printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
+ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
+ printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
+ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return (1);
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return (int) 0xF;
+}
+
+/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+#endif /* __ASM_NUMAQ_APIC_H */
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
new file mode 100644
index 000000000000..e012a46cc22a
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apicdef.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_NUMAQ_APICDEF_H
+#define __ASM_NUMAQ_APICDEF_H
+
+
+#define APIC_ID_MASK (0xF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (((x)>>24)&0x0F);
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
new file mode 100644
index 000000000000..935588d286cf
--- /dev/null
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_NUMAQ_IPI_H
+#define __ASM_NUMAQ_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
new file mode 100644
index 000000000000..252292e077b6
--- /dev/null
+++ b/arch/x86/include/asm/numaq/mpparse.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_NUMAQ_MPPARSE_H
+#define __ASM_NUMAQ_MPPARSE_H
+
+extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid);
+
+#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
new file mode 100644
index 000000000000..6f499df8eddb
--- /dev/null
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -0,0 +1,45 @@
+#ifndef __ASM_NUMAQ_WAKECPU_H
+#define __ASM_NUMAQ_WAKECPU_H
+
+/* This file copes with machines that wakeup secondary CPUs by NMIs */
+
+#define TRAMPOLINE_PHYS_LOW (0x8)
+#define TRAMPOLINE_PHYS_HIGH (0xa)
+
+/* We don't do anything here because we use NMI's to boot instead */
+static inline void wait_for_init_deassert(atomic_t *deassert)
+{
+}
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
+ */
+static inline void smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Storing NMI vector\n");
+ *high =
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH));
+ *low =
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));
+}
+
+static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
+{
+ printk("Restoring NMI vector\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ *high;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ *low;
+}
+
+static inline void inquire_remote_apic(int apicid)
+{
+}
+
+#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 000000000000..834a30295fab
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,132 @@
+/* OLPC machine specific definitions */
+
+#ifndef _ASM_X86_OLPC_H
+#define _ASM_X86_OLPC_H
+
+#include <asm/geode.h>
+
+struct olpc_platform_t {
+ int flags;
+ uint32_t boardrev;
+ int ecver;
+};
+
+#define OLPC_F_PRESENT 0x01
+#define OLPC_F_DCON 0x02
+#define OLPC_F_VSA 0x04
+
+#ifdef CONFIG_OLPC
+
+extern struct olpc_platform_t olpc_platform_info;
+
+/*
+ * OLPC board IDs contain the major build number within the mask 0x0ff0,
+ * and the minor build number withing 0x000f. Pre-builds have a minor
+ * number less than 8, and normal builds start at 8. For example, 0x0B10
+ * is a PreB1, and 0x0C18 is a C1.
+ */
+
+static inline uint32_t olpc_board(uint8_t id)
+{
+ return (id << 4) | 0x8;
+}
+
+static inline uint32_t olpc_board_pre(uint8_t id)
+{
+ return id << 4;
+}
+
+static inline int machine_is_olpc(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
+}
+
+/*
+ * The DCON is OLPC's Display Controller. It has a number of unique
+ * features that we might want to take advantage of..
+ */
+static inline int olpc_has_dcon(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
+}
+
+/*
+ * The VSA is software from AMD that typical Geode bioses will include.
+ * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
+ * not include the VSA; instead, PCI is emulated by the kernel.
+ *
+ * The VSA is described further in arch/x86/pci/olpc.c.
+ */
+static inline int olpc_has_vsa(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
+}
+
+/*
+ * The "Mass Production" version of OLPC's XO is identified as being model
+ * C2. During the prototype phase, the following models (in chronological
+ * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
+ * were based on Geode GX CPUs, and models after that were based upon
+ * Geode LX CPUs. There were also some hand-assembled models floating
+ * around, referred to as PreB1, PreB2, etc.
+ */
+static inline int olpc_board_at_least(uint32_t rev)
+{
+ return olpc_platform_info.boardrev >= rev;
+}
+
+#else
+
+static inline int machine_is_olpc(void)
+{
+ return 0;
+}
+
+static inline int olpc_has_dcon(void)
+{
+ return 0;
+}
+
+static inline int olpc_has_vsa(void)
+{
+ return 0;
+}
+
+#endif
+
+/* EC related functions */
+
+extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
+ unsigned char *outbuf, size_t outlen);
+
+extern int olpc_ec_mask_set(uint8_t bits);
+extern int olpc_ec_mask_unset(uint8_t bits);
+
+/* EC commands */
+
+#define EC_FIRMWARE_REV 0x08
+
+/* SCI source values */
+
+#define EC_SCI_SRC_EMPTY 0x00
+#define EC_SCI_SRC_GAME 0x01
+#define EC_SCI_SRC_BATTERY 0x02
+#define EC_SCI_SRC_BATSOC 0x04
+#define EC_SCI_SRC_BATERR 0x08
+#define EC_SCI_SRC_EBOOK 0x10
+#define EC_SCI_SRC_WLAN 0x20
+#define EC_SCI_SRC_ACPWR 0x40
+#define EC_SCI_SRC_ALL 0x7F
+
+/* GPIO assignments */
+
+#define OLPC_GPIO_MIC_AC geode_gpio(1)
+#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
+#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
+#define OLPC_GPIO_SMB_CLK geode_gpio(14)
+#define OLPC_GPIO_SMB_DATA geode_gpio(15)
+#define OLPC_GPIO_WORKAUX geode_gpio(24)
+#define OLPC_GPIO_LID geode_gpio(26)
+#define OLPC_GPIO_ECSCI geode_gpio(27)
+
+#endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
new file mode 100644
index 000000000000..e9873a2e8695
--- /dev/null
+++ b/arch/x86/include/asm/page.h
@@ -0,0 +1,209 @@
+#ifndef _ASM_X86_PAGE_H
+#define _ASM_X86_PAGE_H
+
+#include <linux/const.h>
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#ifdef __KERNEL__
+
+#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
+
+/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+ virtual addresses are 32-bits but physical addresses are larger
+ (ie, 32-bit PAE). */
+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define HPAGE_SHIFT PMD_SHIFT
+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+#define HUGE_MAX_HSTATE 2
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64.h>
+#else
+#include <asm/page_32.h>
+#endif /* CONFIG_X86_64 */
+
+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+
+#ifndef __ASSEMBLY__
+
+typedef struct { pgdval_t pgd; } pgd_t;
+typedef struct { pgprotval_t pgprot; } pgprot_t;
+
+extern int page_is_ram(unsigned long pagenr);
+extern int pagerange_is_ram(unsigned long start, unsigned long end);
+extern int devmem_is_allowed(unsigned long pagenr);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+ pgprot_t vma_prot);
+
+extern unsigned long max_low_pfn_mapped;
+extern unsigned long max_pfn_mapped;
+
+struct page;
+
+static inline void clear_user_page(void *page, unsigned long vaddr,
+ struct page *pg)
+{
+ clear_page(page);
+}
+
+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
+ struct page *topage)
+{
+ copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+static inline pgd_t native_make_pgd(pgdval_t val)
+{
+ return (pgd_t) { val };
+}
+
+static inline pgdval_t native_pgd_val(pgd_t pgd)
+{
+ return pgd.pgd;
+}
+
+#if PAGETABLE_LEVELS >= 3
+#if PAGETABLE_LEVELS == 4
+typedef struct { pudval_t pud; } pud_t;
+
+static inline pud_t native_make_pud(pmdval_t val)
+{
+ return (pud_t) { val };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+#else /* PAGETABLE_LEVELS == 3 */
+#include <asm-generic/pgtable-nopud.h>
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return native_pgd_val(pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
+typedef struct { pmdval_t pmd; } pmd_t;
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { val };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+#else /* PAGETABLE_LEVELS == 2 */
+#include <asm-generic/pgtable-nopmd.h>
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return native_pgd_val(pmd.pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static inline pte_t native_make_pte(pteval_t val)
+{
+ return (pte_t) { .pte = val };
+}
+
+static inline pteval_t native_pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pteval_t native_pte_flags(pte_t pte)
+{
+ return native_pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define pgd_val(x) native_pgd_val(x)
+#define __pgd(x) native_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) native_pud_val(x)
+#define __pud(x) native_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) native_pmd_val(x)
+#define __pmd(x) native_make_pmd(x)
+#endif
+
+#define pte_val(x) native_pte_val(x)
+#define pte_flags(x) native_pte_flags(x)
+#define __pte(x) native_make_pte(x)
+
+#endif /* CONFIG_PARAVIRT */
+
+#define __pa(x) __phys_addr((unsigned long)(x))
+#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+ This seems to be the official gcc blessed way to do such arithmetic. */
+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+
+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+
+#define __boot_va(x) __va(x)
+#define __boot_pa(x) __pa(x)
+
+/*
+ * virt_to_page(kaddr) returns a valid pointer if and only if
+ * virt_addr_valid(kaddr) returns true.
+ */
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+extern bool __virt_addr_valid(unsigned long kaddr);
+#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+
+#endif /* __ASSEMBLY__ */
+
+#include <asm-generic/memory_model.h>
+#include <asm-generic/page.h>
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
new file mode 100644
index 000000000000..bcde0d7b4325
--- /dev/null
+++ b/arch/x86/include/asm/page_32.h
@@ -0,0 +1,136 @@
+#ifndef _ASM_X86_PAGE_32_H
+#define _ASM_X86_PAGE_32_H
+
+/*
+ * This handles the memory map.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB.
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
+
+#ifdef CONFIG_4KSTACKS
+#define THREAD_ORDER 0
+#else
+#define THREAD_ORDER 1
+#endif
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+
+#define STACKFAULT_STACK 0
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 0
+#define DEBUG_STACK 0
+#define MCE_STACK 0
+#define N_EXCEPTION_STACKS 1
+
+#ifdef CONFIG_X86_PAE
+/* 44=32+12, the limit we can fit into an unsigned long pfn */
+#define __PHYSICAL_MASK_SHIFT 44
+#define __VIRTUAL_MASK_SHIFT 32
+#define PAGETABLE_LEVELS 3
+
+#ifndef __ASSEMBLY__
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* __ASSEMBLY__
+ */
+#else /* !CONFIG_X86_PAE */
+#define __PHYSICAL_MASK_SHIFT 32
+#define __VIRTUAL_MASK_SHIFT 32
+#define PAGETABLE_LEVELS 2
+
+#ifndef __ASSEMBLY__
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union {
+ pteval_t pte;
+ pteval_t pte_low;
+} pte_t;
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_X86_PAE */
+
+#ifndef __ASSEMBLY__
+typedef struct page *pgtable_t;
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
+#ifndef __ASSEMBLY__
+#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#endif
+#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+
+extern int nx_enabled;
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+extern int sysctl_legacy_va_layout;
+
+extern void find_low_pfn_range(void);
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+extern void initmem_init(unsigned long, unsigned long);
+extern void free_initmem(void);
+extern void setup_bootmem_allocator(void);
+
+
+#ifdef CONFIG_X86_USE_3DNOW
+#include <asm/mmx.h>
+
+static inline void clear_page(void *page)
+{
+ mmx_clear_page(page);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ mmx_copy_page(to, from);
+}
+#else /* !CONFIG_X86_USE_3DNOW */
+#include <linux/string.h>
+
+static inline void clear_page(void *page)
+{
+ memset(page, 0, PAGE_SIZE);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ memcpy(to, from, PAGE_SIZE);
+}
+#endif /* CONFIG_X86_3DNOW */
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
new file mode 100644
index 000000000000..5ebca29f44f0
--- /dev/null
+++ b/arch/x86/include/asm/page_64.h
@@ -0,0 +1,105 @@
+#ifndef _ASM_X86_PAGE_64_H
+#define _ASM_X86_PAGE_64_H
+
+#define PAGETABLE_LEVELS 4
+
+#define THREAD_ORDER 1
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#define CURRENT_MASK (~(THREAD_SIZE - 1))
+
+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
+
+#define IRQSTACK_ORDER 2
+#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
+
+#define STACKFAULT_STACK 1
+#define DOUBLEFAULT_STACK 2
+#define NMI_STACK 3
+#define DEBUG_STACK 4
+#define MCE_STACK 5
+#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
+/*
+ * Set __PAGE_OFFSET to the most negative possible address +
+ * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
+ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
+ * what Xen requires.
+ */
+#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
+
+#define __PHYSICAL_START CONFIG_PHYSICAL_START
+#define __KERNEL_ALIGN 0x200000
+
+/*
+ * Make sure kernel is aligned to 2MB address. Catching it at compile
+ * time is better. Change your config file and compile the kernel
+ * for a 2MB aligned address (CONFIG_PHYSICAL_START)
+ */
+#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
+#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
+#endif
+
+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
+
+/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+#define __PHYSICAL_MASK_SHIFT 46
+#define __VIRTUAL_MASK_SHIFT 48
+
+/*
+ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
+ * arch/x86/kernel/head_64.S), and it is mapped here:
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
+
+#ifndef __ASSEMBLY__
+void clear_page(void *page);
+void copy_page(void *to, void *from);
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+extern unsigned long __phys_addr(unsigned long);
+#define __phys_reloc_hide(x) (x)
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef struct page *pgtable_t;
+
+typedef struct { pteval_t pte; } pte_t;
+
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_pfn)
+#endif
+
+
+#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
new file mode 100644
index 000000000000..6f0d0422f4ca
--- /dev/null
+++ b/arch/x86/include/asm/param.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_PARAM_H
+#define _ASM_X86_PARAM_H
+
+#ifdef __KERNEL__
+# define HZ CONFIG_HZ /* Internal kernel timer frequency */
+# define USER_HZ 100 /* some user interfaces are */
+# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
+#endif
+
+#ifndef HZ
+#define HZ 100
+#endif
+
+#define EXEC_PAGESIZE 4096
+
+#ifndef NOGROUP
+#define NOGROUP (-1)
+#endif
+
+#define MAXHOSTNAMELEN 64 /* max length of hostname */
+
+#endif /* _ASM_X86_PARAM_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
new file mode 100644
index 000000000000..ba3e2ff6aedc
--- /dev/null
+++ b/arch/x86/include/asm/paravirt.h
@@ -0,0 +1,1650 @@
+#ifndef _ASM_X86_PARAVIRT_H
+#define _ASM_X86_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/page.h>
+#include <asm/asm.h>
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0
+#define CLBR_EAX (1 << 0)
+#define CLBR_ECX (1 << 1)
+#define CLBR_EDX (1 << 2)
+
+#ifdef CONFIG_X86_64
+#define CLBR_RSI (1 << 3)
+#define CLBR_RDI (1 << 4)
+#define CLBR_R8 (1 << 5)
+#define CLBR_R9 (1 << 6)
+#define CLBR_R10 (1 << 7)
+#define CLBR_R11 (1 << 8)
+#define CLBR_ANY ((1 << 9) - 1)
+#include <asm/desc_defs.h>
+#else
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY ((1 << 3) - 1)
+#endif /* X86_64 */
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/cpumask.h>
+#include <asm/kmap_types.h>
+#include <asm/desc_defs.h>
+
+struct page;
+struct thread_struct;
+struct desc_ptr;
+struct tss_struct;
+struct mm_struct;
+struct desc_struct;
+
+/* general info */
+struct pv_info {
+ unsigned int kernel_rpl;
+ int shared_kernel_pmd;
+ int paravirt_enabled;
+ const char *name;
+};
+
+struct pv_init_ops {
+ /*
+ * Patch may replace one of the defined code sequences with
+ * arbitrary code, subject to the same register constraints.
+ * This generally means the code is not free to clobber any
+ * registers other than EAX. The patch function should return
+ * the number of bytes of code generated, as we nop pad the
+ * rest in generic code.
+ */
+ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+ /* Basic arch-specific setup */
+ void (*arch_setup)(void);
+ char *(*memory_setup)(void);
+ void (*post_allocator_init)(void);
+
+ /* Print a banner to identify the environment */
+ void (*banner)(void);
+};
+
+
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+};
+
+struct pv_time_ops {
+ void (*time_init)(void);
+
+ /* Set and set time of day */
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long);
+
+ unsigned long long (*sched_clock)(void);
+ unsigned long (*get_tsc_khz)(void);
+};
+
+struct pv_cpu_ops {
+ /* hooks for various privileged instructions */
+ unsigned long (*get_debugreg)(int regno);
+ void (*set_debugreg)(int regno, unsigned long value);
+
+ void (*clts)(void);
+
+ unsigned long (*read_cr0)(void);
+ void (*write_cr0)(unsigned long);
+
+ unsigned long (*read_cr4_safe)(void);
+ unsigned long (*read_cr4)(void);
+ void (*write_cr4)(unsigned long);
+
+#ifdef CONFIG_X86_64
+ unsigned long (*read_cr8)(void);
+ void (*write_cr8)(unsigned long);
+#endif
+
+ /* Segment descriptor handling */
+ void (*load_tr_desc)(void);
+ void (*load_gdt)(const struct desc_ptr *);
+ void (*load_idt)(const struct desc_ptr *);
+ void (*store_gdt)(struct desc_ptr *);
+ void (*store_idt)(struct desc_ptr *);
+ void (*set_ldt)(const void *desc, unsigned entries);
+ unsigned long (*store_tr)(void);
+ void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+#ifdef CONFIG_X86_64
+ void (*load_gs_index)(unsigned int idx);
+#endif
+ void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+ const void *desc);
+ void (*write_gdt_entry)(struct desc_struct *,
+ int entrynum, const void *desc, int size);
+ void (*write_idt_entry)(gate_desc *,
+ int entrynum, const gate_desc *gate);
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+ void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+
+ void (*set_iopl_mask)(unsigned mask);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* MSR, PMC and TSR operations.
+ err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (*read_msr_amd)(unsigned int msr, int *err);
+ u64 (*read_msr)(unsigned int msr, int *err);
+ int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+
+ u64 (*read_tsc)(void);
+ u64 (*read_pmc)(int counter);
+ unsigned long long (*read_tscp)(unsigned int *aux);
+
+ /*
+ * Atomically enable interrupts and return to userspace. This
+ * is only ever used to return to 32-bit processes; in a
+ * 64-bit kernel, it's used for 32-on-64 compat processes, but
+ * never native 64-bit processes. (Jump, not call.)
+ */
+ void (*irq_enable_sysexit)(void);
+
+ /*
+ * Switch to usermode gs and return to 64-bit usermode using
+ * sysret. Only used in 64-bit kernels to return to 64-bit
+ * processes. Usermode register state, including %rsp, must
+ * already be restored.
+ */
+ void (*usergs_sysret64)(void);
+
+ /*
+ * Switch to usermode gs and return to 32-bit usermode using
+ * sysret. Used to return to 32-on-64 compat processes.
+ * Other usermode register state, including %esp, must already
+ * be restored.
+ */
+ void (*usergs_sysret32)(void);
+
+ /* Normal iret. Jump to this with the standard iret stack
+ frame set up. */
+ void (*iret)(void);
+
+ void (*swapgs)(void);
+
+ struct pv_lazy_ops lazy_mode;
+};
+
+struct pv_irq_ops {
+ void (*init_IRQ)(void);
+
+ /*
+ * Get/set interrupt state. save_fl and restore_fl are only
+ * expected to use X86_EFLAGS_IF; all other bits
+ * returned from save_fl are undefined, and may be ignored by
+ * restore_fl.
+ */
+ unsigned long (*save_fl)(void);
+ void (*restore_fl)(unsigned long);
+ void (*irq_disable)(void);
+ void (*irq_enable)(void);
+ void (*safe_halt)(void);
+ void (*halt)(void);
+
+#ifdef CONFIG_X86_64
+ void (*adjust_exception_frame)(void);
+#endif
+};
+
+struct pv_apic_ops {
+#ifdef CONFIG_X86_LOCAL_APIC
+ void (*setup_boot_clock)(void);
+ void (*setup_secondary_clock)(void);
+
+ void (*startup_ipi_hook)(int phys_apicid,
+ unsigned long start_eip,
+ unsigned long start_esp);
+#endif
+};
+
+struct pv_mmu_ops {
+ /*
+ * Called before/after init_mm pagetable setup. setup_start
+ * may reset %cr3, and may pre-install parts of the pagetable;
+ * pagetable setup is expected to preserve any existing
+ * mapping.
+ */
+ void (*pagetable_setup_start)(pgd_t *pgd_base);
+ void (*pagetable_setup_done)(pgd_t *pgd_base);
+
+ unsigned long (*read_cr2)(void);
+ void (*write_cr2)(unsigned long);
+
+ unsigned long (*read_cr3)(void);
+ void (*write_cr3)(unsigned long);
+
+ /*
+ * Hooks for intercepting the creation/use/destruction of an
+ * mm_struct.
+ */
+ void (*activate_mm)(struct mm_struct *prev,
+ struct mm_struct *next);
+ void (*dup_mmap)(struct mm_struct *oldmm,
+ struct mm_struct *mm);
+ void (*exit_mmap)(struct mm_struct *mm);
+
+
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_single)(unsigned long addr);
+ void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
+ unsigned long va);
+
+ /* Hooks for allocating and freeing a pagetable top-level */
+ int (*pgd_alloc)(struct mm_struct *mm);
+ void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
+
+ /*
+ * Hooks for allocating/releasing pagetable pages when they're
+ * attached to a pagetable
+ */
+ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
+ void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*release_pte)(unsigned long pfn);
+ void (*release_pmd)(unsigned long pfn);
+ void (*release_pud)(unsigned long pfn);
+
+ /* Pagetable manipulation functions */
+ void (*set_pte)(pte_t *ptep, pte_t pteval);
+ void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+ void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (*pte_update)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*pte_update_defer)(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+
+ pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+
+ pteval_t (*pte_val)(pte_t);
+ pteval_t (*pte_flags)(pte_t);
+ pte_t (*make_pte)(pteval_t pte);
+
+ pgdval_t (*pgd_val)(pgd_t);
+ pgd_t (*make_pgd)(pgdval_t pgd);
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
+ void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+ void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*pmd_clear)(pmd_t *pmdp);
+
+#endif /* CONFIG_X86_PAE */
+
+ void (*set_pud)(pud_t *pudp, pud_t pudval);
+
+ pmdval_t (*pmd_val)(pmd_t);
+ pmd_t (*make_pmd)(pmdval_t pmd);
+
+#if PAGETABLE_LEVELS == 4
+ pudval_t (*pud_val)(pud_t);
+ pud_t (*make_pud)(pudval_t pud);
+
+ void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
+#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_HIGHPTE
+ void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
+#endif
+
+ struct pv_lazy_ops lazy_mode;
+
+ /* dom0 ops */
+
+ /* Sometimes the physical address is a pfn, and sometimes its
+ an mfn. We can tell which is which from the index. */
+ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
+ unsigned long phys, pgprot_t flags);
+};
+
+struct raw_spinlock;
+struct pv_lock_ops {
+ int (*spin_is_locked)(struct raw_spinlock *lock);
+ int (*spin_is_contended)(struct raw_spinlock *lock);
+ void (*spin_lock)(struct raw_spinlock *lock);
+ void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
+ int (*spin_trylock)(struct raw_spinlock *lock);
+ void (*spin_unlock)(struct raw_spinlock *lock);
+};
+
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template {
+ struct pv_init_ops pv_init_ops;
+ struct pv_time_ops pv_time_ops;
+ struct pv_cpu_ops pv_cpu_ops;
+ struct pv_irq_ops pv_irq_ops;
+ struct pv_apic_ops pv_apic_ops;
+ struct pv_mmu_ops pv_mmu_ops;
+ struct pv_lock_ops pv_lock_ops;
+};
+
+extern struct pv_info pv_info;
+extern struct pv_init_ops pv_init_ops;
+extern struct pv_time_ops pv_time_ops;
+extern struct pv_cpu_ops pv_cpu_ops;
+extern struct pv_irq_ops pv_irq_ops;
+extern struct pv_apic_ops pv_apic_ops;
+extern struct pv_mmu_ops pv_mmu_ops;
+extern struct pv_lock_ops pv_lock_ops;
+
+#define PARAVIRT_PATCH(x) \
+ (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
+
+#define paravirt_type(op) \
+ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
+ [paravirt_opptr] "m" (op)
+#define paravirt_clobber(clobber) \
+ [paravirt_clobber] "i" (clobber)
+
+/*
+ * Generate some code, and mark it as patchable by the
+ * apply_paravirt() alternate instruction patcher.
+ */
+#define _paravirt_alt(insn_string, type, clobber) \
+ "771:\n\t" insn_string "\n" "772:\n" \
+ ".pushsection .parainstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR " 771b\n" \
+ " .byte " type "\n" \
+ " .byte 772b-771b\n" \
+ " .short " clobber "\n" \
+ ".popsection\n"
+
+/* Generate patchable code, with the default asm parameters. */
+#define paravirt_alt(insn_string) \
+ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(ops, name, code) \
+ extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ignore(unsigned len);
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len);
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len);
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len);
+
+int paravirt_disable_iospace(void);
+
+/*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL "call *%[paravirt_opptr];"
+
+/*
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
+ * runtime.
+ *
+ * Normally, a call to a pv_op function is a simple indirect call:
+ * (pv_op_struct.operations)(args...).
+ *
+ * Unfortunately, this is a relatively slow operation for modern CPUs,
+ * because it cannot necessarily determine what the destination
+ * address is. In this case, the address is a runtime constant, so at
+ * the very least we can patch the call to e a simple direct call, or
+ * ideally, patch an inline implementation into the callsite. (Direct
+ * calls are essentially free, because the call and return addresses
+ * are completely predictable.)
+ *
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
+ * convention, in which the first three arguments are placed in %eax,
+ * %edx, %ecx (in that order), and the remaining arguments are placed
+ * on the stack. All caller-save registers (eax,edx,ecx) are expected
+ * to be modified (either clobbered or used for return values).
+ * X86_64, on the other hand, already specifies a register-based calling
+ * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also have to clobber all caller saved registers, which
+ * unfortunately, are quite a bit (r8 - r11)
+ *
+ * The call instruction itself is marked by placing its start address
+ * and size into the .parainstructions section, so that
+ * apply_paravirt() in arch/i386/kernel/alternative.c can do the
+ * appropriate patching under the control of the backend pv_init_ops
+ * implementation.
+ *
+ * Unfortunately there's no way to get gcc to generate the args setup
+ * for the call, and then allow the call itself to be generated by an
+ * inline asm. Because of this, we must do the complete arg setup and
+ * return value handling from within these macros. This is fairly
+ * cumbersome.
+ *
+ * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little
+ * to be gained from that. For each number of arguments, there are
+ * the two VCALL and CALL variants for void and non-void functions.
+ *
+ * When there is a return value, the invoker of the macro must specify
+ * the return type. The macro then uses sizeof() on that type to
+ * determine whether its a 32 or 64 bit value, and places the return
+ * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
+ * 64-bit). For x86_64 machines, it just returns at %rax regardless of
+ * the return value size.
+ *
+ * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
+ * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
+ * in low,high order
+ *
+ * Small structures are passed and returned in registers. The macro
+ * calling convention can't directly deal with this, so the wrapper
+ * functions must do this.
+ *
+ * These PVOP_* macros are only defined within this header. This
+ * means that all uses must be wrapped in inline functions. This also
+ * makes sure the incoming and outgoing types are always correct.
+ */
+#ifdef CONFIG_X86_32
+#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
+#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
+#define EXTRA_CLOBBERS
+#define VEXTRA_CLOBBERS
+#else
+#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
+#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
+ "=S" (__esi), "=d" (__edx), \
+ "=c" (__ecx)
+
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+
+#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
+#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
+#endif
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
+#else
+#define PVOP_TEST_NULL(op) ((void)op)
+#endif
+
+#define __PVOP_CALL(rettype, op, pre, post, ...) \
+ ({ \
+ rettype __ret; \
+ PVOP_CALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ /* This is 32-bit specific, but is okay in 64-bit */ \
+ /* since this condition will never hold */ \
+ if (sizeof(rettype) > sizeof(unsigned long)) { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_CALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" EXTRA_CLOBBERS); \
+ __ret = (rettype)((((u64)__edx) << 32) | __eax); \
+ } else { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_CALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" EXTRA_CLOBBERS); \
+ __ret = (rettype)__eax; \
+ } \
+ __ret; \
+ })
+#define __PVOP_VCALL(op, pre, post, ...) \
+ ({ \
+ PVOP_VCALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : PVOP_VCALL_CLOBBERS \
+ : paravirt_type(op), \
+ paravirt_clobber(CLBR_ANY), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" VEXTRA_CLOBBERS); \
+ })
+
+#define PVOP_CALL0(rettype, op) \
+ __PVOP_CALL(rettype, op, "", "")
+#define PVOP_VCALL0(op) \
+ __PVOP_VCALL(op, "", "")
+
+#define PVOP_CALL1(rettype, op, arg1) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+#define PVOP_VCALL1(op, arg1) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+
+#define PVOP_CALL2(rettype, op, arg1, arg2) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1" ((unsigned long)(arg2)))
+#define PVOP_VCALL2(op, arg1, arg2) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1" ((unsigned long)(arg2)))
+
+#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+#define PVOP_VCALL3(op, arg1, arg2, arg3) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+
+/* This is the only difference in x86_64. We can make it much simpler */
+#ifdef CONFIG_X86_32
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
+ "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
+ "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#else
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
+ "3"((unsigned long)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
+ "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
+ "3"((unsigned long)(arg4)))
+#endif
+
+static inline int paravirt_enabled(void)
+{
+ return pv_info.paravirt_enabled;
+}
+
+static inline void load_sp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+}
+
+#define ARCH_SETUP pv_init_ops.arch_setup();
+static inline unsigned long get_wallclock(void)
+{
+ return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+ return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
+}
+
+static inline void (*choose_time_init(void))(void)
+{
+ return pv_time_ops.time_init;
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+static inline unsigned long paravirt_get_debugreg(int reg)
+{
+ return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+}
+#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
+static inline void set_debugreg(unsigned long val, int reg)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
+}
+
+static inline void clts(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.clts);
+}
+
+static inline unsigned long read_cr0(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
+}
+static inline unsigned long read_cr4_safe(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_cr8(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+}
+#endif
+
+static inline void raw_safe_halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
+}
+
+static inline void halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
+}
+
+static inline void wbinvd(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.wbinvd);
+}
+
+#define get_kernel_rpl() (pv_info.kernel_rpl)
+
+static inline u64 paravirt_read_msr(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
+}
+static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
+}
+static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
+{
+ return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
+}
+
+/* These should all do BUG_ON(_err), but our headers are too tangled. */
+#define rdmsr(msr, val1, val2) \
+do { \
+ int _err; \
+ u64 _l = paravirt_read_msr(msr, &_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while (0)
+
+#define wrmsr(msr, val1, val2) \
+do { \
+ paravirt_write_msr(msr, val1, val2); \
+} while (0)
+
+#define rdmsrl(msr, val) \
+do { \
+ int _err; \
+ val = paravirt_read_msr(msr, &_err); \
+} while (0)
+
+#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, a, b) \
+({ \
+ int _err; \
+ u64 _l = paravirt_read_msr(msr, &_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; \
+})
+
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = paravirt_read_msr(msr, &err);
+ return err;
+}
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = paravirt_read_msr_amd(msr, &err);
+ return err;
+}
+
+static inline u64 paravirt_read_tsc(void)
+{
+ return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
+}
+
+#define rdtscl(low) \
+do { \
+ u64 _l = paravirt_read_tsc(); \
+ low = (int)_l; \
+} while (0)
+
+#define rdtscll(val) (val = paravirt_read_tsc())
+
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
+}
+#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
+
+static inline unsigned long long paravirt_read_pmc(int counter)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
+}
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = paravirt_read_pmc(counter); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while (0)
+
+static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
+}
+
+#define rdtscp(low, high, aux) \
+do { \
+ int __aux; \
+ unsigned long __val = paravirt_rdtscp(&__aux); \
+ (low) = (u32)__val; \
+ (high) = (u32)(__val >> 32); \
+ (aux) = __aux; \
+} while (0)
+
+#define rdtscpll(val, aux) \
+do { \
+ unsigned long __aux; \
+ val = paravirt_rdtscp(&__aux); \
+ (aux) = __aux; \
+} while (0)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
+}
+
+static inline void load_TR_desc(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
+}
+static inline void load_gdt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
+}
+static inline void load_idt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
+}
+static inline void set_ldt(const void *addr, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
+}
+static inline void store_gdt(struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
+}
+static inline void store_idt(struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
+}
+static inline unsigned long paravirt_store_tr(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
+}
+#define store_tr(tr) ((tr) = paravirt_store_tr())
+static inline void load_TLS(struct thread_struct *t, unsigned cpu)
+{
+ PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
+}
+
+#ifdef CONFIG_X86_64
+static inline void load_gs_index(unsigned int gs)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+}
+#endif
+
+static inline void write_ldt_entry(struct desc_struct *dt, int entry,
+ const void *desc)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
+}
+
+static inline void write_gdt_entry(struct desc_struct *dt, int entry,
+ void *desc, int type)
+{
+ PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
+}
+
+static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+}
+static inline void set_iopl_mask(unsigned mask)
+{
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
+}
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ pv_cpu_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+#endif
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline void setup_boot_clock(void)
+{
+ PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
+}
+
+static inline void setup_secondary_clock(void)
+{
+ PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
+}
+#endif
+
+static inline void paravirt_post_allocator_init(void)
+{
+ if (pv_init_ops.post_allocator_init)
+ (*pv_init_ops.post_allocator_init)();
+}
+
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+ (*pv_mmu_ops.pagetable_setup_start)(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+ (*pv_mmu_ops.pagetable_setup_done)(base);
+}
+
+#ifdef CONFIG_SMP
+static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+ unsigned long start_esp)
+{
+ PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
+ phys_apicid, start_eip, start_esp);
+}
+#endif
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
+}
+
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+ PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
+}
+
+static inline void __flush_tlb(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
+}
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
+}
+static inline void __flush_tlb_single(unsigned long addr)
+{
+ PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
+}
+
+static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+ unsigned long va)
+{
+ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
+}
+
+static inline int paravirt_pgd_alloc(struct mm_struct *mm)
+{
+ return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+}
+
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+}
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
+}
+static inline void paravirt_release_pte(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
+}
+
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
+}
+
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count)
+{
+ PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
+}
+static inline void paravirt_release_pmd(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
+}
+
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+}
+static inline void paravirt_release_pud(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+}
+
+#ifdef CONFIG_HIGHPTE
+static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ unsigned long ret;
+ ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
+ return (void *)ret;
+}
+#endif
+
+static inline void pte_update(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
+}
+
+static inline pte_t __pte(pteval_t val)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t,
+ pv_mmu_ops.make_pte,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t,
+ pv_mmu_ops.make_pte,
+ val);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline pteval_t pte_val(pte_t pte)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte);
+
+ return ret;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
+ pte.pte);
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+ BUG_ON(ret & PTE_PFN_MASK);
+#endif
+ return ret;
+}
+
+static inline pgd_t __pgd(pgdval_t val)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
+ val);
+
+ return (pgd_t) { ret };
+}
+
+static inline pgdval_t pgd_val(pgd_t pgd)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd, (u64)pgd.pgd >> 32);
+ else
+ ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd);
+
+ return ret;
+}
+
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pteval_t ret;
+
+ ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
+ mm, addr, ptep);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
+ mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
+ pte.pte);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ pmdval_t val = native_pmd_val(pmd);
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
+}
+
+#if PAGETABLE_LEVELS >= 3
+static inline pmd_t __pmd(pmdval_t val)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
+ val);
+
+ return (pmd_t) { ret };
+}
+
+static inline pmdval_t pmd_val(pmd_t pmd)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd, (u64)pmd.pmd >> 32);
+ else
+ ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd);
+
+ return ret;
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pud)
+{
+ pudval_t val = native_pud_val(pud);
+
+ if (sizeof(pudval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
+ val);
+}
+#if PAGETABLE_LEVELS == 4
+static inline pud_t __pud(pudval_t val)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
+ val);
+
+ return (pud_t) { ret };
+}
+
+static inline pudval_t pud_val(pud_t pud)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud, (u64)pud.pud >> 32);
+ else
+ ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud);
+
+ return ret;
+}
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ pgdval_t val = native_pgd_val(pgd);
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
+ val);
+}
+
+static inline void pgd_clear(pgd_t *pgdp)
+{
+ set_pgd(pgdp, __pgd(0));
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ set_pud(pudp, __pud(0));
+}
+
+#endif /* PAGETABLE_LEVELS == 4 */
+
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_X86_PAE
+/* Special-case pte-setting operations for PAE, which can't update a
+ 64-bit pte atomically */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
+ pte.pte, pte.pte >> 32);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /* 5 arg words */
+ pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+}
+#else /* !CONFIG_X86_PAE */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_pte(ptep, pte);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte(ptep, pte);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ set_pte_at(mm, addr, ptep, __pte(0));
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ set_pmd(pmdp, __pmd(0));
+}
+#endif /* CONFIG_X86_PAE */
+
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
+void paravirt_enter_lazy_cpu(void);
+void paravirt_leave_lazy_cpu(void);
+void paravirt_enter_lazy_mmu(void);
+void paravirt_leave_lazy_mmu(void);
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
+
+#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
+static inline void arch_enter_lazy_cpu_mode(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+}
+
+static inline void arch_leave_lazy_cpu_mode(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+}
+
+static inline void arch_flush_lazy_cpu_mode(void)
+{
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
+ arch_leave_lazy_cpu_mode();
+ arch_enter_lazy_cpu_mode();
+ }
+}
+
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
+}
+
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+}
+
+static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ unsigned long phys, pgprot_t flags)
+{
+ pv_mmu_ops.set_fixmap(idx, phys, flags);
+}
+
+void _paravirt_nop(void);
+#define paravirt_nop ((void *)_paravirt_nop)
+
+void paravirt_use_bytelocks(void);
+
+#ifdef CONFIG_SMP
+
+static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+}
+
+static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+}
+
+static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+{
+ PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
+ unsigned long flags)
+{
+ PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
+}
+
+static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
+}
+
+static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+{
+ PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+}
+
+#endif
+
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch_site {
+ u8 *instr; /* original instructions */
+ u8 instrtype; /* type of this instruction */
+ u8 len; /* length of original instruction */
+ u16 clobbers; /* what registers you may clobber */
+};
+
+extern struct paravirt_patch_site __parainstructions[],
+ __parainstructions_end[];
+
+#ifdef CONFIG_X86_32
+#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
+#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_FLAGS_ARG "0"
+#define PV_EXTRA_CLOBBERS
+#define PV_VEXTRA_CLOBBERS
+#else
+/* We save some registers, but all of them, that's too much. We clobber all
+ * caller saved registers but the argument parameter */
+#define PV_SAVE_REGS "pushq %%rdi;"
+#define PV_RESTORE_REGS "popq %%rdi;"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
+#define PV_FLAGS_ARG "D"
+#endif
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ unsigned long f;
+
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ : "=a"(f)
+ : paravirt_type(pv_irq_ops.save_fl),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "cc" PV_VEXTRA_CLOBBERS);
+ return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ : "=a"(f)
+ : PV_FLAGS_ARG(f),
+ paravirt_type(pv_irq_ops.restore_fl),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ :
+ : paravirt_type(pv_irq_ops.irq_disable),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ asm volatile(paravirt_alt(PV_SAVE_REGS
+ PARAVIRT_CALL
+ PV_RESTORE_REGS)
+ :
+ : paravirt_type(pv_irq_ops.irq_enable),
+ paravirt_clobber(CLBR_EAX)
+ : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long f;
+
+ f = __raw_local_save_flags();
+ raw_local_irq_disable();
+ return f;
+}
+
+
+/* Make sure as little as possible of this mess escapes. */
+#undef PARAVIRT_CALL
+#undef __PVOP_CALL
+#undef __PVOP_VCALL
+#undef PVOP_VCALL0
+#undef PVOP_CALL0
+#undef PVOP_VCALL1
+#undef PVOP_CALL1
+#undef PVOP_VCALL2
+#undef PVOP_CALL2
+#undef PVOP_VCALL3
+#undef PVOP_CALL3
+#undef PVOP_VCALL4
+#undef PVOP_CALL4
+
+#else /* __ASSEMBLY__ */
+
+#define _PVSITE(ptype, clobbers, ops, word, algn) \
+771:; \
+ ops; \
+772:; \
+ .pushsection .parainstructions,"a"; \
+ .align algn; \
+ word 771b; \
+ .byte ptype; \
+ .byte 772b-771b; \
+ .short clobbers; \
+ .popsection
+
+
+#ifdef CONFIG_X86_64
+#define PV_SAVE_REGS \
+ push %rax; \
+ push %rcx; \
+ push %rdx; \
+ push %rsi; \
+ push %rdi; \
+ push %r8; \
+ push %r9; \
+ push %r10; \
+ push %r11
+#define PV_RESTORE_REGS \
+ pop %r11; \
+ pop %r10; \
+ pop %r9; \
+ pop %r8; \
+ pop %rdi; \
+ pop %rsi; \
+ pop %rdx; \
+ pop %rcx; \
+ pop %rax
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+#define PARA_INDIRECT(addr) *addr(%rip)
+#else
+#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
+#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
+#define PARA_INDIRECT(addr) *%cs:addr
+#endif
+
+#define INTERRUPT_RETURN \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
+
+#define DISABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
+ PV_RESTORE_REGS;) \
+
+#define ENABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
+ PV_RESTORE_REGS;)
+
+#define USERGS_SYSRET32 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
+
+#ifdef CONFIG_X86_32
+#define GET_CR0_INTO_EAX \
+ push %ecx; push %edx; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
+ pop %edx; pop %ecx
+
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+
+
+#else /* !CONFIG_X86_32 */
+
+/*
+ * If swapgs is used while the userspace stack is still current,
+ * there's no way to call a pvop. The PV replacement *must* be
+ * inlined, or the swapgs instruction must be trapped and emulated.
+ */
+#define SWAPGS_UNSAFE_STACK \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ swapgs)
+
+#define SWAPGS \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ PV_SAVE_REGS; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
+ PV_RESTORE_REGS \
+ )
+
+#define GET_CR2_INTO_RCX \
+ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \
+ movq %rax, %rcx; \
+ xorq %rax, %rax;
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
+ CLBR_NONE, \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
+
+#define USERGS_SYSRET64 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#define ENABLE_INTERRUPTS_SYSEXIT32 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
+ CLBR_NONE, \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+#endif /* CONFIG_X86_32 */
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
new file mode 100644
index 000000000000..3c4ffeb467e9
--- /dev/null
+++ b/arch/x86/include/asm/parport.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_PARPORT_H
+#define _ASM_X86_PARPORT_H
+
+static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma);
+static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma)
+{
+ return parport_pc_find_isa_ports(autoirq, autodma);
+}
+
+#endif /* _ASM_X86_PARPORT_H */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
new file mode 100644
index 000000000000..b8493b3b9890
--- /dev/null
+++ b/arch/x86/include/asm/pat.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_PAT_H
+#define _ASM_X86_PAT_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_PAT
+extern int pat_enabled;
+extern void validate_pat_support(struct cpuinfo_x86 *c);
+#else
+static const int pat_enabled;
+static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
+#endif
+
+extern void pat_init(void);
+
+extern int reserve_memtype(u64 start, u64 end,
+ unsigned long req_type, unsigned long *ret_type);
+extern int free_memtype(u64 start, u64 end);
+
+extern void pat_disable(char *reason);
+
+#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
new file mode 100644
index 000000000000..b1e7a45d868a
--- /dev/null
+++ b/arch/x86/include/asm/pci-direct.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_PCI_DIRECT_H
+#define _ASM_X86_PCI_DIRECT_H
+
+#include <linux/types.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+ the PCI subsystem works. */
+
+extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
+extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
+extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
+extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
+extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
+extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
+
+extern int early_pci_allowed(void);
+
+extern unsigned int pci_early_dump_regs;
+extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
+extern void early_dump_pci_devices(void);
+#endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
new file mode 100644
index 000000000000..66834c41c049
--- /dev/null
+++ b/arch/x86/include/asm/pci.h
@@ -0,0 +1,118 @@
+#ifndef _ASM_X86_PCI_H
+#define _ASM_X86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+#ifdef CONFIG_X86_64
+ void *iommu; /* IOMMU private data */
+#endif
+};
+
+extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+ int node);
+extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+ return pci_domain_nr(bus);
+}
+
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+#ifdef CONFIG_PCI
+extern unsigned int pcibios_assign_all_busses(void);
+#else
+#define pcibios_assign_all_busses() 0
+#endif
+#define pcibios_scan_all_fns(a, b) 0
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+void pcibios_config_init(void);
+struct pci_bus *pcibios_scan_root(int bus);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq, int active);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+ enum pci_mmap_state mmap_state,
+ int write_combine);
+
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+ enum pci_dma_burst_strategy *strat,
+ unsigned long *strategy_parameter)
+{
+ *strat = PCI_DMA_BURST_INFINITY;
+ *strategy_parameter = ~0UL;
+}
+#else
+static inline void early_quirks(void) { }
+#endif
+
+extern void pci_iommu_alloc(void);
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_32
+# include "pci_32.h"
+#else
+# include "pci_64.h"
+#endif
+
+/* implement the pci_ DMA API in terms of the generic device dma_ one */
+#include <asm-generic/pci-dma-compat.h>
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->node;
+}
+
+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
new file mode 100644
index 000000000000..6f1213a6ef4f
--- /dev/null
+++ b/arch/x86/include/asm/pci_32.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PCI_32_H
+#define _ASM_X86_PCI_32_H
+
+
+#ifdef __KERNEL__
+
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+struct pci_dev;
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS (1)
+
+/* pci_unmap_{page,single} is a nop so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
+#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
+#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ do { break; } while (pci_unmap_len(PTR, LEN_NAME))
+
+
+#endif /* __KERNEL__ */
+
+
+#endif /* _ASM_X86_PCI_32_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 000000000000..4da207982777
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,51 @@
+#ifndef _ASM_X86_PCI_64_H
+#define _ASM_X86_PCI_64_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CALGARY_IOMMU
+static inline void *pci_iommu(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->iommu;
+}
+
+static inline void set_pci_iommu(struct pci_bus *bus, void *val)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ sd->iommu = val;
+}
+#endif /* CONFIG_CALGARY_IOMMU */
+
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 value);
+
+extern void dma32_reserve_bootmem(void);
+
+/* The PCI address space does equal the physical memory
+ * address space. The networking and block device layers use
+ * this boolean for bounce buffer decisions
+ *
+ * On AMD64 it mostly equals, but we set it to zero if a hardware
+ * IOMMU (gart) of sotware IOMMU (swiotlb) is available.
+ */
+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
+ dma_addr_t ADDR_NAME;
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
+ __u32 LEN_NAME;
+#define pci_unmap_addr(PTR, ADDR_NAME) \
+ ((PTR)->ADDR_NAME)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
+ (((PTR)->ADDR_NAME) = (VAL))
+#define pci_unmap_len(PTR, LEN_NAME) \
+ ((PTR)->LEN_NAME)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
+ (((PTR)->LEN_NAME) = (VAL))
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
new file mode 100644
index 000000000000..3fea2fdb3302
--- /dev/null
+++ b/arch/x86/include/asm/pda.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_X86_PDA_H
+#define _ASM_X86_PDA_H
+
+#ifndef __ASSEMBLY__
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <asm/page.h>
+
+/* Per processor datastructure. %gs points to it while the kernel runs */
+struct x8664_pda {
+ struct task_struct *pcurrent; /* 0 Current process */
+ unsigned long data_offset; /* 8 Per cpu data offset from linker
+ address */
+ unsigned long kernelstack; /* 16 top of kernel stack for current */
+ unsigned long oldrsp; /* 24 user rsp for system call */
+ int irqcount; /* 32 Irq nesting counter. Starts -1 */
+ unsigned int cpunumber; /* 36 Logical CPU number */
+ unsigned long stack_canary; /* 40 stack canary value */
+ /* gcc-ABI: this canary MUST be at
+ offset 40!!! */
+ char *irqstackptr;
+ short nodenumber; /* number of current node (32k max) */
+ short in_bootmem; /* pda lives in bootmem */
+ unsigned int __softirq_pending;
+ unsigned int __nmi_count; /* number of NMI on this CPUs */
+ short mmu_state;
+ short isidle;
+ struct mm_struct *active_mm;
+ unsigned apic_timer_irqs;
+ unsigned irq0_irqs;
+ unsigned irq_resched_count;
+ unsigned irq_call_count;
+ unsigned irq_tlb_count;
+ unsigned irq_thermal_count;
+ unsigned irq_threshold_count;
+ unsigned irq_spurious_count;
+} ____cacheline_aligned_in_smp;
+
+extern struct x8664_pda **_cpu_pda;
+extern void pda_init(int);
+
+#define cpu_pda(i) (_cpu_pda[i])
+
+/*
+ * There is no fast way to get the base address of the PDA, all the accesses
+ * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
+ */
+extern void __bad_pda_field(void) __attribute__((noreturn));
+
+/*
+ * proxy_pda doesn't actually exist, but tell gcc it is accessed for
+ * all PDA accesses so it gets read/write dependencies right.
+ */
+extern struct x8664_pda _proxy_pda;
+
+#define pda_offset(field) offsetof(struct x8664_pda, field)
+
+#define pda_to_op(op, field, val) \
+do { \
+ typedef typeof(_proxy_pda.field) T__; \
+ if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
+ switch (sizeof(_proxy_pda.field)) { \
+ case 2: \
+ asm(op "w %1,%%gs:%c2" : \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i"(pda_offset(field))); \
+ break; \
+ case 4: \
+ asm(op "l %1,%%gs:%c2" : \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i" (pda_offset(field))); \
+ break; \
+ case 8: \
+ asm(op "q %1,%%gs:%c2": \
+ "+m" (_proxy_pda.field) : \
+ "ri" ((T__)val), \
+ "i"(pda_offset(field))); \
+ break; \
+ default: \
+ __bad_pda_field(); \
+ } \
+} while (0)
+
+#define pda_from_op(op, field) \
+({ \
+ typeof(_proxy_pda.field) ret__; \
+ switch (sizeof(_proxy_pda.field)) { \
+ case 2: \
+ asm(op "w %%gs:%c1,%0" : \
+ "=r" (ret__) : \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ case 4: \
+ asm(op "l %%gs:%c1,%0": \
+ "=r" (ret__): \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ case 8: \
+ asm(op "q %%gs:%c1,%0": \
+ "=r" (ret__) : \
+ "i" (pda_offset(field)), \
+ "m" (_proxy_pda.field)); \
+ break; \
+ default: \
+ __bad_pda_field(); \
+ } \
+ ret__; \
+})
+
+#define read_pda(field) pda_from_op("mov", field)
+#define write_pda(field, val) pda_to_op("mov", field, val)
+#define add_pda(field, val) pda_to_op("add", field, val)
+#define sub_pda(field, val) pda_to_op("sub", field, val)
+#define or_pda(field, val) pda_to_op("or", field, val)
+
+/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+#define test_and_clear_bit_pda(bit, field) \
+({ \
+ int old__; \
+ asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
+ : "=r" (old__), "+m" (_proxy_pda.field) \
+ : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
+ old__; \
+})
+
+#endif
+
+#define PDA_STACKOFFSET (5*8)
+
+#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
+
+#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
new file mode 100644
index 000000000000..ece72053ba63
--- /dev/null
+++ b/arch/x86/include/asm/percpu.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_PERCPU_H
+#define _ASM_X86_PERCPU_H
+
+#ifdef CONFIG_X86_64
+#include <linux/compiler.h>
+
+/* Same as asm-generic/percpu.h, except that we store the per cpu offset
+ in the PDA. Longer term the PDA and every per cpu variable
+ should be just put into a single section and referenced directly
+ from %gs */
+
+#ifdef CONFIG_SMP
+#include <asm/pda.h>
+
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+#define __my_cpu_offset read_pda(data_offset)
+
+#define per_cpu_offset(x) (__per_cpu_offset(x))
+
+#endif
+#include <asm-generic/percpu.h>
+
+DECLARE_PER_CPU(struct x8664_pda, pda);
+
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment. x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime. The single instruction is atomic with respect to
+ * preemption and interrupts, so we need to explicitly disable
+ * interrupts here to achieve the same effect. However, because it
+ * can be used from within interrupt-disable/enable, we can't actually
+ * disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp; \
+ preempt_disable(); \
+ __tmp = __get_cpu_var(var); \
+ preempt_enable(); \
+ __tmp; \
+ })
+
+#define x86_write_percpu(var, val) \
+ do { \
+ preempt_disable(); \
+ __get_cpu_var(var) = (val); \
+ preempt_enable(); \
+ } while(0)
+
+#else /* CONFIG_X86_64 */
+
+#ifdef __ASSEMBLY__
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ * var - variable name
+ * reg - 32bit register
+ *
+ * The resulting address is stored in the "reg" argument.
+ *
+ * Example:
+ * PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, reg) \
+ movl %fs:per_cpu__##this_cpu_off, reg; \
+ lea per_cpu__##var(reg), reg
+#define PER_CPU_VAR(var) %fs:per_cpu__##var
+#else /* ! SMP */
+#define PER_CPU(var, reg) \
+ movl $per_cpu__##var, reg
+#define PER_CPU_VAR(var) per_cpu__##var
+#endif /* SMP */
+
+#else /* ...!ASSEMBLY */
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ * var - variable name
+ * cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ * PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+
+#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+
+/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
+#define __percpu_seg "%%fs:"
+
+#else /* !SMP */
+
+#define __percpu_seg ""
+
+#endif /* SMP */
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
+/* For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though). */
+extern void __bad_percpu_size(void);
+
+#define percpu_to_op(op, var, val) \
+do { \
+ typedef typeof(var) T__; \
+ if (0) { \
+ T__ tmp__; \
+ tmp__ = (val); \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 2: \
+ asm(op "w %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ case 4: \
+ asm(op "l %1,"__percpu_seg"%0" \
+ : "+m" (var) \
+ : "ri" ((T__)val)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
+#define percpu_from_op(op, var) \
+({ \
+ typeof(var) ret__; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_seg"%1,%0" \
+ : "=r" (ret__) \
+ : "m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ ret__; \
+})
+
+#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
+#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
+#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
+#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
+#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#endif /* !__ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
+ * variables that are initialized and accessed before there are per_cpu
+ * areas allocated.
+ */
+
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue; \
+ __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
+ { [0 ... NR_CPUS-1] = _initvalue }; \
+ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
+ extern __typeof__(_type) _name##_early_map[]
+
+#define early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+#define early_per_cpu(_name, _cpu) \
+ (early_per_cpu_ptr(_name) ? \
+ early_per_cpu_ptr(_name)[_cpu] : \
+ per_cpu(_name, _cpu))
+
+#else /* !CONFIG_SMP */
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name)
+
+#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define early_per_cpu_ptr(_name) NULL
+/* no early_per_cpu_map() */
+
+#endif /* !CONFIG_SMP */
+
+#endif /* _ASM_X86_PERCPU_H */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
new file mode 100644
index 000000000000..cb7c151a8bff
--- /dev/null
+++ b/arch/x86/include/asm/pgalloc.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+#endif
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __free_page(pte);
+}
+
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pte(mm, pfn);
+ set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ free_page((unsigned long)pmd);
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level-defs.h
new file mode 100644
index 000000000000..d77db8990eaa
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level-defs.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * traditional i386 two-level paging structure:
+ */
+
+#define PGDIR_SHIFT 22
+#define PTRS_PER_PGD 1024
+
+/*
+ * the i386 is two-level, so we don't really have any
+ * PMD directory physically.
+ */
+
+#define PTRS_PER_PTE 1024
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
new file mode 100644
index 000000000000..e0d199fe1d83
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -0,0 +1,111 @@
+#ifndef _ASM_X86_PGTABLE_2LEVEL_H
+#define _ASM_X86_PGTABLE_2LEVEL_H
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified. Thus, the following
+ * hook is made available.
+ */
+static inline void native_set_pte(pte_t *ptep , pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_set_pte_present(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_pmd_clear(pmd_t *pmdp)
+{
+ native_set_pmd(pmdp, __pmd(0));
+}
+
+static inline void native_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *xp)
+{
+ *xp = native_make_pte(0);
+}
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+ return __pte(xchg(&xp->pte_low, 0));
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+#define pte_none(x) (!(x).pte_low)
+
+/*
+ * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
+ * split up the 29 bits of offset into this range:
+ */
+#define PTE_FILE_MAX_BITS 29
+#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
+#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
+#else
+#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
+#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
+#endif
+#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
+#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
+
+#define pte_to_pgoff(pte) \
+ ((((pte).pte_low >> PTE_FILE_SHIFT1) \
+ & ((1U << PTE_FILE_BITS1) - 1)) \
+ + ((((pte).pte_low >> PTE_FILE_SHIFT2) \
+ & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \
+ + (((pte).pte_low >> PTE_FILE_SHIFT3) \
+ << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
+
+#define pgoff_to_pte(off) \
+ ((pte_t) { .pte_low = \
+ (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
+ + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \
+ << PTE_FILE_SHIFT2) \
+ + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
+ << PTE_FILE_SHIFT3) \
+ + _PAGE_FILE })
+
+/* Encode and de-code a swap entry */
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level-defs.h
new file mode 100644
index 000000000000..62561367653c
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level-defs.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifdef CONFIG_PARAVIRT
+#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
+#else
+#define SHARED_KERNEL_PMD 1
+#endif
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
new file mode 100644
index 000000000000..447da43cddb3
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -0,0 +1,176 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_H
+#define _ASM_X86_PGTABLE_3LEVEL_H
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
+ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016Lx).\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016Lx).\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+static inline int pud_none(pud_t pud)
+{
+ return pud_val(pud) == 0;
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return pud_val(pud) & _PAGE_PRESENT;
+}
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+/*
+ * Since this is only called on user PTEs, and the page fault handler
+ * must handle the already racy situation of simultaneous page faults,
+ * we are justified in merely clearing the PTE present bit, followed
+ * by a set. The ordering here is important.
+ */
+static inline void native_set_pte_present(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+ set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ u32 *tmp = (u32 *)pmd;
+ *tmp = 0;
+ smp_wmb();
+ *(tmp + 1) = 0;
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ unsigned long pgd;
+
+ set_pud(pudp, __pud(0));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ *
+ * Make sure the pud entry we're updating is within the
+ * current pgd to avoid unnecessary TLB flushes.
+ */
+ pgd = read_cr3();
+ if (__pa(pudp) >= pgd && __pa(pudp) <
+ (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
+ write_cr3(pgd);
+}
+
+#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \
+ pmd_index(address))
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ res.pte_low = xchg(&ptep->pte_low, 0);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = 0;
+
+ return res;
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
+}
+
+static inline int pte_none(pte_t pte)
+{
+ return !pte.pte_low && !pte.pte_high;
+}
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) \
+ ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define PTE_FILE_MAX_BITS 32
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
new file mode 100644
index 000000000000..83e69f4a37f0
--- /dev/null
+++ b/arch/x86/include/asm/pgtable.h
@@ -0,0 +1,578 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#define FIRST_USER_ADDRESS 0
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
+#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
+#define _PAGE_CACHE_WB (0)
+#define _PAGE_CACHE_WC (_PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
+#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
+ * bits are combined, this will alow user to access the high address mapped
+ * VDSO in the presence of CONFIG_COMPAT_VDSO
+ */
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
+ : (prot))
+
+#ifndef __ASSEMBLY__
+
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_GLOBAL;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(pte_flags(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+ return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+
+static inline int pmd_large(pmd_t pte)
+{
+ return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_GLOBAL);
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return __pte(pte_val(pte) | _PAGE_SPECIAL);
+}
+
+extern pteval_t __supported_pte_mask;
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+ return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
+ pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pteval_t val = pte_val(pte);
+
+ /*
+ * Chop off the NX bit (if present), and add the NX portion of
+ * the newprot (if present):
+ */
+ val &= _PAGE_CHG_MASK;
+ val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
+
+ return __pte(val);
+}
+
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot);
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
+#ifndef __ASSEMBLY__
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
+#endif
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+#define set_pte(ptep, pte) native_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_present(mm, addr, ptep, pte) \
+ native_set_pte_present(mm, addr, ptep, pte)
+#define set_pte_atomic(ptep, pte) \
+ native_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) native_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) native_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud) native_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) native_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ native_pagetable_setup_done(base);
+}
+#endif /* CONFIG_PARAVIRT */
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
+#endif
+
+/*
+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
+#ifndef __ASSEMBLY__
+
+enum {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res = *ptep;
+
+ /* Pure native function needs no input for mm, addr */
+ native_pte_clear(NULL, 0, ptep);
+ return res;
+}
+
+static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep , pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces. It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush. The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+struct vm_area_struct;
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_t pte = native_ptep_get_and_clear(ptep);
+ pte_update(mm, addr, ptep);
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full)
+{
+ pte_t pte;
+ if (full) {
+ /*
+ * Full address destruction in progress; paravirt does not
+ * care about updates and native needs no locking
+ */
+ pte = native_local_ptep_get_and_clear(ptep);
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+ pte_update(mm, addr, ptep);
+}
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anwhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+
+#include <asm-generic/pgtable.h>
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
new file mode 100644
index 000000000000..72b020deb46b
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -0,0 +1,182 @@
+#ifndef _ASM_X86_PGTABLE_32_H
+#define _ASM_X86_PGTABLE_32_H
+
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <linux/threads.h>
+#include <asm/paravirt.h>
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mm_struct;
+struct vm_area_struct;
+
+extern pgd_t swapper_pg_dir[1024];
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+
+extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
+
+/*
+ * The Linux x86 paging architecture is 'compile-time dual-mode', it
+ * implements both the traditional 2-level x86 page tables and the
+ * newer 3-level PAE-mode page tables.
+ */
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level-defs.h>
+# define PMD_SIZE (1UL << PMD_SHIFT)
+# define PMD_MASK (~(PMD_SIZE - 1))
+#else
+# include <asm/pgtable-2level-defs.h>
+#endif
+
+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
+ & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+#endif
+
+#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without a 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+/* The boot page tables (all created as a single array) */
+extern unsigned long pg0[];
+
+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x) (!(unsigned long)pmd_val((x)))
+#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
+#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+
+static inline int pud_large(pud_t pud) { return 0; }
+
+/*
+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+#define pmd_index(address) \
+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
+/*
+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this macro returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+#define pte_index(address) \
+ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) \
+ ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
+
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_page_vaddr(pmd) \
+ ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
+ pte_index((address)))
+#define pte_offset_map_nested(dir, address) \
+ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
+#define pte_unmap(pte) do { } while (0)
+#define pte_unmap_nested(pte) do { } while (0)
+#endif
+
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
+ pte_clear(&init_mm, (vaddr), (ptep)); \
+ __flush_tlb_one((vaddr)); \
+} while (0)
+
+/*
+ * The i386 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr) (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
new file mode 100644
index 000000000000..ba09289accaa
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -0,0 +1,291 @@
+#ifndef _ASM_X86_PGTABLE_64_H
+#define _ASM_X86_PGTABLE_64_H
+
+#include <linux/const.h>
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <asm/pda.h>
+
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
+extern pgd_t init_level4_pgt[];
+
+#define swapper_pg_dir init_level4_pgt
+
+extern void paging_init(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#ifndef __ASSEMBLY__
+
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pte_val(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pud_ERROR(e) \
+ printk("%s:%d: bad pud %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pud_val(e))
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016lx).\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+#define pgd_none(x) (!pgd_val(x))
+#define pud_none(x) (!pud_val(x))
+
+struct mm_struct;
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ *ptep = native_make_pte(0);
+}
+
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pte(xchg(&xp->pte, 0));
+#else
+ /* native_local_ptep_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pte_t ret = *xp;
+ native_pte_clear(NULL, 0, xp);
+ return ret;
+#endif
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+ *pudp = pud;
+}
+
+static inline void native_pud_clear(pud_t *pud)
+{
+ native_set_pud(pud, native_make_pud(0));
+}
+
+static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ *pgdp = pgd;
+}
+
+static inline void native_pgd_clear(pgd_t *pgd)
+{
+ native_set_pgd(pgd, native_make_pgd(0));
+}
+
+#define pte_same(a, b) ((a).pte == (b).pte)
+
+#endif /* !__ASSEMBLY__ */
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_START _AC(0xffffc20000000000, UL)
+#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#ifndef __ASSEMBLY__
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
+}
+
+#define pte_none(x) (!pte_val((x)))
+#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
+
+#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+#define pgd_page_vaddr(pgd) \
+ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
+#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
+#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
+static inline int pgd_large(pgd_t pgd) { return 0; }
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+/* to find an entry in a page-table-directory. */
+#define pud_page_vaddr(pud) \
+ ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
+#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pud_offset(pgd, address) \
+ ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
+#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
+
+static inline int pud_large(pud_t pte)
+{
+ return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+/* PMD - Level 2 access */
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
+#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
+
+#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
+ pmd_index(address))
+#define pmd_none(x) (!pmd_val((x)))
+#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
+#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
+#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
+ _PAGE_FILE })
+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+/* PTE - Level 1 access. */
+
+/* page, protection -> pte */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
+
+#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
+ pte_index((address)))
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) /* NOP */
+#define pte_unmap_nested(pte) /* NOP */
+
+#define update_mmu_cache(vma, address, pte) do { } while (0)
+
+extern int direct_gbpages;
+
+/* Encode and de-code a swap entry */
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+extern int kern_addr_valid(unsigned long addr);
+extern void cleanup_highmap(void);
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init() do { } while (0)
+#define check_pgt_cache() do { } while (0)
+
+#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define kc_offset_to_vaddr(o) \
+ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
+ ? ((o) | ~__VIRTUAL_MASK) \
+ : (o))
+
+#define __HAVE_ARCH_PTE_SAME
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h
new file mode 100644
index 000000000000..c98509d3149e
--- /dev/null
+++ b/arch/x86/include/asm/poll.h
@@ -0,0 +1 @@
+#include <asm-generic/poll.h>
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
new file mode 100644
index 000000000000..bb7133dc155d
--- /dev/null
+++ b/arch/x86/include/asm/posix_types.h
@@ -0,0 +1,13 @@
+#ifdef __KERNEL__
+# ifdef CONFIG_X86_32
+# include "posix_types_32.h"
+# else
+# include "posix_types_64.h"
+# endif
+#else
+# ifdef __i386__
+# include "posix_types_32.h"
+# else
+# include "posix_types_64.h"
+# endif
+#endif
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
new file mode 100644
index 000000000000..f7d9adf82e53
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -0,0 +1,85 @@
+#ifndef _ASM_X86_POSIX_TYPES_32_H
+#define _ASM_X86_POSIX_TYPES_32_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned long __kernel_ino_t;
+typedef unsigned short __kernel_mode_t;
+typedef unsigned short __kernel_nlink_t;
+typedef long __kernel_off_t;
+typedef int __kernel_pid_t;
+typedef unsigned short __kernel_ipc_pid_t;
+typedef unsigned short __kernel_uid_t;
+typedef unsigned short __kernel_gid_t;
+typedef unsigned int __kernel_size_t;
+typedef int __kernel_ssize_t;
+typedef int __kernel_ptrdiff_t;
+typedef long __kernel_time_t;
+typedef long __kernel_suseconds_t;
+typedef long __kernel_clock_t;
+typedef int __kernel_timer_t;
+typedef int __kernel_clockid_t;
+typedef int __kernel_daddr_t;
+typedef char * __kernel_caddr_t;
+typedef unsigned short __kernel_uid16_t;
+typedef unsigned short __kernel_gid16_t;
+typedef unsigned int __kernel_uid32_t;
+typedef unsigned int __kernel_gid32_t;
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+typedef unsigned short __kernel_old_dev_t;
+
+#ifdef __GNUC__
+typedef long long __kernel_loff_t;
+#endif
+
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+
+#if defined(__KERNEL__)
+
+#undef __FD_SET
+#define __FD_SET(fd,fdsetp) \
+ asm volatile("btsl %1,%0": \
+ "+m" (*(__kernel_fd_set *)(fdsetp)) \
+ : "r" ((int)(fd)))
+
+#undef __FD_CLR
+#define __FD_CLR(fd,fdsetp) \
+ asm volatile("btrl %1,%0": \
+ "+m" (*(__kernel_fd_set *)(fdsetp)) \
+ : "r" ((int) (fd)))
+
+#undef __FD_ISSET
+#define __FD_ISSET(fd,fdsetp) \
+ (__extension__ \
+ ({ \
+ unsigned char __result; \
+ asm volatile("btl %1,%2 ; setb %0" \
+ : "=q" (__result) \
+ : "r" ((int)(fd)), \
+ "m" (*(__kernel_fd_set *)(fdsetp))); \
+ __result; \
+}))
+
+#undef __FD_ZERO
+#define __FD_ZERO(fdsetp) \
+do { \
+ int __d0, __d1; \
+ asm volatile("cld ; rep ; stosl" \
+ : "=m" (*(__kernel_fd_set *)(fdsetp)), \
+ "=&c" (__d0), "=&D" (__d1) \
+ : "a" (0), "1" (__FDSET_LONGS), \
+ "2" ((__kernel_fd_set *)(fdsetp)) \
+ : "memory"); \
+} while (0)
+
+#endif /* defined(__KERNEL__) */
+
+#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
new file mode 100644
index 000000000000..eb8d2d92b63e
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_64.h
@@ -0,0 +1,119 @@
+#ifndef _ASM_X86_POSIX_TYPES_64_H
+#define _ASM_X86_POSIX_TYPES_64_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned long __kernel_ino_t;
+typedef unsigned int __kernel_mode_t;
+typedef unsigned long __kernel_nlink_t;
+typedef long __kernel_off_t;
+typedef int __kernel_pid_t;
+typedef int __kernel_ipc_pid_t;
+typedef unsigned int __kernel_uid_t;
+typedef unsigned int __kernel_gid_t;
+typedef unsigned long __kernel_size_t;
+typedef long __kernel_ssize_t;
+typedef long __kernel_ptrdiff_t;
+typedef long __kernel_time_t;
+typedef long __kernel_suseconds_t;
+typedef long __kernel_clock_t;
+typedef int __kernel_timer_t;
+typedef int __kernel_clockid_t;
+typedef int __kernel_daddr_t;
+typedef char * __kernel_caddr_t;
+typedef unsigned short __kernel_uid16_t;
+typedef unsigned short __kernel_gid16_t;
+
+#ifdef __GNUC__
+typedef long long __kernel_loff_t;
+#endif
+
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+typedef __kernel_uid_t __kernel_uid32_t;
+typedef __kernel_gid_t __kernel_gid32_t;
+
+typedef unsigned long __kernel_old_dev_t;
+
+#ifdef __KERNEL__
+
+#undef __FD_SET
+static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
+}
+
+#undef __FD_CLR
+static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
+}
+
+#undef __FD_ISSET
+static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
+{
+ unsigned long _tmp = fd / __NFDBITS;
+ unsigned long _rem = fd % __NFDBITS;
+ return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
+}
+
+/*
+ * This will unroll the loop for the normal constant cases (8 or 32 longs,
+ * for 256 and 1024-bit fd_sets respectively)
+ */
+#undef __FD_ZERO
+static inline void __FD_ZERO(__kernel_fd_set *p)
+{
+ unsigned long *tmp = p->fds_bits;
+ int i;
+
+ if (__builtin_constant_p(__FDSET_LONGS)) {
+ switch (__FDSET_LONGS) {
+ case 32:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
+ tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
+ tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
+ tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
+ tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
+ tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
+ return;
+ case 16:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
+ tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
+ return;
+ case 8:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
+ return;
+ case 4:
+ tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
+ return;
+ }
+ }
+ i = __FDSET_LONGS;
+ while (i) {
+ i--;
+ *tmp = 0;
+ tmp++;
+ }
+}
+
+#endif /* defined(__KERNEL__) */
+
+#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
new file mode 100644
index 000000000000..a8894647dd9a
--- /dev/null
+++ b/arch/x86/include/asm/prctl.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_PRCTL_H
+#define _ASM_X86_PRCTL_H
+
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
+
+#ifdef CONFIG_X86_64
+extern long sys_arch_prctl(int, unsigned long);
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
new file mode 100644
index 000000000000..1198f2a0e42c
--- /dev/null
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -0,0 +1,38 @@
+/*
+ * NSC/Cyrix CPU indexed register access. Must be inlined instead of
+ * macros to ensure correct access ordering
+ * Access order is always 0x22 (=offset), 0x23 (=value)
+ *
+ * When using the old macros a line like
+ * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ * gets expanded to:
+ * do {
+ * outb((CX86_CCR2), 0x22);
+ * outb((({
+ * outb((CX86_CCR2), 0x22);
+ * inb(0x23);
+ * }) | 0x88), 0x23);
+ * } while (0);
+ *
+ * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
+ */
+
+static inline u8 getCx86(u8 reg)
+{
+ outb(reg, 0x22);
+ return inb(0x23);
+}
+
+static inline void setCx86(u8 reg, u8 data)
+{
+ outb(reg, 0x22);
+ outb(data, 0x23);
+}
+
+#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
+
+#define setCx86_old(reg, data) do { \
+ outb((reg), 0x22); \
+ outb((data), 0x23); \
+} while (0)
+
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 000000000000..7a3e836eb2a9
--- /dev/null
+++ b/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PROCESSOR_FLAGS_H
+#define _ASM_X86_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
+#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE 0x00000001 /* Protection Enable */
+#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */
+#define X86_CR0_EM 0x00000004 /* Emulation */
+#define X86_CR0_TS 0x00000008 /* Task Switched */
+#define X86_CR0_ET 0x00000010 /* Extension Type */
+#define X86_CR0_NE 0x00000020 /* Numeric Error */
+#define X86_CR0_WP 0x00010000 /* Write Protect */
+#define X86_CR0_AM 0x00040000 /* Alignment Mask */
+#define X86_CR0_NW 0x20000000 /* Not Write-through */
+#define X86_CR0_CD 0x40000000 /* Cache Disable */
+#define X86_CR0_PG 0x80000000 /* Paging */
+
+/*
+ * Paging options in CR3
+ */
+#define X86_CR3_PWT 0x00000008 /* Page Write Through */
+#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
+#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
+#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
+#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
+#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
+#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
+#define X86_CR4_MCE 0x00000040 /* Machine check enable */
+#define X86_CR4_PGE 0x00000080 /* enable global pages */
+#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
+#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
+#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
+
+/*
+ * x86-64 Task Priority Register, CR8
+ */
+#define X86_CR8_TPR 0x0000000F /* task priority register */
+
+/*
+ * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ */
+
+/*
+ * NSC/Cyrix CPU configuration register indexes
+ */
+#define CX86_PCR0 0x20
+#define CX86_GCR 0xb8
+#define CX86_CCR0 0xc0
+#define CX86_CCR1 0xc1
+#define CX86_CCR2 0xc2
+#define CX86_CCR3 0xc3
+#define CX86_CCR4 0xe8
+#define CX86_CCR5 0xe9
+#define CX86_CCR6 0xea
+#define CX86_CCR7 0xeb
+#define CX86_PCR1 0xf0
+#define CX86_DIR0 0xfe
+#define CX86_DIR1 0xff
+#define CX86_ARR_BASE 0xc4
+#define CX86_RCR_BASE 0xdc
+
+#ifdef __KERNEL__
+#ifdef CONFIG_VM86
+#define X86_VM_MASK X86_EFLAGS_VM
+#else
+#define X86_VM_MASK 0 /* No VM86 support */
+#endif
+#endif
+
+#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
new file mode 100644
index 000000000000..091cd8855f2e
--- /dev/null
+++ b/arch/x86/include/asm/processor.h
@@ -0,0 +1,953 @@
+#ifndef _ASM_X86_PROCESSOR_H
+#define _ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <asm/ds.h>
+
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+ void *pc;
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+ return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN 16
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head.S, so think twice
+ * before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_mask;
+#ifdef CONFIG_X86_32
+ char wp_works_ok; /* It doesn't on 386's */
+
+ /* Problems on some 486Dx4's and old 386's: */
+ char hlt_works_ok;
+ char hard_math;
+ char rfu;
+ char fdiv_bug;
+ char f00f_bug;
+ char coma_bug;
+ char pad0;
+#else
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+#endif
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+ /* cpus sharing the last level cache: */
+ cpumask_t llc_shared_map;
+#endif
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
+#ifdef CONFIG_SMP
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+#endif
+ unsigned int x86_hyper_vendor;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
+
+#define X86_HYPER_VENDOR_NONE 0
+#define X86_HYPER_VENDOR_VMWARE 1
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern struct tss_struct doublefault_tss;
+extern __u32 cleared_cpu_caps[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+#define current_cpu_data __get_cpu_var(cpu_info)
+#else
+#define cpu_data(cpu) boot_cpu_data
+#define current_cpu_data boot_cpu_data
+#endif
+
+extern const struct seq_operations cpuinfo_op;
+
+static inline int hlt_works(int cpu)
+{
+#ifdef CONFIG_X86_32
+ return cpu_data(cpu).hlt_works_ok;
+#else
+ return 1;
+#endif
+}
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+extern struct pt_regs *idle_regs(struct pt_regs *);
+
+extern void early_cpu_init(void);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+extern void detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+ write_cr3(__pa(pgdir));
+}
+
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+ /* ss1 caches MSR_IA32_SYSENTER_CS: */
+ unsigned short ss1, __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
+} __attribute__((packed));
+#else
+struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
+ u64 sp1;
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+
+/*
+ * IO-bitmap sizes:
+ */
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+
+struct tss_struct {
+ /*
+ * The hardware state:
+ */
+ struct x86_hw_tss x86_tss;
+
+ /*
+ * The extra 1 is there because the CPU will access an
+ * additional byte beyond the end of the IO permission
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+ /*
+ * Cache the current maximum and the last task that used the bitmap:
+ */
+ unsigned long io_bitmap_max;
+ struct thread_struct *io_bitmap_owner;
+
+ /*
+ * .. and then another 0x100 bytes for the emergency kernel stack:
+ */
+ unsigned long stack[64];
+
+} ____cacheline_aligned;
+
+DECLARE_PER_CPU(struct tss_struct, init_tss);
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+ unsigned long ist[7];
+};
+
+#define MXCSR_DEFAULT 0x1f80
+
+struct i387_fsave_struct {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE ]: */
+ u32 status;
+};
+
+struct i387_fxsave_struct {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct info *info;
+ u32 entry_eip;
+};
+
+struct xsave_hdr_struct {
+ u64 xstate_bv;
+ u64 reserved1[2];
+ u64 reserved2[5];
+} __attribute__((packed));
+
+struct xsave_struct {
+ struct i387_fxsave_struct i387;
+ struct xsave_hdr_struct xsave_hdr;
+ /* new processor state extensions will go here */
+} __attribute__ ((packed, aligned (64)));
+
+union thread_xstate {
+ struct i387_fsave_struct fsave;
+ struct i387_fxsave_struct fxsave;
+ struct i387_soft_struct soft;
+ struct xsave_struct xsave;
+};
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long sp0;
+ unsigned long sp;
+#ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+#else
+ unsigned long usersp; /* Copy from PDA */
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+ unsigned long ip;
+ unsigned long fs;
+ unsigned long gs;
+ /* Hardware debugging registers: */
+ unsigned long debugreg0;
+ unsigned long debugreg1;
+ unsigned long debugreg2;
+ unsigned long debugreg3;
+ unsigned long debugreg6;
+ unsigned long debugreg7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_no;
+ unsigned long error_code;
+ /* floating point and extended processor state */
+ union thread_xstate *xstate;
+#ifdef CONFIG_X86_32
+ /* Virtual 86 mode info */
+ struct vm86_struct __user *vm86_info;
+ unsigned long screen_bitmap;
+ unsigned long v86flags;
+ unsigned long v86mask;
+ unsigned long saved_sp0;
+ unsigned int saved_fs;
+ unsigned int saved_gs;
+#endif
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
+ unsigned long debugctlmsr;
+#ifdef CONFIG_X86_DS
+/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+ struct ds_context *ds_ctx;
+#endif /* CONFIG_X86_DS */
+#ifdef CONFIG_X86_PTRACE_BTS
+/* the signal to send on a bts buffer overflow */
+ unsigned int bts_ovfl_signal;
+#endif /* CONFIG_X86_PTRACE_BTS */
+};
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("mov %%db0, %0" :"=r" (val));
+ break;
+ case 1:
+ asm("mov %%db1, %0" :"=r" (val));
+ break;
+ case 2:
+ asm("mov %%db2, %0" :"=r" (val));
+ break;
+ case 3:
+ asm("mov %%db3, %0" :"=r" (val));
+ break;
+ case 6:
+ asm("mov %%db6, %0" :"=r" (val));
+ break;
+ case 7:
+ asm("mov %%db7, %0" :"=r" (val));
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("mov %0, %%db0" ::"r" (value));
+ break;
+ case 1:
+ asm("mov %0, %%db1" ::"r" (value));
+ break;
+ case 2:
+ asm("mov %0, %%db2" ::"r" (value));
+ break;
+ case 3:
+ asm("mov %0, %%db3" ::"r" (value));
+ break;
+ case 6:
+ asm("mov %0, %%db6" ::"r" (value));
+ break;
+ case 7:
+ asm("mov %0, %%db7" ::"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void native_set_iopl_mask(unsigned mask)
+{
+#ifdef CONFIG_X86_32
+ unsigned int reg;
+
+ asm volatile ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+#endif
+}
+
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+ tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+#endif
+}
+
+static inline void native_swapgs(void)
+{
+#ifdef CONFIG_X86_64
+ asm volatile("swapgs" ::: "memory");
+#endif
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+#define paravirt_enabled() 0
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = native_get_debugreg(register)
+#define set_debugreg(value, register) \
+ native_set_debugreg(register, value)
+
+static inline void load_sp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ native_load_sp0(tss, thread);
+}
+
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+
+static inline void set_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features |= mask;
+ cr4 = read_cr4();
+ cr4 |= mask;
+ write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+ unsigned cr4;
+
+ mmu_cr4_features &= ~mask;
+ cr4 = read_cr4();
+ cr4 &= ~mask;
+ write_cr4(cr4);
+}
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy state */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory");
+}
+
+static inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/* Stop speculative execution: */
+static inline void sync_core(void)
+{
+ int tmp;
+
+ asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+ : "ebx", "ecx", "edx", "memory");
+}
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+extern unsigned long boot_option_idle_override;
+extern unsigned long idle_halt;
+extern unsigned long idle_nomwait;
+
+/*
+ * on systems with caches, caches must be flashed as the absolute
+ * last instruction before going into a suspended halt. Otherwise,
+ * dirty data can linger in the cache and become stale on resume,
+ * leading to strange errors.
+ *
+ * perform a variety of operations to guarantee that the compiler
+ * will not reorder instructions. wbinvd itself is serializing
+ * so the processor will not reorder.
+ *
+ * Systems without cache can just go into halt.
+ */
+static inline void wbinvd_halt(void)
+{
+ mb();
+ /* check for clflush to determine if wbinvd is legal */
+ if (cpu_has_clflush)
+ asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
+ else
+ while (1)
+ halt();
+}
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(void);
+extern void cpu_init(void);
+extern void init_gdt(int cpu);
+
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+/*
+ * from system description table in BIOS. Mostly for MCA use, but
+ * others may find it useful:
+ */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
+
+extern char ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH ASM_NOP4
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH "prefetcht0 (%1)"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth to care about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ */
+static inline void prefetch(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchnta (%1)",
+ X86_FEATURE_XMM,
+ "r" (x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
+static inline void prefetchw(const void *x)
+{
+ alternative_input(BASE_PREFETCH,
+ "prefetchw (%1)",
+ X86_FEATURE_3DNOW,
+ "r" (x));
+}
+
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE PAGE_OFFSET
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .vm86_info = NULL, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+ .fs = __KERNEL_PERCPU, \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS { \
+ .x86_tss = { \
+ .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .ss0 = __KERNEL_DS, \
+ .ss1 = __KERNEL_CS, \
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
+ }, \
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
+}
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info) \
+({ \
+ unsigned long *__ptr = (unsigned long *)(info); \
+ (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessable even if the CPU haven't stored the SS/ESP registers
+ * on the stack (interrupt gate does not save these registers
+ * when switching to the same priv ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain the
+ * completely wrong values.
+ */
+#define task_pt_regs(task) \
+({ \
+ struct pt_regs *__regs__; \
+ __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+ __regs__ - 1; \
+})
+
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+#else
+/*
+ * User space process size. 47bits minus one guard page.
+ */
+#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE64)
+
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX TASK_SIZE64
+
+#define INIT_THREAD { \
+ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS { \
+ .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
new file mode 100644
index 000000000000..d6a22f92ba77
--- /dev/null
+++ b/arch/x86/include/asm/proto.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_X86_PROTO_H
+#define _ASM_X86_PROTO_H
+
+#include <asm/ldt.h>
+
+/* misc architecture specific prototypes */
+
+extern void early_idt_handler(void);
+
+extern void system_call(void);
+extern void syscall_init(void);
+
+extern void ia32_syscall(void);
+extern void ia32_cstar_target(void);
+extern void ia32_sysenter_target(void);
+
+extern void syscall32_cpu_init(void);
+
+extern void check_efer(void);
+
+#ifdef CONFIG_X86_BIOS_REBOOT
+extern int reboot_force;
+#else
+static const int reboot_force = 0;
+#endif
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+
+#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))
+#define round_down(x, y) ((x) & ~((y) - 1))
+
+#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
new file mode 100644
index 000000000000..25f1bb8fc626
--- /dev/null
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -0,0 +1,145 @@
+#ifndef _ASM_X86_PTRACE_ABI_H
+#define _ASM_X86_PTRACE_ABI_H
+
+#ifdef __i386__
+
+#define EBX 0
+#define ECX 1
+#define EDX 2
+#define ESI 3
+#define EDI 4
+#define EBP 5
+#define EAX 6
+#define DS 7
+#define ES 8
+#define FS 9
+#define GS 10
+#define ORIG_EAX 11
+#define EIP 12
+#define CS 13
+#define EFL 14
+#define UESP 15
+#define SS 16
+#define FRAME_SIZE 17
+
+#else /* __i386__ */
+
+#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+/* arguments: interrupts/non tracing syscalls only save upto here*/
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+#define ORIG_RAX 120 /* = ERROR */
+/* end of arguments */
+/* cpu exception frame or undefined in case of fast syscall. */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+#define ARGOFFSET R11
+#endif /* __ASSEMBLY__ */
+
+/* top of stack page */
+#define FRAME_SIZE 168
+
+#endif /* !__i386__ */
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS 12
+#define PTRACE_SETREGS 13
+#define PTRACE_GETFPREGS 14
+#define PTRACE_SETFPREGS 15
+#define PTRACE_GETFPXREGS 18
+#define PTRACE_SETFPXREGS 19
+
+#define PTRACE_OLDSETOPTIONS 21
+
+/* only useful for access 32bit programs / kernels */
+#define PTRACE_GET_THREAD_AREA 25
+#define PTRACE_SET_THREAD_AREA 26
+
+#ifdef __x86_64__
+# define PTRACE_ARCH_PRCTL 30
+#endif
+
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
+
+#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
+
+#ifdef CONFIG_X86_PTRACE_BTS
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+
+/* configuration/status structure used in PTRACE_BTS_CONFIG and
+ PTRACE_BTS_STATUS commands.
+*/
+struct ptrace_bts_config {
+ /* requested or actual size of BTS buffer in bytes */
+ __u32 size;
+ /* bitmask of below flags */
+ __u32 flags;
+ /* buffer overflow signal */
+ __u32 signal;
+ /* actual size of bts_struct in bytes */
+ __u32 bts_size;
+};
+#endif /* __ASSEMBLY__ */
+
+#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
+#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
+#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
+ instead of wrapping around */
+#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
+
+#define PTRACE_BTS_CONFIG 40
+/* Configure branch trace recording.
+ ADDR points to a struct ptrace_bts_config.
+ DATA gives the size of that buffer.
+ A new buffer is allocated, if requested in the flags.
+ An overflow signal may only be requested for new buffers.
+ Returns the number of bytes read.
+*/
+#define PTRACE_BTS_STATUS 41
+/* Return the current configuration in a struct ptrace_bts_config
+ pointed to by ADDR; DATA gives the size of that buffer.
+ Returns the number of bytes written.
+*/
+#define PTRACE_BTS_SIZE 42
+/* Return the number of available BTS records for draining.
+ DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_GET 43
+/* Get a single BTS record.
+ DATA defines the index into the BTS array, where 0 is the newest
+ entry, and higher indices refer to older entries.
+ ADDR is pointing to struct bts_struct (see asm/ds.h).
+*/
+#define PTRACE_BTS_CLEAR 44
+/* Clear the BTS buffer.
+ DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_DRAIN 45
+/* Read all available BTS records and clear the buffer.
+ ADDR points to an array of struct bts_struct.
+ DATA gives the size of that buffer.
+ BTS records are read from oldest to newest.
+ Returns number of BTS records drained.
+*/
+#endif /* CONFIG_X86_PTRACE_BTS */
+
+#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 000000000000..6d34d954c228
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,249 @@
+#ifndef _ASM_X86_PTRACE_H
+#define _ASM_X86_PTRACE_H
+
+#include <linux/compiler.h> /* For __user */
+#include <asm/ptrace-abi.h>
+#include <asm/processor-flags.h>
+
+#ifdef __KERNEL__
+#include <asm/segment.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+#ifdef __i386__
+/* this struct defines the way the registers are stored on the
+ stack during a system call. */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ int xds;
+ int xes;
+ int xfs;
+ /* int gs; */
+ long orig_eax;
+ long eip;
+ int xcs;
+ long eflags;
+ long esp;
+ int xss;
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ /* int gs; */
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+#endif /* __KERNEL__ */
+
+#else /* __i386__ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long rbp;
+ unsigned long rbx;
+/* arguments: non interrupts/non tracing syscalls only save upto here*/
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rdx;
+ unsigned long rsi;
+ unsigned long rdi;
+ unsigned long orig_rax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+ unsigned long rip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long rsp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#else /* __KERNEL__ */
+
+struct pt_regs {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+/* arguments: non interrupts/non tracing syscalls only save upto here*/
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
+
+#ifdef __KERNEL__
+
+#include <linux/init.h>
+
+struct cpuinfo_x86;
+struct task_struct;
+
+extern unsigned long profile_pc(struct pt_regs *regs);
+
+extern unsigned long
+convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code);
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+extern long syscall_trace_enter(struct pt_regs *);
+extern void syscall_trace_leave(struct pt_regs *);
+
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+/*
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value. This tricky test checks that with
+ * one comparison. Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
+static inline int user_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+#else
+ return !!(regs->cs & 3);
+#endif
+}
+
+static inline int user_mode_vm(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
+ USER_RPL;
+#else
+ return user_mode(regs);
+#endif
+}
+
+static inline int v8086_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (regs->flags & X86_VM_MASK);
+#else
+ return 0; /* No V86 mode support in long mode */
+#endif
+}
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. So regs will be the current sp.
+ *
+ * This is valid only for kernel mode traps.
+ */
+static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (unsigned long)regs;
+#else
+ return regs->sp;
+#endif
+}
+
+static inline unsigned long instruction_pointer(struct pt_regs *regs)
+{
+ return regs->ip;
+}
+
+static inline unsigned long frame_pointer(struct pt_regs *regs)
+{
+ return regs->bp;
+}
+
+static inline unsigned long user_stack_pointer(struct pt_regs *regs)
+{
+ return regs->sp;
+}
+
+/*
+ * These are defined as per linux/ptrace.h, which see.
+ */
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
+extern void user_enable_block_step(struct task_struct *);
+#ifdef CONFIG_X86_DEBUGCTLMSR
+#define arch_has_block_step() (1)
+#else
+#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
+#endif
+
+struct user_desc;
+extern int do_get_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info);
+extern int do_set_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info, int can_allocate);
+
+extern void x86_ptrace_untrace(struct task_struct *);
+extern void x86_ptrace_fork(struct task_struct *child,
+ unsigned long clone_flags);
+
+#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
+#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+
+#endif /* __KERNEL__ */
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 000000000000..6d93508f2626
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time. There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done. Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 000000000000..53235fd5f8ce
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+ struct pvclock_vcpu_time_info *vcpu,
+ struct timespec *ts);
+
+#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 000000000000..562d4fd31ba8
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_REBOOT_H
+#define _ASM_X86_REBOOT_H
+
+#include <linux/kdebug.h>
+
+struct pt_regs;
+
+struct machine_ops {
+ void (*restart)(char *cmd);
+ void (*halt)(void);
+ void (*power_off)(void);
+ void (*shutdown)(void);
+ void (*crash_shutdown)(struct pt_regs *);
+ void (*emergency_restart)(void);
+};
+
+extern struct machine_ops machine_ops;
+
+void native_machine_crash_shutdown(struct pt_regs *regs);
+void native_machine_shutdown(void);
+void machine_real_restart(const unsigned char *code, int length);
+
+typedef void (*nmi_shootdown_cb)(int, struct die_args*);
+void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+
+#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 000000000000..765debe4c54c
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_REBOOT_FIXUPS_H
+#define _ASM_X86_REBOOT_FIXUPS_H
+
+extern void mach_reboot_fixups(void);
+
+#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 000000000000..d5cd6c586881
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,82 @@
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#define _ASM_X86_REQUIRED_FEATURES_H
+
+/* Define minimum CPUID feature set for kernel These bits are checked
+ really early to actually display a visible error message before the
+ kernel dies. Make sure to assign features to the proper mask!
+
+ Some requirements that are not in CPUID yet are also in the
+ CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
+
+ The real information is in arch/x86/Kconfig.cpu, this just converts
+ the CONFIGs into a bitmask */
+
+#ifndef CONFIG_MATH_EMULATION
+# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
+#else
+# define NEED_FPU 0
+#endif
+
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
+#else
+# define NEED_PAE 0
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
+#else
+# define NEED_CX8 0
+#endif
+
+#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
+# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
+#else
+# define NEED_CMOV 0
+#endif
+
+#ifdef CONFIG_X86_USE_3DNOW
+# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
+#else
+# define NEED_3DNOW 0
+#endif
+
+#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
+# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
+#else
+# define NEED_NOPL 0
+#endif
+
+#ifdef CONFIG_X86_64
+#define NEED_PSE 0
+#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
+#define NEED_PGE (1<<(X86_FEATURE_PGE & 31))
+#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
+#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
+#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
+#define NEED_LM (1<<(X86_FEATURE_LM & 31))
+#else
+#define NEED_PSE 0
+#define NEED_MSR 0
+#define NEED_PGE 0
+#define NEED_FXSR 0
+#define NEED_XMM 0
+#define NEED_XMM2 0
+#define NEED_LM 0
+#endif
+
+#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
+ NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
+ NEED_XMM|NEED_XMM2)
+#define SSE_MASK (NEED_XMM|NEED_XMM2)
+
+#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
+
+#define REQUIRED_MASK2 0
+#define REQUIRED_MASK3 (NEED_NOPL)
+#define REQUIRED_MASK4 0
+#define REQUIRED_MASK5 0
+#define REQUIRED_MASK6 0
+#define REQUIRED_MASK7 0
+
+#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h
new file mode 100644
index 000000000000..04bc4db8921b
--- /dev/null
+++ b/arch/x86/include/asm/resource.h
@@ -0,0 +1 @@
+#include <asm-generic/resource.h>
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
new file mode 100644
index 000000000000..3ff1c2cb1da5
--- /dev/null
+++ b/arch/x86/include/asm/resume-trace.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_RESUME_TRACE_H
+#define _ASM_X86_RESUME_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user) \
+do { \
+ if (pm_trace_enabled) { \
+ const void *tracedata; \
+ asm volatile(_ASM_MOV " $1f,%0\n" \
+ ".section .tracedata,\"a\"\n" \
+ "1:\t.word %c1\n\t" \
+ _ASM_PTR " %c2\n" \
+ ".previous" \
+ :"=r" (tracedata) \
+ : "i" (__LINE__), "i" (__FILE__)); \
+ generate_resume_trace(tracedata, user); \
+ } \
+} while (0)
+
+#endif /* _ASM_X86_RESUME_TRACE_H */
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
new file mode 100644
index 000000000000..97bab6388a92
--- /dev/null
+++ b/arch/x86/include/asm/rio.h
@@ -0,0 +1,63 @@
+/*
+ * Derived from include/asm-x86/mach-summit/mach_mpparse.h
+ * and include/asm-x86/mach-default/bios_ebda.h
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ */
+
+#ifndef _ASM_X86_RIO_H
+#define _ASM_X86_RIO_H
+
+#define RIO_TABLE_VERSION 3
+
+struct rio_table_hdr {
+ u8 version; /* Version number of this data structure */
+ u8 num_scal_dev; /* # of Scalability devices */
+ u8 num_rio_dev; /* # of RIO I/O devices */
+} __attribute__((packed));
+
+struct scal_detail {
+ u8 node_id; /* Scalability Node ID */
+ u32 CBAR; /* Address of 1MB register space */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF = None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port2node; /* Node ID port connected to: 0xFF = None */
+ u8 port2port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ u8 node_id; /* RIO Node ID */
+ u32 BBAR; /* Address of 1MB register space */
+ u8 type; /* Type of device */
+ u8 owner_id; /* Node ID of Hurricane that owns this */
+ /* node */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF=None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 first_slot; /* Lowest slot number below this Calgary */
+ u8 status; /* Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie: */
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ u8 WP_index; /* instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ u8 chassis_num; /* 1 based Chassis number */
+} __attribute__((packed));
+
+enum {
+ HURR_SCALABILTY = 0, /* Hurricane Scalability info */
+ HURR_RIOIB = 2, /* Hurricane RIOIB info */
+ COMPAT_CALGARY = 4, /* Compatibility Calgary */
+ ALT_CALGARY = 5, /* Second Planar Calgary */
+};
+
+#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
new file mode 100644
index 000000000000..f71c3b0ed360
--- /dev/null
+++ b/arch/x86/include/asm/rtc.h
@@ -0,0 +1 @@
+#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
new file mode 100644
index 000000000000..6a8c0d645108
--- /dev/null
+++ b/arch/x86/include/asm/rwlock.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_RWLOCK_H
+#define _ASM_X86_RWLOCK_H
+
+#define RW_LOCK_BIAS 0x01000000
+
+/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
+
+#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 000000000000..ca7517d33776
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,265 @@
+/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ *
+ * Derived from asm-x86/semaphore.h
+ *
+ *
+ * The MSW of the count is the negated number of active writers and waiting
+ * lockers, and the LSW is the total number of active locks
+ *
+ * The lock count is initialized to 0 (no active and no waiting lockers).
+ *
+ * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
+ * uncontended lock. This can be determined because XADD returns the old value.
+ * Readers increment by 1 and see a positive value when uncontended, negative
+ * if there are writers (and maybe) readers waiting (in which case it goes to
+ * sleep).
+ *
+ * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
+ * be extended to 65534 by manually checking the whole MSW rather than relying
+ * on the S flag.
+ *
+ * The value of ACTIVE_BIAS supports up to 65535 active processes.
+ *
+ * This should be totally fair - if anything is waiting, a process that wants a
+ * lock will go to the back of the queue. When the currently active lock is
+ * released, if there's a writer at the front of the queue, then that and only
+ * that will be woken up; if there's a bunch of consequtive readers at the
+ * front, then they'll all be woken up, but no other readers will be.
+ */
+
+#ifndef _ASM_X86_RWSEM_H
+#define _ASM_X86_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+
+struct rwsem_waiter;
+
+extern asmregparm struct rw_semaphore *
+ rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *
+ rwsem_downgrade_wake(struct rw_semaphore *sem);
+
+/*
+ * the semaphore definition
+ */
+
+#define RWSEM_UNLOCKED_VALUE 0x00000000
+#define RWSEM_ACTIVE_BIAS 0x00000001
+#define RWSEM_ACTIVE_MASK 0x0000ffff
+#define RWSEM_WAITING_BIAS (-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+
+struct rw_semaphore {
+ signed long count;
+ spinlock_t wait_lock;
+ struct list_head wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
+#else
+# define __RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
+
+#define __RWSEM_INITIALIZER(name) \
+{ \
+ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
+ LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
+}
+
+#define DECLARE_RWSEM(name) \
+ struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key);
+
+#define init_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_rwsem((sem), #sem, &__key); \
+} while (0)
+
+/*
+ * lock for reading
+ */
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning down_read\n\t"
+ LOCK_PREFIX " incl (%%eax)\n\t"
+ /* adds 0x00000001, returns the old value */
+ " jns 1f\n"
+ " call call_rwsem_down_read_failed\n"
+ "1:\n\t"
+ "# ending down_read\n\t"
+ : "+m" (sem->count)
+ : "a" (sem)
+ : "memory", "cc");
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ __s32 result, tmp;
+ asm volatile("# beginning __down_read_trylock\n\t"
+ " movl %0,%1\n\t"
+ "1:\n\t"
+ " movl %1,%2\n\t"
+ " addl %3,%2\n\t"
+ " jle 2f\n\t"
+ LOCK_PREFIX " cmpxchgl %2,%0\n\t"
+ " jnz 1b\n\t"
+ "2:\n\t"
+ "# ending __down_read_trylock\n\t"
+ : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
+ : "i" (RWSEM_ACTIVE_READ_BIAS)
+ : "memory", "cc");
+ return result >= 0 ? 1 : 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ int tmp;
+
+ tmp = RWSEM_ACTIVE_WRITE_BIAS;
+ asm volatile("# beginning down_write\n\t"
+ LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ /* subtract 0x0000ffff, returns the old value */
+ " testl %%edx,%%edx\n\t"
+ /* was the count 0 before? */
+ " jz 1f\n"
+ " call call_rwsem_down_write_failed\n"
+ "1:\n"
+ "# ending down_write"
+ : "+m" (sem->count), "=d" (tmp)
+ : "a" (sem), "1" (tmp)
+ : "memory", "cc");
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ __down_write_nested(sem, 0);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ signed long ret = cmpxchg(&sem->count,
+ RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
+ if (ret == RWSEM_UNLOCKED_VALUE)
+ return 1;
+ return 0;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+ asm volatile("# beginning __up_read\n\t"
+ LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
+ /* subtracts 1, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n"
+ "1:\n"
+ "# ending __up_read\n"
+ : "+m" (sem->count), "=d" (tmp)
+ : "a" (sem), "1" (tmp)
+ : "memory", "cc");
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning __up_write\n\t"
+ " movl %2,%%edx\n\t"
+ LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
+ /* tries to transition
+ 0xffff0001 -> 0x00000000 */
+ " jz 1f\n"
+ " call call_rwsem_wake\n"
+ "1:\n\t"
+ "# ending __up_write\n"
+ : "+m" (sem->count)
+ : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
+ : "memory", "cc", "edx");
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning __downgrade_write\n\t"
+ LOCK_PREFIX " addl %2,(%%eax)\n\t"
+ /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
+ " jns 1f\n\t"
+ " call call_rwsem_downgrade_wake\n"
+ "1:\n\t"
+ "# ending __downgrade_write\n"
+ : "+m" (sem->count)
+ : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
+ : "memory", "cc");
+}
+
+/*
+ * implement atomic add functionality
+ */
+static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (sem->count)
+ : "ir" (delta));
+}
+
+/*
+ * implement exchange and add functionality
+ */
+static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+{
+ int tmp = delta;
+
+ asm volatile(LOCK_PREFIX "xadd %0,%1"
+ : "+r" (tmp), "+m" (sem->count)
+ : : "memory");
+
+ return tmp + delta;
+}
+
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ return (sem->count != 0);
+}
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
new file mode 100644
index 000000000000..263d397d2eef
--- /dev/null
+++ b/arch/x86/include/asm/scatterlist.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_X86_SCATTERLIST_H
+#define _ASM_X86_SCATTERLIST_H
+
+#include <asm/types.h>
+
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+ unsigned long sg_magic;
+#endif
+ unsigned long page_link;
+ unsigned int offset;
+ unsigned int length;
+ dma_addr_t dma_address;
+ unsigned int dma_length;
+};
+
+#define ARCH_HAS_SG_CHAIN
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+
+/*
+ * These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg) ((sg)->dma_address)
+#ifdef CONFIG_X86_32
+# define sg_dma_len(sg) ((sg)->length)
+#else
+# define sg_dma_len(sg) ((sg)->dma_length)
+#endif
+
+#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
new file mode 100644
index 000000000000..c62e58a5a90d
--- /dev/null
+++ b/arch/x86/include/asm/seccomp.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "seccomp_32.h"
+#else
+# include "seccomp_64.h"
+#endif
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
new file mode 100644
index 000000000000..a6ad87b352c4
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_X86_SECCOMP_32_H
+#define _ASM_X86_SECCOMP_32_H
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_32BIT
+#error "unexpected TIF_32BIT on i386"
+#endif
+
+#include <linux/unistd.h>
+
+#define __NR_seccomp_read __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit __NR_exit
+#define __NR_seccomp_sigreturn __NR_sigreturn
+
+#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
new file mode 100644
index 000000000000..4171bb794e9e
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_SECCOMP_64_H
+#define _ASM_X86_SECCOMP_64_H
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_32BIT
+#error "unexpected TIF_32BIT on x86_64"
+#else
+#define TIF_32BIT TIF_IA32
+#endif
+
+#include <linux/unistd.h>
+#include <asm/ia32_unistd.h>
+
+#define __NR_seccomp_read __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit __NR_exit
+#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+
+#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
new file mode 100644
index 000000000000..2b8c5160388f
--- /dev/null
+++ b/arch/x86/include/asm/sections.h
@@ -0,0 +1 @@
+#include <asm-generic/sections.h>
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
new file mode 100644
index 000000000000..1dc1b51ac623
--- /dev/null
+++ b/arch/x86/include/asm/segment.h
@@ -0,0 +1,209 @@
+#ifndef _ASM_X86_SEGMENT_H
+#define _ASM_X86_SEGMENT_H
+
+/* Constructor for a conventional segment GDT (or LDT) entry */
+/* This is a macro so it can be used in initializers */
+#define GDT_ENTRY(flags, base, limit) \
+ ((((base) & 0xff000000ULL) << (56-24)) | \
+ (((flags) & 0x0000f0ffULL) << 40) | \
+ (((limit) & 0x000f0000ULL) << (48-16)) | \
+ (((base) & 0x00ffffffULL) << 16) | \
+ (((limit) & 0x0000ffffULL)))
+
+/* Simple and small GDT entries for booting only */
+
+#define GDT_ENTRY_BOOT_CS 2
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
+
+#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
+
+#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
+
+#ifdef CONFIG_X86_32
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ * 0 - null
+ * 1 - reserved
+ * 2 - reserved
+ * 3 - reserved
+ *
+ * 4 - unused <==== new cacheline
+ * 5 - unused
+ *
+ * ------- start of TLS (Thread-Local Storage) segments:
+ *
+ * 6 - TLS segment #1 [ glibc's TLS segment ]
+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+ * 8 - TLS segment #3
+ * 9 - reserved
+ * 10 - reserved
+ * 11 - reserved
+ *
+ * ------- start of kernel segments:
+ *
+ * 12 - kernel code segment <==== new cacheline
+ * 13 - kernel data segment
+ * 14 - default user CS
+ * 15 - default user DS
+ * 16 - TSS
+ * 17 - LDT
+ * 18 - PNPBIOS support (16->32 gate)
+ * 19 - PNPBIOS support
+ * 20 - PNPBIOS support
+ * 21 - PNPBIOS support
+ * 22 - PNPBIOS support
+ * 23 - APM BIOS support
+ * 24 - APM BIOS support
+ * 25 - APM BIOS support
+ *
+ * 26 - ESPFIX small SS
+ * 27 - per-cpu [ offset to per-cpu data area ]
+ * 28 - unused
+ * 29 - unused
+ * 30 - unused
+ * 31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_DEFAULT_USER_CS 14
+
+#define GDT_ENTRY_DEFAULT_USER_DS 15
+
+#define GDT_ENTRY_KERNEL_BASE 12
+
+#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
+
+#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
+
+#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
+#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+
+#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
+#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
+
+#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
+#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
+#ifdef CONFIG_SMP
+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+#else
+#define __KERNEL_PERCPU 0
+#endif
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+
+/*
+ * The GDT has 32 entries
+ */
+#define GDT_ENTRIES 32
+
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
+
+#else
+#include <asm/cache.h>
+
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
+
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
+
+/*
+ * we cannot use the same code segment descriptor for user and kernel
+ * -- not even in the long flat mode, because of different DPL /kkeil
+ * The segment offset needs to contain a RPL. Grr. -AK
+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ */
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_DS __USER_DS
+
+#define GDT_ENTRY_TSS 8 /* needs two entries */
+#define GDT_ENTRY_LDT 10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
+
+/* TLS indexes for 64bit - hardcoded in arch_prctl */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+#define GDT_ENTRIES 16
+
+#endif
+
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl() 0
+#endif
+
+/* User mode is privilege level 3 */
+#define USER_RPL 0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT 0x4
+#define SEGMENT_GDT 0x0
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK 0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK 0x4
+
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+#define GDT_SIZE (GDT_ENTRIES * 8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
+#endif
+#endif
+
+#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h
new file mode 100644
index 000000000000..ee50c801f7b7
--- /dev/null
+++ b/arch/x86/include/asm/sembuf.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_SEMBUF_H
+#define _ASM_X86_SEMBUF_H
+
+/*
+ * The semid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+struct semid64_ds {
+ struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+ __kernel_time_t sem_otime; /* last semop time */
+ unsigned long __unused1;
+ __kernel_time_t sem_ctime; /* last change time */
+ unsigned long __unused2;
+ unsigned long sem_nsems; /* no. of semaphores in array */
+ unsigned long __unused3;
+ unsigned long __unused4;
+};
+
+#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
new file mode 100644
index 000000000000..628c801535ea
--- /dev/null
+++ b/arch/x86/include/asm/serial.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_SERIAL_H
+#define _ASM_X86_SERIAL_H
+
+/*
+ * This assumes you have a 1.8432 MHz clock for your UART.
+ *
+ * It'd be nice if someone built a serial card with a 24.576 MHz
+ * clock, since the 16550A is capable of handling a top speed of 1.5
+ * megabits/second; but this requires the faster clock.
+ */
+#define BASE_BAUD ( 1843200 / 16 )
+
+/* Standard COM flags (except for COM4, because of the 8514 problem) */
+#ifdef CONFIG_SERIAL_DETECT_IRQ
+#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
+#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+#else
+#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
+#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+#endif
+
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
+ { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
+ { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
+ { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+
+#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
new file mode 100644
index 000000000000..4fcd53fd5f43
--- /dev/null
+++ b/arch/x86/include/asm/setup.h
@@ -0,0 +1,112 @@
+#ifndef _ASM_X86_SETUP_H
+#define _ASM_X86_SETUP_H
+
+#define COMMAND_LINE_SIZE 2048
+
+#ifndef __ASSEMBLY__
+
+/* Interrupt control for vSMPowered x86_64 systems */
+void vsmp_init(void);
+
+
+void setup_bios_corruption_check(void);
+
+
+#ifdef CONFIG_X86_VISWS
+extern void visws_early_detect(void);
+extern int is_visws_box(void);
+#else
+static inline void visws_early_detect(void) { }
+static inline int is_visws_box(void) { return 0; }
+#endif
+
+extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
+/*
+ * Any setup quirks to be performed?
+ */
+struct mpc_config_processor;
+struct mpc_config_bus;
+struct mp_config_oemtable;
+struct x86_quirks {
+ int (*arch_pre_time_init)(void);
+ int (*arch_time_init)(void);
+ int (*arch_pre_intr_init)(void);
+ int (*arch_intr_init)(void);
+ int (*arch_trap_init)(void);
+ char * (*arch_memory_setup)(void);
+ int (*mach_get_smp_config)(unsigned int early);
+ int (*mach_find_smp_config)(unsigned int reserve);
+
+ int *mpc_record;
+ int (*mpc_apic_id)(struct mpc_config_processor *m);
+ void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
+ void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
+ void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
+ unsigned short oemsize);
+ int (*setup_ioapic_ids)(void);
+ int (*update_genapic)(void);
+};
+
+extern struct x86_quirks *x86_quirks;
+extern unsigned long saved_video_mode;
+
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init() do {} while (0)
+#endif
+#endif /* __ASSEMBLY__ */
+
+#ifdef __KERNEL__
+
+#ifdef __i386__
+
+#include <linux/pfn.h>
+/*
+ * Reserved space for vmalloc and iomap - defined in asm/page.h
+ */
+#define MAXMEM_PFN PFN_DOWN(MAXMEM)
+#define MAX_NONPAE_PFN (1 << 20)
+
+#endif /* __i386__ */
+
+#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */
+
+#define OLD_CL_MAGIC 0xA33F
+#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
+
+#ifndef __ASSEMBLY__
+#include <asm/bootparam.h>
+
+#ifndef _SETUP
+
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+extern struct boot_params boot_params;
+
+/*
+ * Do NOT EVER look at the BIOS memory size location.
+ * It does not work on many machines.
+ */
+#define LOWMEMSIZE() (0x9f000)
+
+#ifdef __i386__
+
+void __init i386_start_kernel(void);
+extern void probe_roms(void);
+
+extern unsigned long init_pg_tables_start;
+extern unsigned long init_pg_tables_end;
+
+#else
+void __init x86_64_init_pda(void);
+void __init x86_64_start_kernel(char *real_mode);
+void __init x86_64_start_reservations(char *real_mode_data);
+
+#endif /* __i386__ */
+#endif /* _SETUP */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
new file mode 100644
index 000000000000..b51413b74971
--- /dev/null
+++ b/arch/x86/include/asm/shmbuf.h
@@ -0,0 +1,51 @@
+#ifndef _ASM_X86_SHMBUF_H
+#define _ASM_X86_SHMBUF_H
+
+/*
+ * The shmid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space on 32 bit is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ *
+ * Pad space on 64 bit is left for:
+ * - 2 miscellaneous 64-bit values
+ */
+
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_time_t shm_atime; /* last attach time */
+#ifdef __i386__
+ unsigned long __unused1;
+#endif
+ __kernel_time_t shm_dtime; /* last detach time */
+#ifdef __i386__
+ unsigned long __unused2;
+#endif
+ __kernel_time_t shm_ctime; /* last change time */
+#ifdef __i386__
+ unsigned long __unused3;
+#endif
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ unsigned long shm_nattch; /* no. of current attaches */
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+struct shminfo64 {
+ unsigned long shmmax;
+ unsigned long shmmin;
+ unsigned long shmmni;
+ unsigned long shmseg;
+ unsigned long shmall;
+ unsigned long __unused1;
+ unsigned long __unused2;
+ unsigned long __unused3;
+ unsigned long __unused4;
+};
+
+#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 000000000000..0880cf0917b9
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_SHMPARAM_H
+#define _ASM_X86_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
+
+#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 000000000000..0afcb5e58acc
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,284 @@
+#ifndef _ASM_X86_SIGCONTEXT_H
+#define _ASM_X86_SIGCONTEXT_H
+
+#include <linux/compiler.h>
+#include <asm/types.h>
+
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame
+ * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes
+ * are used to extended the fpstate pointer in the sigcontext, which now
+ * includes the extended state information along with fpstate information.
+ *
+ * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
+ * area and FP_XSTATE_MAGIC2 at the end of memory layout
+ * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
+ * extended state information in the memory layout pointed by the fpstate
+ * pointer in sigcontext.
+ */
+struct _fpx_sw_bytes {
+ __u32 magic1; /* FP_XSTATE_MAGIC1 */
+ __u32 extended_size; /* total size of the layout referred by
+ * fpstate pointer in the sigcontext.
+ */
+ __u64 xstate_bv;
+ /* feature bit mask (including fp/sse/extended
+ * state) that is present in the memory
+ * layout.
+ */
+ __u32 xstate_size; /* actual xsave state size, based on the
+ * features saved in the layout.
+ * 'extended_size' will be greater than
+ * 'xstate_size'.
+ */
+ __u32 padding[7]; /* for future use. */
+};
+
+#ifdef __i386__
+/*
+ * As documented in the iBCS2 standard..
+ *
+ * The first part of "struct _fpstate" is just the normal i387
+ * hardware setup, the extra "status" word is used to save the
+ * coprocessor status word before entering the handler.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * The FPU state data structure has had to grow to accommodate the
+ * extended FPU state required by the Streaming SIMD Extensions.
+ * There is no documented standard to accomplish this at the moment.
+ */
+struct _fpreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+};
+
+struct _fpxreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+ unsigned short padding[3];
+};
+
+struct _xmmreg {
+ unsigned long element[4];
+};
+
+struct _fpstate {
+ /* Regular FPU environment */
+ unsigned long cw;
+ unsigned long sw;
+ unsigned long tag;
+ unsigned long ipoff;
+ unsigned long cssel;
+ unsigned long dataoff;
+ unsigned long datasel;
+ struct _fpreg _st[8];
+ unsigned short status;
+ unsigned short magic; /* 0xffff = regular FPU data only */
+
+ /* FXSR FPU environment */
+ unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
+ unsigned long mxcsr;
+ unsigned long reserved;
+ struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+ struct _xmmreg _xmm[8];
+ unsigned long padding1[44];
+
+ union {
+ unsigned long padding2[12];
+ struct _fpx_sw_bytes sw_reserved; /* represents the extended
+ * state info */
+ };
+};
+
+#define X86_FXSR_MAGIC 0x0000
+
+#ifdef __KERNEL__
+struct sigcontext {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned long di;
+ unsigned long si;
+ unsigned long bp;
+ unsigned long sp;
+ unsigned long bx;
+ unsigned long dx;
+ unsigned long cx;
+ unsigned long ax;
+ unsigned long trapno;
+ unsigned long err;
+ unsigned long ip;
+ unsigned short cs, __csh;
+ unsigned long flags;
+ unsigned long sp_at_signal;
+ unsigned short ss, __ssh;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the defintion of
+ * (struct _fpx_sw_bytes)
+ */
+ void __user *fpstate; /* zero when no FPU/extended context */
+ unsigned long oldmask;
+ unsigned long cr2;
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned long edi;
+ unsigned long esi;
+ unsigned long ebp;
+ unsigned long esp;
+ unsigned long ebx;
+ unsigned long edx;
+ unsigned long ecx;
+ unsigned long eax;
+ unsigned long trapno;
+ unsigned long err;
+ unsigned long eip;
+ unsigned short cs, __csh;
+ unsigned long eflags;
+ unsigned long esp_at_signal;
+ unsigned short ss, __ssh;
+ struct _fpstate __user *fpstate;
+ unsigned long oldmask;
+ unsigned long cr2;
+};
+#endif /* !__KERNEL__ */
+
+#else /* __i386__ */
+
+/* FXSAVE frame */
+/* Note: reserved1/2 may someday contain valuable data. Always save/restore
+ them when you change signal frames. */
+struct _fpstate {
+ __u16 cwd;
+ __u16 swd;
+ __u16 twd; /* Note this is not the same as the
+ 32bit/x87/FSAVE twd */
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
+ __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
+ __u32 reserved2[12];
+ union {
+ __u32 reserved3[12];
+ struct _fpx_sw_bytes sw_reserved; /* represents the extended
+ * state information */
+ };
+};
+
+#ifdef __KERNEL__
+struct sigcontext {
+ unsigned long r8;
+ unsigned long r9;
+ unsigned long r10;
+ unsigned long r11;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long di;
+ unsigned long si;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long dx;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long sp;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned short cs;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short __pad0;
+ unsigned long err;
+ unsigned long trapno;
+ unsigned long oldmask;
+ unsigned long cr2;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the defintion of
+ * (struct _fpx_sw_bytes)
+ */
+ void __user *fpstate; /* zero when no FPU/extended context */
+ unsigned long reserved1[8];
+};
+#else /* __KERNEL__ */
+/*
+ * User-space might still rely on the old definition:
+ */
+struct sigcontext {
+ unsigned long r8;
+ unsigned long r9;
+ unsigned long r10;
+ unsigned long r11;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long rdi;
+ unsigned long rsi;
+ unsigned long rbp;
+ unsigned long rbx;
+ unsigned long rdx;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rsp;
+ unsigned long rip;
+ unsigned long eflags; /* RFLAGS */
+ unsigned short cs;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short __pad0;
+ unsigned long err;
+ unsigned long trapno;
+ unsigned long oldmask;
+ unsigned long cr2;
+ struct _fpstate __user *fpstate; /* zero when no FPU context */
+ unsigned long reserved1[8];
+};
+#endif /* !__KERNEL__ */
+
+#endif /* !__i386__ */
+
+struct _xsave_hdr {
+ __u64 xstate_bv;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+/*
+ * Extended state pointed by the fpstate pointer in the sigcontext.
+ * In addition to the fpstate, information encoded in the xstate_hdr
+ * indicates the presence of other extended state information
+ * supported by the processor and OS.
+ */
+struct _xstate {
+ struct _fpstate fpstate;
+ struct _xsave_hdr xstate_hdr;
+ /* new processor state extensions go here */
+};
+
+#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
new file mode 100644
index 000000000000..6126188cf3a9
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext32.h
@@ -0,0 +1,75 @@
+#ifndef _ASM_X86_SIGCONTEXT32_H
+#define _ASM_X86_SIGCONTEXT32_H
+
+/* signal context for 32bit programs. */
+
+#define X86_FXSR_MAGIC 0x0000
+
+struct _fpreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+};
+
+struct _fpxreg {
+ unsigned short significand[4];
+ unsigned short exponent;
+ unsigned short padding[3];
+};
+
+struct _xmmreg {
+ __u32 element[4];
+};
+
+/* FSAVE frame with extensions */
+struct _fpstate_ia32 {
+ /* Regular FPU environment */
+ __u32 cw;
+ __u32 sw;
+ __u32 tag; /* not compatible to 64bit twd */
+ __u32 ipoff;
+ __u32 cssel;
+ __u32 dataoff;
+ __u32 datasel;
+ struct _fpreg _st[8];
+ unsigned short status;
+ unsigned short magic; /* 0xffff = regular FPU data only */
+
+ /* FXSR FPU environment */
+ __u32 _fxsr_env[6];
+ __u32 mxcsr;
+ __u32 reserved;
+ struct _fpxreg _fxsr_st[8];
+ struct _xmmreg _xmm[8]; /* It's actually 16 */
+ __u32 padding[44];
+ union {
+ __u32 padding2[12];
+ struct _fpx_sw_bytes sw_reserved;
+ };
+};
+
+struct sigcontext_ia32 {
+ unsigned short gs, __gsh;
+ unsigned short fs, __fsh;
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned int di;
+ unsigned int si;
+ unsigned int bp;
+ unsigned int sp;
+ unsigned int bx;
+ unsigned int dx;
+ unsigned int cx;
+ unsigned int ax;
+ unsigned int trapno;
+ unsigned int err;
+ unsigned int ip;
+ unsigned short cs, __csh;
+ unsigned int flags;
+ unsigned int sp_at_signal;
+ unsigned short ss, __ssh;
+ unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
+ unsigned int oldmask;
+ unsigned int cr2;
+};
+
+#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
new file mode 100644
index 000000000000..4e0fe26d27d3
--- /dev/null
+++ b/arch/x86/include/asm/sigframe.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_SIGFRAME_H
+#define _ASM_X86_SIGFRAME_H
+
+#include <asm/sigcontext.h>
+#include <asm/siginfo.h>
+#include <asm/ucontext.h>
+
+#ifdef CONFIG_X86_32
+#define sigframe_ia32 sigframe
+#define rt_sigframe_ia32 rt_sigframe
+#define sigcontext_ia32 sigcontext
+#define _fpstate_ia32 _fpstate
+#define ucontext_ia32 ucontext
+#else /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/ia32.h>
+#endif /* CONFIG_IA32_EMULATION */
+
+#endif /* CONFIG_X86_32 */
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+struct sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ struct sigcontext_ia32 sc;
+ /*
+ * fpstate is unused. fpstate is moved/allocated after
+ * retcode[] below. This movement allows to have the FP state and the
+ * future state extensions (xsave) stay together.
+ * And at the same time retaining the unused fpstate, prevents changing
+ * the offset of extramask[] in the sigframe and thus prevent any
+ * legacy application accessing/modifying it.
+ */
+ struct _fpstate_ia32 fpstate_unused;
+#ifdef CONFIG_IA32_EMULATION
+ unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+#else /* !CONFIG_IA32_EMULATION */
+ unsigned long extramask[_NSIG_WORDS-1];
+#endif /* CONFIG_IA32_EMULATION */
+ char retcode[8];
+ /* fp state follows here */
+};
+
+struct rt_sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ u32 pinfo;
+ u32 puc;
+#ifdef CONFIG_IA32_EMULATION
+ compat_siginfo_t info;
+#else /* !CONFIG_IA32_EMULATION */
+ struct siginfo info;
+#endif /* CONFIG_IA32_EMULATION */
+ struct ucontext_ia32 uc;
+ char retcode[8];
+ /* fp state follows here */
+};
+#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
+
+#ifdef CONFIG_X86_64
+struct rt_sigframe {
+ char __user *pretcode;
+ struct ucontext uc;
+ struct siginfo info;
+ /* fp state follows here */
+};
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h
new file mode 100644
index 000000000000..fc1aa5535646
--- /dev/null
+++ b/arch/x86/include/asm/siginfo.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_SIGINFO_H
+#define _ASM_X86_SIGINFO_H
+
+#ifdef __x86_64__
+# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
+#endif
+
+#include <asm-generic/siginfo.h>
+
+#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
new file mode 100644
index 000000000000..7761a5d554bb
--- /dev/null
+++ b/arch/x86/include/asm/signal.h
@@ -0,0 +1,264 @@
+#ifndef _ASM_X86_SIGNAL_H
+#define _ASM_X86_SIGNAL_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/compiler.h>
+
+/* Avoid too many header ordering problems. */
+struct siginfo;
+
+#ifdef __KERNEL__
+#include <linux/linkage.h>
+
+/* Most things should be clean enough to redefine this at will, if care
+ is taken to make libc match. */
+
+#define _NSIG 64
+
+#ifdef __i386__
+# define _NSIG_BPW 32
+#else
+# define _NSIG_BPW 64
+#endif
+
+#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
+
+typedef unsigned long old_sigset_t; /* at least 32 bits */
+
+typedef struct {
+ unsigned long sig[_NSIG_WORDS];
+} sigset_t;
+
+#else
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+#define NSIG 32
+typedef unsigned long sigset_t;
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#define SIGHUP 1
+#define SIGINT 2
+#define SIGQUIT 3
+#define SIGILL 4
+#define SIGTRAP 5
+#define SIGABRT 6
+#define SIGIOT 6
+#define SIGBUS 7
+#define SIGFPE 8
+#define SIGKILL 9
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGPIPE 13
+#define SIGALRM 14
+#define SIGTERM 15
+#define SIGSTKFLT 16
+#define SIGCHLD 17
+#define SIGCONT 18
+#define SIGSTOP 19
+#define SIGTSTP 20
+#define SIGTTIN 21
+#define SIGTTOU 22
+#define SIGURG 23
+#define SIGXCPU 24
+#define SIGXFSZ 25
+#define SIGVTALRM 26
+#define SIGPROF 27
+#define SIGWINCH 28
+#define SIGIO 29
+#define SIGPOLL SIGIO
+/*
+#define SIGLOST 29
+*/
+#define SIGPWR 30
+#define SIGSYS 31
+#define SIGUNUSED 31
+
+/* These should not be considered constants from userland. */
+#define SIGRTMIN 32
+#define SIGRTMAX _NSIG
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_ONSTACK indicates that a registered stack_t will be used.
+ * SA_RESTART flag to get restarting signals (which were the default long ago)
+ * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
+ * SA_RESETHAND clears the handler when the signal is delivered.
+ * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
+ * SA_NODEFER prevents the current signal from being masked in the handler.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP 0x00000001u
+#define SA_NOCLDWAIT 0x00000002u
+#define SA_SIGINFO 0x00000004u
+#define SA_ONSTACK 0x08000000u
+#define SA_RESTART 0x10000000u
+#define SA_NODEFER 0x40000000u
+#define SA_RESETHAND 0x80000000u
+
+#define SA_NOMASK SA_NODEFER
+#define SA_ONESHOT SA_RESETHAND
+
+#define SA_RESTORER 0x04000000
+
+/*
+ * sigaltstack controls
+ */
+#define SS_ONSTACK 1
+#define SS_DISABLE 2
+
+#define MINSIGSTKSZ 2048
+#define SIGSTKSZ 8192
+
+#include <asm-generic/signal.h>
+
+#ifndef __ASSEMBLY__
+
+# ifdef __KERNEL__
+extern void do_notify_resume(struct pt_regs *, void *, __u32);
+# endif /* __KERNEL__ */
+
+#ifdef __i386__
+# ifdef __KERNEL__
+struct old_sigaction {
+ __sighandler_t sa_handler;
+ old_sigset_t sa_mask;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+};
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+struct k_sigaction {
+ struct sigaction sa;
+};
+
+# else /* __KERNEL__ */
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+struct sigaction {
+ union {
+ __sighandler_t _sa_handler;
+ void (*_sa_sigaction)(int, struct siginfo *, void *);
+ } _u;
+ sigset_t sa_mask;
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+};
+
+#define sa_handler _u._sa_handler
+#define sa_sigaction _u._sa_sigaction
+
+# endif /* ! __KERNEL__ */
+#else /* __i386__ */
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+struct k_sigaction {
+ struct sigaction sa;
+};
+
+#endif /* !__i386__ */
+
+typedef struct sigaltstack {
+ void __user *ss_sp;
+ int ss_flags;
+ size_t ss_size;
+} stack_t;
+
+#ifdef __KERNEL__
+#include <asm/sigcontext.h>
+
+#ifdef __i386__
+
+#define __HAVE_ARCH_SIG_BITOPS
+
+#define sigaddset(set,sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigaddset((set), (sig)) \
+ : __gen_sigaddset((set), (sig)))
+
+static inline void __gen_sigaddset(sigset_t *set, int _sig)
+{
+ asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigaddset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW);
+}
+
+#define sigdelset(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigdelset((set), (sig)) \
+ : __gen_sigdelset((set), (sig)))
+
+
+static inline void __gen_sigdelset(sigset_t *set, int _sig)
+{
+ asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigdelset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW));
+}
+
+static inline int __const_sigismember(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
+}
+
+static inline int __gen_sigismember(sigset_t *set, int _sig)
+{
+ int ret;
+ asm("btl %2,%1\n\tsbbl %0,%0"
+ : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
+ return ret;
+}
+
+#define sigismember(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigismember((set), (sig)) \
+ : __gen_sigismember((set), (sig)))
+
+static inline int sigfindinword(unsigned long word)
+{
+ asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
+ return word;
+}
+
+struct pt_regs;
+
+#else /* __i386__ */
+
+#undef __HAVE_ARCH_SIG_BITOPS
+
+#endif /* !__i386__ */
+
+#define ptrace_signal_deliver(regs, cookie) do { } while (0)
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
new file mode 100644
index 000000000000..d12811ce51d9
--- /dev/null
+++ b/arch/x86/include/asm/smp.h
@@ -0,0 +1,235 @@
+#ifndef _ASM_X86_SMP_H
+#define _ASM_X86_SMP_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <asm/pda.h>
+#include <asm/thread_info.h>
+
+extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_initialized;
+extern cpumask_t cpu_callin_map;
+
+extern void (*mtrr_hook)(void);
+extern void zap_low_mappings(void);
+
+extern int __cpuinit get_local_pda(int cpu);
+
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+extern cpumask_t cpu_initialized;
+
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(int, cpu_number);
+#endif
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+
+/* Static state in head.S used to set up a CPU */
+extern struct {
+ void *sp;
+ unsigned short ss;
+} stack_start;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*smp_send_stop)(void);
+ void (*smp_send_reschedule)(int cpu);
+
+ int (*cpu_up)(unsigned cpu);
+ int (*cpu_disable)(void);
+ void (*cpu_die)(unsigned int cpu);
+ void (*play_dead)(void);
+
+ void (*send_call_func_ipi)(cpumask_t mask);
+ void (*send_call_func_single_ipi)(int cpu);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+#ifdef CONFIG_SMP
+#ifndef CONFIG_PARAVIRT
+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
+#endif
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.smp_send_stop();
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu)
+{
+ return smp_ops.cpu_up(cpu);
+}
+
+static inline int __cpu_disable(void)
+{
+ return smp_ops.cpu_disable();
+}
+
+static inline void __cpu_die(unsigned int cpu)
+{
+ smp_ops.cpu_die(cpu);
+}
+
+static inline void play_dead(void)
+{
+ smp_ops.play_dead();
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi(cpumask_t mask)
+{
+ smp_ops.send_call_func_ipi(mask);
+}
+
+void cpu_disable_common(void);
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+int native_cpu_disable(void);
+void native_cpu_die(unsigned int cpu);
+void native_play_dead(void);
+void play_dead_common(void);
+
+void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_single_ipi(int cpu);
+
+extern void prefill_possible_map(void);
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+
+/* We don't mark CPUs online until __cpu_up(), so we need another measure */
+static inline int num_booting_cpus(void)
+{
+ return cpus_weight(cpu_callout_map);
+}
+#else
+static inline void prefill_possible_map(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#ifdef CONFIG_X86_32_SMP
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+extern int safe_smp_processor_id(void);
+
+#elif defined(CONFIG_X86_64_SMP)
+#define raw_smp_processor_id() read_pda(cpunumber)
+
+#define stack_smp_processor_id() \
+({ \
+ struct thread_info *ti; \
+ __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
+ ti->cpu; \
+})
+#define safe_smp_processor_id() smp_processor_id()
+
+#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define safe_smp_processor_id() 0
+#define stack_smp_processor_id() 0
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#ifndef CONFIG_X86_64
+static inline int logical_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+#include <mach_apicdef.h>
+static inline unsigned int read_apic_id(void)
+{
+ unsigned int reg;
+
+ reg = *(u32 *)(APIC_BASE + APIC_ID);
+
+ return GET_APIC_ID(reg);
+}
+#endif
+
+
+# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
+extern int hard_smp_processor_id(void);
+# else
+#include <mach_apicdef.h>
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return read_apic_id();
+}
+# endif /* APIC_DEFINITION */
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+# define hard_smp_processor_id() 0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
+extern unsigned char boot_cpu_id;
+#else
+#define boot_cpu_id 0
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
new file mode 100644
index 000000000000..8ab9cc8b2ecc
--- /dev/null
+++ b/arch/x86/include/asm/socket.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_SOCKET_H
+#define _ASM_X86_SOCKET_H
+
+#include <asm/sockios.h>
+
+/* For setsockopt(2) */
+#define SOL_SOCKET 1
+
+#define SO_DEBUG 1
+#define SO_REUSEADDR 2
+#define SO_TYPE 3
+#define SO_ERROR 4
+#define SO_DONTROUTE 5
+#define SO_BROADCAST 6
+#define SO_SNDBUF 7
+#define SO_RCVBUF 8
+#define SO_SNDBUFFORCE 32
+#define SO_RCVBUFFORCE 33
+#define SO_KEEPALIVE 9
+#define SO_OOBINLINE 10
+#define SO_NO_CHECK 11
+#define SO_PRIORITY 12
+#define SO_LINGER 13
+#define SO_BSDCOMPAT 14
+/* To add :#define SO_REUSEPORT 15 */
+#define SO_PASSCRED 16
+#define SO_PEERCRED 17
+#define SO_RCVLOWAT 18
+#define SO_SNDLOWAT 19
+#define SO_RCVTIMEO 20
+#define SO_SNDTIMEO 21
+
+/* Security levels - as per NRL IPv6 - don't actually do anything */
+#define SO_SECURITY_AUTHENTICATION 22
+#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
+#define SO_SECURITY_ENCRYPTION_NETWORK 24
+
+#define SO_BINDTODEVICE 25
+
+/* Socket filtering */
+#define SO_ATTACH_FILTER 26
+#define SO_DETACH_FILTER 27
+
+#define SO_PEERNAME 28
+#define SO_TIMESTAMP 29
+#define SCM_TIMESTAMP SO_TIMESTAMP
+
+#define SO_ACCEPTCONN 30
+
+#define SO_PEERSEC 31
+#define SO_PASSSEC 34
+#define SO_TIMESTAMPNS 35
+#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
+
+#define SO_MARK 36
+
+#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
new file mode 100644
index 000000000000..49cc72b5d3c9
--- /dev/null
+++ b/arch/x86/include/asm/sockios.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_SOCKIOS_H
+#define _ASM_X86_SOCKIOS_H
+
+/* Socket-level I/O control calls. */
+#define FIOSETOWN 0x8901
+#define SIOCSPGRP 0x8902
+#define FIOGETOWN 0x8903
+#define SIOCGPGRP 0x8904
+#define SIOCATMARK 0x8905
+#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
+#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
+
+#endif /* _ASM_X86_SOCKIOS_H */
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
new file mode 100644
index 000000000000..e3cc3c063ec5
--- /dev/null
+++ b/arch/x86/include/asm/sparsemem.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_SPARSEMEM_H
+#define _ASM_X86_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the flags
+ * field of the struct page
+ *
+ * SECTION_SIZE_BITS 2^n: size of each section
+ * MAX_PHYSADDR_BITS 2^n: max size of physical address space
+ * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
+ *
+ */
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define SECTION_SIZE_BITS 29
+# define MAX_PHYSADDR_BITS 36
+# define MAX_PHYSMEM_BITS 36
+# else
+# define SECTION_SIZE_BITS 26
+# define MAX_PHYSADDR_BITS 32
+# define MAX_PHYSMEM_BITS 32
+# endif
+#else /* CONFIG_X86_32 */
+# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
+# define MAX_PHYSADDR_BITS 44
+# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
+#endif
+
+#endif /* CONFIG_SPARSEMEM */
+#endif /* _ASM_X86_SPARSEMEM_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
new file mode 100644
index 000000000000..d17c91981da2
--- /dev/null
+++ b/arch/x86/include/asm/spinlock.h
@@ -0,0 +1,364 @@
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+#include <asm/paravirt.h>
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations. There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which are currently limited to 256
+ * CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+# define REG_PTR_MODE "k"
+#else
+# define LOCK_PTR_REG "D"
+# define REG_PTR_MODE "q"
+#endif
+
+#if defined(CONFIG_X86_32) && \
+ (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
+/*
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ *
+ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
+ * save some instructions and make the code more elegant. There really isn't
+ * much between them in performance though, especially as locks are out of line.
+ */
+#if (NR_CPUS < 256)
+#define TICKET_SHIFT 8
+
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+{
+ short inc = 0x0100;
+
+ asm volatile (
+ LOCK_PREFIX "xaddw %w0, %1\n"
+ "1:\t"
+ "cmpb %h0, %b0\n\t"
+ "je 2f\n\t"
+ "rep ; nop\n\t"
+ "movb %1, %b0\n\t"
+ /* don't need lfence here, because loads are in-order */
+ "jmp 1b\n"
+ "2:"
+ : "+Q" (inc), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+{
+ int tmp, new;
+
+ asm volatile("movzwl %2, %0\n\t"
+ "cmpb %h0,%b0\n\t"
+ "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
+ "jne 1f\n\t"
+ LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
+ "1:"
+ "sete %b1\n\t"
+ "movzbl %b1,%0\n\t"
+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+
+ return tmp;
+}
+
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+{
+ asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
+ : "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+#else
+#define TICKET_SHIFT 16
+
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
+{
+ int inc = 0x00010000;
+ int tmp;
+
+ asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
+ "movzwl %w0, %2\n\t"
+ "shrl $16, %0\n\t"
+ "1:\t"
+ "cmpl %0, %2\n\t"
+ "je 2f\n\t"
+ "rep ; nop\n\t"
+ "movzwl %1, %2\n\t"
+ /* don't need lfence here, because loads are in-order */
+ "jmp 1b\n"
+ "2:"
+ : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
+ :
+ : "memory", "cc");
+}
+
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
+{
+ int tmp;
+ int new;
+
+ asm volatile("movl %2,%0\n\t"
+ "movl %0,%1\n\t"
+ "roll $16, %0\n\t"
+ "cmpl %0,%1\n\t"
+ "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
+ "jne 1f\n\t"
+ LOCK_PREFIX "cmpxchgl %1,%2\n\t"
+ "1:"
+ "sete %b1\n\t"
+ "movzbl %b1,%0\n\t"
+ : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
+ :
+ : "memory", "cc");
+
+ return tmp;
+}
+
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
+{
+ asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
+ : "+m" (lock->slock)
+ :
+ : "memory", "cc");
+}
+#endif
+
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
+{
+ int tmp = ACCESS_ONCE(lock->slock);
+
+ return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
+}
+
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
+{
+ int tmp = ACCESS_ONCE(lock->slock);
+
+ return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
+}
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Define virtualization-friendly old-style lock byte lock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+ s8 lock;
+ s8 spinners;
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (bl->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ smp_wmb();
+ bl->lock = 0;
+}
+#else /* !CONFIG_PARAVIRT */
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_locked(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_contended(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+ __ticket_spin_lock(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __ticket_spin_trylock(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __ticket_spin_unlock(lock);
+}
+
+static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
+ unsigned long flags)
+{
+ __raw_spin_lock(lock);
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+{
+ while (__raw_spin_is_locked(lock))
+ cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+{
+ return (int)(lock)->lock > 0;
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+{
+ return (lock)->lock == RW_LOCK_BIAS;
+}
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+ "jns 1f\n"
+ "call __read_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void __raw_write_lock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+ "jz 1f\n"
+ "call __write_lock_failed\n\t"
+ "1:\n"
+ ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+}
+
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+
+ atomic_dec(count);
+ if (atomic_read(count) >= 0)
+ return 1;
+ atomic_inc(count);
+ return 0;
+}
+
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
+{
+ atomic_t *count = (atomic_t *)lock;
+
+ if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ return 1;
+ atomic_add(RW_LOCK_BIAS, count);
+ return 0;
+}
+
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+ asm volatile(LOCK_PREFIX "addl %1, %0"
+ : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define _raw_spin_relax(lock) cpu_relax()
+#define _raw_read_relax(lock) cpu_relax()
+#define _raw_write_relax(lock) cpu_relax()
+
+#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
new file mode 100644
index 000000000000..845f81c87091
--- /dev/null
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct raw_spinlock {
+ unsigned int slock;
+} raw_spinlock_t;
+
+#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
+
+typedef struct {
+ unsigned int lock;
+} raw_rwlock_t;
+
+#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
new file mode 100644
index 000000000000..b508d639d1a7
--- /dev/null
+++ b/arch/x86/include/asm/srat.h
@@ -0,0 +1,39 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+
+#ifndef _ASM_X86_SRAT_H
+#define _ASM_X86_SRAT_H
+
+#ifdef CONFIG_ACPI_NUMA
+extern int get_memcfg_from_srat(void);
+#else
+static inline int get_memcfg_from_srat(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
new file mode 100644
index 000000000000..f517944b2b17
--- /dev/null
+++ b/arch/x86/include/asm/stacktrace.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_STACKTRACE_H
+#define _ASM_X86_STACKTRACE_H
+
+extern int kstack_depth_to_print;
+
+/* Generic stack tracer with callbacks */
+
+struct stacktrace_ops {
+ void (*warning)(void *data, char *msg);
+ /* msg must contain %s for the symbol */
+ void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
+ void (*address)(void *data, unsigned long address, int reliable);
+ /* On negative return stop dumping */
+ int (*stack)(void *data, char *name);
+};
+
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data);
+
+#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h
new file mode 100644
index 000000000000..e0b1d9bbcbc6
--- /dev/null
+++ b/arch/x86/include/asm/stat.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_STAT_H
+#define _ASM_X86_STAT_H
+
+#define STAT_HAVE_NSEC 1
+
+#ifdef __i386__
+struct stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned long st_rdev;
+ unsigned long st_size;
+ unsigned long st_blksize;
+ unsigned long st_blocks;
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+ unsigned long st_mtime;
+ unsigned long st_mtime_nsec;
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+
+/* This matches struct stat64 in glibc2.1, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+ unsigned long __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned long st_uid;
+ unsigned long st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned long st_blksize;
+
+ /* Number 512-byte blocks allocated. */
+ unsigned long long st_blocks;
+
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+
+ unsigned long st_mtime;
+ unsigned int st_mtime_nsec;
+
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+
+ unsigned long long st_ino;
+};
+
+#else /* __i386__ */
+
+struct stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned long st_nlink;
+
+ unsigned int st_mode;
+ unsigned int st_uid;
+ unsigned int st_gid;
+ unsigned int __pad0;
+ unsigned long st_rdev;
+ long st_size;
+ long st_blksize;
+ long st_blocks; /* Number 512-byte blocks allocated. */
+
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+ unsigned long st_mtime;
+ unsigned long st_mtime_nsec;
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+ long __unused[3];
+};
+#endif
+
+/* for 32bit emulation and 32 bit kernels */
+struct __old_kernel_stat {
+ unsigned short st_dev;
+ unsigned short st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned short st_rdev;
+#ifdef __i386__
+ unsigned long st_size;
+ unsigned long st_atime;
+ unsigned long st_mtime;
+ unsigned long st_ctime;
+#else
+ unsigned int st_size;
+ unsigned int st_atime;
+ unsigned int st_mtime;
+ unsigned int st_ctime;
+#endif
+};
+
+#endif /* _ASM_X86_STAT_H */
diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/asm/statfs.h
new file mode 100644
index 000000000000..2d0adbf99a8e
--- /dev/null
+++ b/arch/x86/include/asm/statfs.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_STATFS_H
+#define _ASM_X86_STATFS_H
+
+/*
+ * We need compat_statfs64 to be packed, because the i386 ABI won't
+ * add padding at the end to bring it to a multiple of 8 bytes, but
+ * the x86_64 ABI will.
+ */
+#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
+
+#include <asm-generic/statfs.h>
+#endif /* _ASM_X86_STATFS_H */
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
new file mode 100644
index 000000000000..6dfd6d9373a0
--- /dev/null
+++ b/arch/x86/include/asm/string.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "string_32.h"
+#else
+# include "string_64.h"
+#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
new file mode 100644
index 000000000000..0e0e3ba827f7
--- /dev/null
+++ b/arch/x86/include/asm/string_32.h
@@ -0,0 +1,326 @@
+#ifndef _ASM_X86_STRING_32_H
+#define _ASM_X86_STRING_32_H
+
+#ifdef __KERNEL__
+
+/* Let gcc decide whether to inline or use the out of line functions */
+
+#define __HAVE_ARCH_STRCPY
+extern char *strcpy(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCPY
+extern char *strncpy(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCAT
+extern char *strcat(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCAT
+extern char *strncat(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCMP
+extern int strcmp(const char *cs, const char *ct);
+
+#define __HAVE_ARCH_STRNCMP
+extern int strncmp(const char *cs, const char *ct, size_t count);
+
+#define __HAVE_ARCH_STRCHR
+extern char *strchr(const char *s, int c);
+
+#define __HAVE_ARCH_STRLEN
+extern size_t strlen(const char *s);
+
+static __always_inline void *__memcpy(void *to, const void *from, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t"
+ "rep ; movsb\n\t"
+ "1:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/*
+ * This looks ugly, but the compiler can optimize it totally,
+ * as the count is constant.
+ */
+static __always_inline void *__constant_memcpy(void *to, const void *from,
+ size_t n)
+{
+ long esi, edi;
+ if (!n)
+ return to;
+
+ switch (n) {
+ case 1:
+ *(char *)to = *(char *)from;
+ return to;
+ case 2:
+ *(short *)to = *(short *)from;
+ return to;
+ case 4:
+ *(int *)to = *(int *)from;
+ return to;
+
+ case 3:
+ *(short *)to = *(short *)from;
+ *((char *)to + 2) = *((char *)from + 2);
+ return to;
+ case 5:
+ *(int *)to = *(int *)from;
+ *((char *)to + 4) = *((char *)from + 4);
+ return to;
+ case 6:
+ *(int *)to = *(int *)from;
+ *((short *)to + 2) = *((short *)from + 2);
+ return to;
+ case 8:
+ *(int *)to = *(int *)from;
+ *((int *)to + 1) = *((int *)from + 1);
+ return to;
+ }
+
+ esi = (long)from;
+ edi = (long)to;
+ if (n >= 5 * 4) {
+ /* large block: use rep prefix */
+ int ecx;
+ asm volatile("rep ; movsl"
+ : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+ : "0" (n / 4), "1" (edi), "2" (esi)
+ : "memory"
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 3 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 2 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 1 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ }
+ switch (n % 4) {
+ /* tail */
+ case 0:
+ return to;
+ case 1:
+ asm volatile("movsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ case 2:
+ asm volatile("movsw"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ default:
+ asm volatile("movsw\n\tmovsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ }
+}
+
+#define __HAVE_ARCH_MEMCPY
+
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+/*
+ * This CPU favours 3DNow strongly (eg AMD Athlon)
+ */
+
+static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __constant_memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+static inline void *__memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+#define memcpy(t, f, n) \
+ (__builtin_constant_p((n)) \
+ ? __constant_memcpy3d((t), (f), (n)) \
+ : __memcpy3d((t), (f), (n)))
+
+#else
+
+/*
+ * No 3D Now!
+ */
+
+#define memcpy(t, f, n) \
+ (__builtin_constant_p((n)) \
+ ? __constant_memcpy((t), (f), (n)) \
+ : __memcpy((t), (f), (n)))
+
+#endif
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t n);
+
+#define memcmp __builtin_memcmp
+
+#define __HAVE_ARCH_MEMCHR
+extern void *memchr(const void *cs, int c, size_t count);
+
+static inline void *__memset_generic(void *s, char c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosb"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "1" (s), "0" (count)
+ : "memory");
+ return s;
+}
+
+/* we might want to write optimized versions of these later */
+#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
+
+/*
+ * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
+ * things 32 bits at a time even when we don't know the size of the
+ * area at compile-time..
+ */
+static __always_inline
+void *__constant_c_memset(void *s, unsigned long c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep ; stosl\n\t"
+ "testb $2,%b3\n\t"
+ "je 1f\n\t"
+ "stosw\n"
+ "1:\ttestb $1,%b3\n\t"
+ "je 2f\n\t"
+ "stosb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
+ : "memory");
+ return s;
+}
+
+/* Added by Gertjan van Wingerde to make minix and sysv module work */
+#define __HAVE_ARCH_STRNLEN
+extern size_t strnlen(const char *s, size_t count);
+/* end of additional stuff */
+
+#define __HAVE_ARCH_STRSTR
+extern char *strstr(const char *cs, const char *ct);
+
+/*
+ * This looks horribly ugly, but the compiler can optimize it totally,
+ * as we by now know that both pattern and count is constant..
+ */
+static __always_inline
+void *__constant_c_and_count_memset(void *s, unsigned long pattern,
+ size_t count)
+{
+ switch (count) {
+ case 0:
+ return s;
+ case 1:
+ *(unsigned char *)s = pattern & 0xff;
+ return s;
+ case 2:
+ *(unsigned short *)s = pattern & 0xffff;
+ return s;
+ case 3:
+ *(unsigned short *)s = pattern & 0xffff;
+ *((unsigned char *)s + 2) = pattern & 0xff;
+ return s;
+ case 4:
+ *(unsigned long *)s = pattern;
+ return s;
+ }
+
+#define COMMON(x) \
+ asm volatile("rep ; stosl" \
+ x \
+ : "=&c" (d0), "=&D" (d1) \
+ : "a" (eax), "0" (count/4), "1" ((long)s) \
+ : "memory")
+
+ {
+ int d0, d1;
+#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
+ /* Workaround for broken gcc 4.0 */
+ register unsigned long eax asm("%eax") = pattern;
+#else
+ unsigned long eax = pattern;
+#endif
+
+ switch (count % 4) {
+ case 0:
+ COMMON("");
+ return s;
+ case 1:
+ COMMON("\n\tstosb");
+ return s;
+ case 2:
+ COMMON("\n\tstosw");
+ return s;
+ default:
+ COMMON("\n\tstosw\n\tstosb");
+ return s;
+ }
+ }
+
+#undef COMMON
+}
+
+#define __constant_c_x_memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_c_and_count_memset((s), (c), (count)) \
+ : __constant_c_memset((s), (c), (count)))
+
+#define __memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_count_memset((s), (c), (count)) \
+ : __memset_generic((s), (c), (count)))
+
+#define __HAVE_ARCH_MEMSET
+#define memset(s, c, count) \
+ (__builtin_constant_p(c) \
+ ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
+ (count)) \
+ : __memset((s), (c), (count)))
+
+/*
+ * find the first occurrence of byte 'c', or 1 past the area if none
+ */
+#define __HAVE_ARCH_MEMSCAN
+extern void *memscan(void *addr, int c, size_t size);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
new file mode 100644
index 000000000000..2afe164bf1e6
--- /dev/null
+++ b/arch/x86/include/asm/string_64.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_X86_STRING_64_H
+#define _ASM_X86_STRING_64_H
+
+#ifdef __KERNEL__
+
+/* Written 2002 by Andi Kleen */
+
+/* Only used for special circumstances. Stolen from i386/string.h */
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
+{
+ unsigned long d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "testb $2,%b4\n\t"
+ "je 1f\n\t"
+ "movsw\n"
+ "1:\ttestb $1,%b4\n\t"
+ "je 2f\n\t"
+ "movsb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/* Even with __builtin_ the compiler may decide to use the out of line
+ function. */
+
+#define __HAVE_ARCH_MEMCPY 1
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
+extern void *memcpy(void *to, const void *from, size_t len);
+#else
+extern void *__memcpy(void *to, const void *from, size_t len);
+#define memcpy(dst, src, len) \
+({ \
+ size_t __len = (len); \
+ void *__ret; \
+ if (__builtin_constant_p(len) && __len >= 64) \
+ __ret = __memcpy((dst), (src), __len); \
+ else \
+ __ret = __builtin_memcpy((dst), (src), __len); \
+ __ret; \
+})
+#endif
+
+#define __HAVE_ARCH_MEMSET
+void *memset(void *s, int c, size_t n);
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t count);
+
+int memcmp(const void *cs, const void *ct, size_t count);
+size_t strlen(const char *s);
+char *strcpy(char *dest, const char *src);
+char *strcat(char *dest, const char *src);
+int strcmp(const char *cs, const char *ct);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
new file mode 100644
index 000000000000..9b3070f1c2ac
--- /dev/null
+++ b/arch/x86/include/asm/summit/apic.h
@@ -0,0 +1,184 @@
+#ifndef __ASM_SUMMIT_APIC_H
+#define __ASM_SUMMIT_APIC_H
+
+#include <asm/smp.h>
+
+#define esr_disable (1)
+#define NO_BALANCE_IRQ (0)
+
+/* In clustered mode, the high nibble of APIC ID is a cluster number.
+ * The low nibble is a 4-bit bitmap. */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+
+#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static inline cpumask_t target_cpus(void)
+{
+ /* CPU_MASK_ALL (0xff) has undefined behaviour with
+ * dest_LowestPrio mode logical clustered apic interrupt routing
+ * Just start on cpu 0. IRQ balancing will spread load
+ */
+ return cpumask_of_cpu(0);
+}
+
+#define INT_DELIVERY_MODE (dest_LowestPrio)
+#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
+
+static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
+{
+ return 0;
+}
+
+/* we don't use the phys_cpu_present_map to indicate apicid presence */
+static inline unsigned long check_apicid_present(int bit)
+{
+ return 1;
+}
+
+#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+
+extern u8 cpu_2_logical_apicid[];
+
+static inline void init_apic_ldr(void)
+{
+ unsigned long val, id;
+ int count = 0;
+ u8 my_id = (u8)hard_smp_processor_id();
+ u8 my_cluster = (u8)apicid_cluster(my_id);
+#ifdef CONFIG_SMP
+ u8 lid;
+ int i;
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = NR_CPUS; --i >= 0; ) {
+ lid = cpu_2_logical_apicid[i];
+ if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
+ ++count;
+ }
+#endif
+ /* We only have a 4 wide bitmap in cluster mode. If a deranged
+ * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
+ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
+ id = my_cluster | (1UL << count);
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static inline int multi_timer_check(int apic, int irq)
+{
+ return 0;
+}
+
+static inline int apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void setup_apic_routing(void)
+{
+ printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static inline int apicid_to_node(int logical_apicid)
+{
+#ifdef CONFIG_SMP
+ return apicid_2_node[hard_smp_processor_id()];
+#else
+ return 0;
+#endif
+}
+
+/* Mapping from cpu number to logical apicid */
+static inline int cpu_to_logical_apicid(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (cpu >= NR_CPUS)
+ return BAD_APICID;
+ return (int)cpu_2_logical_apicid[cpu];
+#else
+ return logical_smp_processor_id();
+#endif
+}
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < NR_CPUS)
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ return physids_promote(0x0F);
+}
+
+static inline physid_mask_t apicid_to_cpu_present(int apicid)
+{
+ return physid_mask_of_physid(0);
+}
+
+static inline void setup_portio_remap(void)
+{
+}
+
+static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
+{
+ return 1;
+}
+
+static inline void enable_apic_mode(void)
+{
+}
+
+static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int num_bits_set;
+ int cpus_found = 0;
+ int cpu;
+ int apicid;
+
+ num_bits_set = cpus_weight(cpumask);
+ /* Return id to all */
+ if (num_bits_set == NR_CPUS)
+ return (int) 0xFF;
+ /*
+ * The cpus in the mask must all be on the apic cluster. If are not
+ * on the same apicid cluster return default value of TARGET_CPUS.
+ */
+ cpu = first_cpu(cpumask);
+ apicid = cpu_to_logical_apicid(cpu);
+ while (cpus_found < num_bits_set) {
+ if (cpu_isset(cpu, cpumask)) {
+ int new_apicid = cpu_to_logical_apicid(cpu);
+ if (apicid_cluster(apicid) !=
+ apicid_cluster(new_apicid)){
+ printk ("%s: Not a valid mask!\n", __func__);
+ return 0xFF;
+ }
+ apicid = apicid | new_apicid;
+ cpus_found++;
+ }
+ cpu++;
+ }
+ return apicid;
+}
+
+/* cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+#endif /* __ASM_SUMMIT_APIC_H */
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
new file mode 100644
index 000000000000..f3fbca1f61c1
--- /dev/null
+++ b/arch/x86/include/asm/summit/apicdef.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_SUMMIT_APICDEF_H
+#define __ASM_SUMMIT_APICDEF_H
+
+#define APIC_ID_MASK (0xFF<<24)
+
+static inline unsigned get_apic_id(unsigned long x)
+{
+ return (x>>24)&0xFF;
+}
+
+#define GET_APIC_ID(x) get_apic_id(x)
+
+#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
new file mode 100644
index 000000000000..53bd1e7bd7b4
--- /dev/null
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_SUMMIT_IPI_H
+#define __ASM_SUMMIT_IPI_H
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector);
+
+static inline void send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static inline void send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, vector);
+}
+
+static inline void send_IPI_all(int vector)
+{
+ send_IPI_mask(cpu_online_map, vector);
+}
+
+#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
new file mode 100644
index 000000000000..013ce6fab2d5
--- /dev/null
+++ b/arch/x86/include/asm/summit/mpparse.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_SUMMIT_MPPARSE_H
+#define __ASM_SUMMIT_MPPARSE_H
+
+#include <asm/tsc.h>
+
+extern int use_cyclone;
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+extern void setup_summit(void);
+#else
+#define setup_summit() {}
+#endif
+
+static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
+ char *productid)
+{
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(productid, "VIGIL SMP", 9)
+ || !strncmp(productid, "EXA", 3)
+ || !strncmp(productid, "RUTHLESS SMP", 12))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) &&
+ (!strncmp(oem_table_id, "SERVIGIL", 8)
+ || !strncmp(oem_table_id, "EXA", 3))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+struct rio_table_hdr {
+ unsigned char version; /* Version number of this data structure */
+ /* Version 3 adds chassis_num & WP_index */
+ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
+ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
+} __attribute__((packed));
+
+struct scal_detail {
+ unsigned char node_id; /* Scalability Node ID */
+ unsigned long CBAR; /* Address of 1MB register space */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port2node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ unsigned char node_id; /* RIO Node ID */
+ unsigned long BBAR; /* Address of 1MB register space */
+ unsigned char type; /* Type of device */
+ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
+ /* For CYC: Node ID of Twister that owns this CYC */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
+ /* For CYC: 0 */
+ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie:*/
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ /* For CYC: Bits0:7 Reserved */
+ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ /* For CYC: No meaning */
+ unsigned char chassis_num; /* 1 based Chassis number */
+ /* For LookOut WPEGs this field indicates the */
+ /* Expansion Chassis #, enumerated from Boot */
+ /* Node WPEG external port, then Boot Node CYC */
+ /* external port, then Next Vigil chassis WPEG */
+ /* external port, etc. */
+ /* Shared Lookouts have only 1 chassis number (the */
+ /* first one assigned) */
+} __attribute__((packed));
+
+
+typedef enum {
+ CompatTwister = 0, /* Compatibility Twister */
+ AltTwister = 1, /* Alternate Twister of internal 8-way */
+ CompatCyclone = 2, /* Compatibility Cyclone */
+ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
+ CompatWPEG = 4, /* Compatibility WPEG */
+ AltWPEG = 5, /* Second Planar WPEG */
+ LookOutAWPEG = 6, /* LookOut WPEG */
+ LookOutBWPEG = 7, /* LookOut WPEG */
+} node_type;
+
+static inline int is_WPEG(struct rio_detail *rio){
+ return (rio->type == CompatWPEG || rio->type == AltWPEG ||
+ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
+}
+
+#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
new file mode 100644
index 000000000000..9bd521fe4570
--- /dev/null
+++ b/arch/x86/include/asm/suspend.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "suspend_32.h"
+#else
+# include "suspend_64.h"
+#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
new file mode 100644
index 000000000000..a5074bd0f8be
--- /dev/null
+++ b/arch/x86/include/asm/suspend_32.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_32_H
+#define _ASM_X86_SUSPEND_32_H
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+
+static inline int arch_prepare_suspend(void) { return 0; }
+
+/* image of the saved processor state */
+struct saved_context {
+ u16 es, fs, gs, ss;
+ unsigned long cr0, cr2, cr3, cr4;
+ struct desc_ptr gdt;
+ struct desc_ptr idt;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+} __attribute__((packed));
+
+#ifdef CONFIG_ACPI
+extern unsigned long saved_eip;
+extern unsigned long saved_esp;
+extern unsigned long saved_ebp;
+extern unsigned long saved_ebx;
+extern unsigned long saved_esi;
+extern unsigned long saved_edi;
+
+static inline void acpi_save_register_state(unsigned long return_point)
+{
+ saved_eip = return_point;
+ asm volatile("movl %%esp,%0" : "=m" (saved_esp));
+ asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
+ asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
+ asm volatile("movl %%edi,%0" : "=m" (saved_edi));
+ asm volatile("movl %%esi,%0" : "=m" (saved_esi));
+}
+
+#define acpi_restore_register_state() do {} while (0)
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+#endif
+
+#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
new file mode 100644
index 000000000000..06284f42b759
--- /dev/null
+++ b/arch/x86/include/asm/suspend_64.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_64_H
+#define _ASM_X86_SUSPEND_64_H
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+
+static inline int arch_prepare_suspend(void)
+{
+ return 0;
+}
+
+/*
+ * Image of the saved processor state, used by the low level ACPI suspend to
+ * RAM code and by the low level hibernation code.
+ *
+ * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
+ * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
+ * still work as required.
+ */
+struct saved_context {
+ struct pt_regs regs;
+ u16 ds, es, fs, gs, ss;
+ unsigned long gs_base, gs_kernel_base, fs_base;
+ unsigned long cr0, cr2, cr3, cr4, cr8;
+ unsigned long efer;
+ u16 gdt_pad;
+ u16 gdt_limit;
+ unsigned long gdt_base;
+ u16 idt_pad;
+ u16 idt_limit;
+ unsigned long idt_base;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+} __attribute__((packed));
+
+#define loaddebug(thread,register) \
+ set_debugreg((thread)->debugreg##register, register)
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+extern char core_restore_code;
+extern char restore_registers;
+
+#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
new file mode 100644
index 000000000000..51fb2c76ad74
--- /dev/null
+++ b/arch/x86/include/asm/swiotlb.h
@@ -0,0 +1,58 @@
+#ifndef _ASM_X86_SWIOTLB_H
+#define _ASM_X86_SWIOTLB_H
+
+#include <asm/dma-mapping.h>
+
+/* SWIOTLB interface */
+
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+ size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+ dma_addr_t dev_addr,
+ unsigned long offset,
+ size_t size, int dir);
+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
+ struct scatterlist *sg, int nelems,
+ int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+ int nents, int direction);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);
+
+extern int swiotlb_force;
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern void pci_swiotlb_init(void);
+#else
+#define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
+#endif
+
+static inline void dma_mark_clean(void *addr, size_t size) {}
+
+#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
new file mode 100644
index 000000000000..9d09b4073b60
--- /dev/null
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -0,0 +1,130 @@
+#ifndef _ASM_X86_SYNC_BITOPS_H
+#define _ASM_X86_SYNC_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define ADDR (*(volatile long *)addr)
+
+/**
+ * sync_set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_set_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btsl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * sync_clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btrl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * sync_change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btcl %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * sync_test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * sync_test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
+ : "=r" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+#define sync_test_bit(nr, addr) test_bit(nr, addr)
+
+#undef ADDR
+
+#endif /* _ASM_X86_SYNC_BITOPS_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
new file mode 100644
index 000000000000..d82f39bb7905
--- /dev/null
+++ b/arch/x86/include/asm/syscall.h
@@ -0,0 +1,213 @@
+/*
+ * Access to user system call parameters and results
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * See asm-generic/syscall.h for descriptions of what we must do here.
+ */
+
+#ifndef _ASM_X86_SYSCALL_H
+#define _ASM_X86_SYSCALL_H
+
+#include <linux/sched.h>
+#include <linux/err.h>
+
+static inline long syscall_get_nr(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ /*
+ * We always sign-extend a -1 value being set here,
+ * so this is always either -1L or a syscall number.
+ */
+ return regs->orig_ax;
+}
+
+static inline void syscall_rollback(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ regs->ax = regs->orig_ax;
+}
+
+static inline long syscall_get_error(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ unsigned long error = regs->ax;
+#ifdef CONFIG_IA32_EMULATION
+ /*
+ * TS_COMPAT is set for 32-bit syscall entries and then
+ * remains set until we return to user mode.
+ */
+ if (task_thread_info(task)->status & TS_COMPAT)
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ error = (long) (int) error;
+#endif
+ return IS_ERR_VALUE(error) ? error : 0;
+}
+
+static inline long syscall_get_return_value(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+static inline void syscall_set_return_value(struct task_struct *task,
+ struct pt_regs *regs,
+ int error, long val)
+{
+ regs->ax = (long) error ?: val;
+}
+
+#ifdef CONFIG_X86_32
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(args, &regs->bx + i, n * sizeof(args[0]));
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(&regs->bx + i, args, n * sizeof(args[0]));
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task_thread_info(task)->status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->bx;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->cx;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->bp;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->r10;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->r8;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->r9;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task_thread_info(task)->status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->bx = *args++;
+ case 1:
+ if (!n--) break;
+ regs->cx = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->si = *args++;
+ case 4:
+ if (!n--) break;
+ regs->di = *args++;
+ case 5:
+ if (!n--) break;
+ regs->bp = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->di = *args++;
+ case 1:
+ if (!n--) break;
+ regs->si = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->r10 = *args++;
+ case 4:
+ if (!n--) break;
+ regs->r8 = *args++;
+ case 5:
+ if (!n--) break;
+ regs->r9 = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
new file mode 100644
index 000000000000..9c6797c3e56c
--- /dev/null
+++ b/arch/x86/include/asm/syscalls.h
@@ -0,0 +1,93 @@
+/*
+ * syscalls.h - Linux syscall interfaces (arch-specific)
+ *
+ * Copyright (c) 2008 Jaswinder Singh
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYSCALLS_H
+#define _ASM_X86_SYSCALLS_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+
+/* Common in X86_32 and X86_64 */
+/* kernel/ioport.c */
+asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+
+/* kernel/ldt.c */
+asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+
+/* kernel/tls.c */
+asmlinkage int sys_set_thread_area(struct user_desc __user *);
+asmlinkage int sys_get_thread_area(struct user_desc __user *);
+
+/* X86_32 only */
+#ifdef CONFIG_X86_32
+/* kernel/process_32.c */
+asmlinkage int sys_fork(struct pt_regs);
+asmlinkage int sys_clone(struct pt_regs);
+asmlinkage int sys_vfork(struct pt_regs);
+asmlinkage int sys_execve(struct pt_regs);
+
+/* kernel/signal_32.c */
+asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
+asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
+ struct old_sigaction __user *);
+asmlinkage int sys_sigaltstack(unsigned long);
+asmlinkage unsigned long sys_sigreturn(unsigned long);
+asmlinkage int sys_rt_sigreturn(struct pt_regs);
+
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned long);
+
+/* kernel/sys_i386_32.c */
+asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+struct mmap_arg_struct;
+asmlinkage int old_mmap(struct mmap_arg_struct __user *);
+struct sel_arg_struct;
+asmlinkage int old_select(struct sel_arg_struct __user *);
+asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
+struct old_utsname;
+asmlinkage int sys_uname(struct old_utsname __user *);
+struct oldold_utsname;
+asmlinkage int sys_olduname(struct oldold_utsname __user *);
+
+/* kernel/vm86_32.c */
+asmlinkage int sys_vm86old(struct pt_regs);
+asmlinkage int sys_vm86(struct pt_regs);
+
+#else /* CONFIG_X86_32 */
+
+/* X86_64 only */
+/* kernel/process_64.c */
+asmlinkage long sys_fork(struct pt_regs *);
+asmlinkage long sys_clone(unsigned long, unsigned long,
+ void __user *, void __user *,
+ struct pt_regs *);
+asmlinkage long sys_vfork(struct pt_regs *);
+asmlinkage long sys_execve(char __user *, char __user * __user *,
+ char __user * __user *,
+ struct pt_regs *);
+
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
+/* kernel/signal_64.c */
+asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
+ struct pt_regs *);
+asmlinkage long sys_rt_sigreturn(struct pt_regs *);
+
+/* kernel/sys_x86_64.c */
+asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+struct new_utsname;
+asmlinkage long sys_uname(struct new_utsname __user *);
+
+#endif /* CONFIG_X86_32 */
+#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
new file mode 100644
index 000000000000..2f6340a44291
--- /dev/null
+++ b/arch/x86/include/asm/system.h
@@ -0,0 +1,431 @@
+#ifndef _ASM_X86_SYSTEM_H
+#define _ASM_X86_SYSTEM_H
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+
+#ifdef CONFIG_X86_32
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) \
+do { \
+ /* \
+ * Context-switching clobbers all registers, so we clobber \
+ * them explicitly, via unused output variables. \
+ * (EAX and EBP is not listed because EBP is saved/restored \
+ * explicitly for wchan access and EAX is the return value of \
+ * __switch_to()) \
+ */ \
+ unsigned long ebx, ecx, edx, esi, edi; \
+ \
+ asm volatile("pushfl\n\t" /* save flags */ \
+ "pushl %%ebp\n\t" /* save EBP */ \
+ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
+ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
+ "pushl %[next_ip]\n\t" /* restore EIP */ \
+ "jmp __switch_to\n" /* regparm call */ \
+ "1:\t" \
+ "popl %%ebp\n\t" /* restore EBP */ \
+ "popfl\n" /* restore flags */ \
+ \
+ /* output parameters */ \
+ : [prev_sp] "=m" (prev->thread.sp), \
+ [prev_ip] "=m" (prev->thread.ip), \
+ "=a" (last), \
+ \
+ /* clobbered output registers: */ \
+ "=b" (ebx), "=c" (ecx), "=d" (edx), \
+ "=S" (esi), "=D" (edi) \
+ \
+ /* input parameters: */ \
+ : [next_sp] "m" (next->thread.sp), \
+ [next_ip] "m" (next->thread.ip), \
+ \
+ /* regparm parameters for __switch_to(): */ \
+ [prev] "a" (prev), \
+ [next] "d" (next) \
+ \
+ : /* reloaded segment registers */ \
+ "memory"); \
+} while (0)
+
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#else
+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER \
+ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+ "r12", "r13", "r14", "r15"
+
+/* Save restore flags to clear handle leaking NT */
+#define switch_to(prev, next, last) \
+ asm volatile(SAVE_CONTEXT \
+ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
+ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
+ "call __switch_to\n\t" \
+ ".globl thread_return\n" \
+ "thread_return:\n\t" \
+ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
+ "movq %P[task_canary](%%rsi),%%r8\n\t" \
+ "movq %%r8,%%gs:%P[pda_canary]\n\t" \
+ "movq %P[thread_info](%%rsi),%%r8\n\t" \
+ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "movq %%rax,%%rdi\n\t" \
+ "jc ret_from_fork\n\t" \
+ RESTORE_CONTEXT \
+ : "=a" (last) \
+ : [next] "S" (next), [prev] "D" (prev), \
+ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+ [ti_flags] "i" (offsetof(struct thread_info, flags)), \
+ [tif_fork] "i" (TIF_FORK), \
+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
+ [task_canary] "i" (offsetof(struct task_struct, stack_canary)),\
+ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)), \
+ [pda_canary] "i" (offsetof(struct x8664_pda, stack_canary))\
+ : "memory", "cc" __EXTRA_CLOBBER)
+#endif
+
+#ifdef __KERNEL__
+#define _set_base(addr, base) do { unsigned long __pr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %%dl,%2\n\t" \
+ "movb %%dh,%3" \
+ :"=&d" (__pr) \
+ :"m" (*((addr)+2)), \
+ "m" (*((addr)+4)), \
+ "m" (*((addr)+7)), \
+ "0" (base) \
+ ); } while (0)
+
+#define _set_limit(addr, limit) do { unsigned long __lr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+ "rorl $16,%%edx\n\t" \
+ "movb %2,%%dh\n\t" \
+ "andb $0xf0,%%dh\n\t" \
+ "orb %%dh,%%dl\n\t" \
+ "movb %%dl,%2" \
+ :"=&d" (__lr) \
+ :"m" (*(addr)), \
+ "m" (*((addr)+6)), \
+ "0" (limit) \
+ ); } while (0)
+
+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
+
+extern void native_load_gs_index(unsigned);
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value) \
+ asm volatile("\n" \
+ "1:\t" \
+ "movl %k0,%%" #seg "\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:\t" \
+ "movl %k1, %%" #seg "\n\t" \
+ "jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b,3b) \
+ : :"r" (value), "r" (0) : "memory")
+
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+ unsigned long __limit;
+ asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ return __limit + 1;
+}
+
+static inline void native_clts(void)
+{
+ asm volatile("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+ asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+ unsigned long val;
+ /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
+ * exists, so it will never fail. */
+#ifdef CONFIG_X86_32
+ asm volatile("1: mov %%cr4, %0\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b)
+ : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+ val = native_read_cr4();
+#endif
+ return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define read_cr0() (native_read_cr0())
+#define write_cr0(x) (native_write_cr0(x))
+#define read_cr2() (native_read_cr2())
+#define write_cr2(x) (native_write_cr2(x))
+#define read_cr3() (native_read_cr3())
+#define write_cr3(x) (native_write_cr3(x))
+#define read_cr4() (native_read_cr4())
+#define read_cr4_safe() (native_read_cr4_safe())
+#define write_cr4(x) (native_write_cr4(x))
+#define wbinvd() (native_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8() (native_read_cr8())
+#define write_cr8(x) (native_write_cr8(x))
+#define load_gs_index native_load_gs_index
+#endif
+
+/* Clear the 'TS' bit */
+#define clts() (native_clts())
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(read_cr0() | X86_CR0_TS)
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+
+void stop_this_cpu(void *dummy);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+#else
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier. All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data return by
+ * any of the preceding reads. This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies. See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * b = 2;
+ * memory_barrier();
+ * p = &b; q = p;
+ * read_barrier_depends();
+ * d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends(). However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ * CPU 0 CPU 1
+ *
+ * a = 2;
+ * memory_barrier();
+ * b = 3; y = b;
+ * read_barrier_depends();
+ * x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b". Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends() do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb() rmb()
+#else
+# define smp_rmb() barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb() wmb()
+#else
+# define smp_wmb() barrier()
+#endif
+#define smp_read_barrier_depends() read_barrier_depends()
+#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#define smp_read_barrier_depends() do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+ alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+ alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
+#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
new file mode 100644
index 000000000000..1159e091ad09
--- /dev/null
+++ b/arch/x86/include/asm/system_64.h
@@ -0,0 +1,22 @@
+#ifndef _ASM_X86_SYSTEM_64_H
+#define _ASM_X86_SYSTEM_64_H
+
+#include <asm/segment.h>
+#include <asm/cmpxchg.h>
+
+
+static inline unsigned long read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+
+#include <linux/irqflags.h>
+
+#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
new file mode 100644
index 000000000000..7a6677c1a715
--- /dev/null
+++ b/arch/x86/include/asm/tce.h
@@ -0,0 +1,48 @@
+/*
+ * This file is derived from asm-powerpc/tce.h.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_TCE_H
+#define _ASM_X86_TCE_H
+
+extern unsigned int specified_table_size;
+struct iommu_table;
+
+#define TCE_ENTRY_SIZE 8 /* in bytes */
+
+#define TCE_READ_SHIFT 0
+#define TCE_WRITE_SHIFT 1
+#define TCE_HUBID_SHIFT 2 /* unused */
+#define TCE_RSVD_SHIFT 8 /* unused */
+#define TCE_RPN_SHIFT 12
+#define TCE_UNUSED_SHIFT 48 /* unused */
+
+#define TCE_RPN_MASK 0x0000fffffffff000ULL
+
+extern void tce_build(struct iommu_table *tbl, unsigned long index,
+ unsigned int npages, unsigned long uaddr, int direction);
+extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
+extern void * __init alloc_tce_table(void);
+extern void __init free_tce_table(void *tbl);
+extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
+
+#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
new file mode 100644
index 000000000000..af1b70ea440f
--- /dev/null
+++ b/arch/x86/include/asm/termbits.h
@@ -0,0 +1,198 @@
+#ifndef _ASM_X86_TERMBITS_H
+#define _ASM_X86_TERMBITS_H
+
+#include <linux/posix_types.h>
+
+typedef unsigned char cc_t;
+typedef unsigned int speed_t;
+typedef unsigned int tcflag_t;
+
+#define NCCS 19
+struct termios {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+};
+
+struct termios2 {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+ speed_t c_ispeed; /* input speed */
+ speed_t c_ospeed; /* output speed */
+};
+
+struct ktermios {
+ tcflag_t c_iflag; /* input mode flags */
+ tcflag_t c_oflag; /* output mode flags */
+ tcflag_t c_cflag; /* control mode flags */
+ tcflag_t c_lflag; /* local mode flags */
+ cc_t c_line; /* line discipline */
+ cc_t c_cc[NCCS]; /* control characters */
+ speed_t c_ispeed; /* input speed */
+ speed_t c_ospeed; /* output speed */
+};
+
+/* c_cc characters */
+#define VINTR 0
+#define VQUIT 1
+#define VERASE 2
+#define VKILL 3
+#define VEOF 4
+#define VTIME 5
+#define VMIN 6
+#define VSWTC 7
+#define VSTART 8
+#define VSTOP 9
+#define VSUSP 10
+#define VEOL 11
+#define VREPRINT 12
+#define VDISCARD 13
+#define VWERASE 14
+#define VLNEXT 15
+#define VEOL2 16
+
+/* c_iflag bits */
+#define IGNBRK 0000001
+#define BRKINT 0000002
+#define IGNPAR 0000004
+#define PARMRK 0000010
+#define INPCK 0000020
+#define ISTRIP 0000040
+#define INLCR 0000100
+#define IGNCR 0000200
+#define ICRNL 0000400
+#define IUCLC 0001000
+#define IXON 0002000
+#define IXANY 0004000
+#define IXOFF 0010000
+#define IMAXBEL 0020000
+#define IUTF8 0040000
+
+/* c_oflag bits */
+#define OPOST 0000001
+#define OLCUC 0000002
+#define ONLCR 0000004
+#define OCRNL 0000010
+#define ONOCR 0000020
+#define ONLRET 0000040
+#define OFILL 0000100
+#define OFDEL 0000200
+#define NLDLY 0000400
+#define NL0 0000000
+#define NL1 0000400
+#define CRDLY 0003000
+#define CR0 0000000
+#define CR1 0001000
+#define CR2 0002000
+#define CR3 0003000
+#define TABDLY 0014000
+#define TAB0 0000000
+#define TAB1 0004000
+#define TAB2 0010000
+#define TAB3 0014000
+#define XTABS 0014000
+#define BSDLY 0020000
+#define BS0 0000000
+#define BS1 0020000
+#define VTDLY 0040000
+#define VT0 0000000
+#define VT1 0040000
+#define FFDLY 0100000
+#define FF0 0000000
+#define FF1 0100000
+
+/* c_cflag bit meaning */
+#define CBAUD 0010017
+#define B0 0000000 /* hang up */
+#define B50 0000001
+#define B75 0000002
+#define B110 0000003
+#define B134 0000004
+#define B150 0000005
+#define B200 0000006
+#define B300 0000007
+#define B600 0000010
+#define B1200 0000011
+#define B1800 0000012
+#define B2400 0000013
+#define B4800 0000014
+#define B9600 0000015
+#define B19200 0000016
+#define B38400 0000017
+#define EXTA B19200
+#define EXTB B38400
+#define CSIZE 0000060
+#define CS5 0000000
+#define CS6 0000020
+#define CS7 0000040
+#define CS8 0000060
+#define CSTOPB 0000100
+#define CREAD 0000200
+#define PARENB 0000400
+#define PARODD 0001000
+#define HUPCL 0002000
+#define CLOCAL 0004000
+#define CBAUDEX 0010000
+#define BOTHER 0010000 /* non standard rate */
+#define B57600 0010001
+#define B115200 0010002
+#define B230400 0010003
+#define B460800 0010004
+#define B500000 0010005
+#define B576000 0010006
+#define B921600 0010007
+#define B1000000 0010010
+#define B1152000 0010011
+#define B1500000 0010012
+#define B2000000 0010013
+#define B2500000 0010014
+#define B3000000 0010015
+#define B3500000 0010016
+#define B4000000 0010017
+#define CIBAUD 002003600000 /* input baud rate */
+#define CMSPAR 010000000000 /* mark or space (stick) parity */
+#define CRTSCTS 020000000000 /* flow control */
+
+#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
+
+/* c_lflag bits */
+#define ISIG 0000001
+#define ICANON 0000002
+#define XCASE 0000004
+#define ECHO 0000010
+#define ECHOE 0000020
+#define ECHOK 0000040
+#define ECHONL 0000100
+#define NOFLSH 0000200
+#define TOSTOP 0000400
+#define ECHOCTL 0001000
+#define ECHOPRT 0002000
+#define ECHOKE 0004000
+#define FLUSHO 0010000
+#define PENDIN 0040000
+#define IEXTEN 0100000
+
+/* tcflow() and TCXONC use these */
+#define TCOOFF 0
+#define TCOON 1
+#define TCIOFF 2
+#define TCION 3
+
+/* tcflush() and TCFLSH use these */
+#define TCIFLUSH 0
+#define TCOFLUSH 1
+#define TCIOFLUSH 2
+
+/* tcsetattr uses these */
+#define TCSANOW 0
+#define TCSADRAIN 1
+#define TCSAFLUSH 2
+
+#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
new file mode 100644
index 000000000000..f72956331c49
--- /dev/null
+++ b/arch/x86/include/asm/termios.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_TERMIOS_H
+#define _ASM_X86_TERMIOS_H
+
+#include <asm/termbits.h>
+#include <asm/ioctls.h>
+
+struct winsize {
+ unsigned short ws_row;
+ unsigned short ws_col;
+ unsigned short ws_xpixel;
+ unsigned short ws_ypixel;
+};
+
+#define NCC 8
+struct termio {
+ unsigned short c_iflag; /* input mode flags */
+ unsigned short c_oflag; /* output mode flags */
+ unsigned short c_cflag; /* control mode flags */
+ unsigned short c_lflag; /* local mode flags */
+ unsigned char c_line; /* line discipline */
+ unsigned char c_cc[NCC]; /* control characters */
+};
+
+/* modem lines */
+#define TIOCM_LE 0x001
+#define TIOCM_DTR 0x002
+#define TIOCM_RTS 0x004
+#define TIOCM_ST 0x008
+#define TIOCM_SR 0x010
+#define TIOCM_CTS 0x020
+#define TIOCM_CAR 0x040
+#define TIOCM_RNG 0x080
+#define TIOCM_DSR 0x100
+#define TIOCM_CD TIOCM_CAR
+#define TIOCM_RI TIOCM_RNG
+#define TIOCM_OUT1 0x2000
+#define TIOCM_OUT2 0x4000
+#define TIOCM_LOOP 0x8000
+
+/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
+
+#ifdef __KERNEL__
+
+#include <asm/uaccess.h>
+
+/* intr=^C quit=^\ erase=del kill=^U
+ eof=^D vtime=\0 vmin=\1 sxtc=\0
+ start=^Q stop=^S susp=^Z eol=\0
+ reprint=^R discard=^U werase=^W lnext=^V
+ eol2=\0
+*/
+#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
+
+/*
+ * Translate a "termio" structure into a "termios". Ugh.
+ */
+#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
+ unsigned short __tmp; \
+ get_user(__tmp,&(termio)->x); \
+ *(unsigned short *) &(termios)->x = __tmp; \
+}
+
+static inline int user_termio_to_kernel_termios(struct ktermios *termios,
+ struct termio __user *termio)
+{
+ SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
+ SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+ return copy_from_user(termios->c_cc, termio->c_cc, NCC);
+}
+
+/*
+ * Translate a "termios" structure into a "termio". Ugh.
+ */
+static inline int kernel_termios_to_user_termio(struct termio __user *termio,
+ struct ktermios *termios)
+{
+ put_user((termios)->c_iflag, &(termio)->c_iflag);
+ put_user((termios)->c_oflag, &(termio)->c_oflag);
+ put_user((termios)->c_cflag, &(termio)->c_cflag);
+ put_user((termios)->c_lflag, &(termio)->c_lflag);
+ put_user((termios)->c_line, &(termio)->c_line);
+ return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
+}
+
+static inline int user_termios_to_kernel_termios(struct ktermios *k,
+ struct termios2 __user *u)
+{
+ return copy_from_user(k, u, sizeof(struct termios2));
+}
+
+static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
+ struct ktermios *k)
+{
+ return copy_to_user(u, k, sizeof(struct termios2));
+}
+
+static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
+ struct termios __user *u)
+{
+ return copy_from_user(k, u, sizeof(struct termios));
+}
+
+static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
+ struct ktermios *k)
+{
+ return copy_to_user(u, k, sizeof(struct termios));
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
new file mode 100644
index 000000000000..c62349ee7860
--- /dev/null
+++ b/arch/x86/include/asm/therm_throt.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_THERM_THROT_H
+#define _ASM_X86_THERM_THROT_H
+
+#include <asm/atomic.h>
+
+extern atomic_t therm_throt_en;
+int therm_throt_process(int curr);
+
+#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
new file mode 100644
index 000000000000..98789647baa9
--- /dev/null
+++ b/arch/x86/include/asm/thread_info.h
@@ -0,0 +1,263 @@
+/* thread_info.h: low-level thread information
+ *
+ * Copyright (C) 2002 David Howells (dhowells@redhat.com)
+ * - Incorporating suggestions made by Linus Torvalds and Dave Miller
+ */
+
+#ifndef _ASM_X86_THREAD_INFO_H
+#define _ASM_X86_THREAD_INFO_H
+
+#include <linux/compiler.h>
+#include <asm/page.h>
+#include <asm/types.h>
+
+/*
+ * low level task data that entry.S needs immediate access to
+ * - this struct should fit entirely inside of one cache line
+ * - this struct shares the supervisor stack pages
+ */
+#ifndef __ASSEMBLY__
+struct task_struct;
+struct exec_domain;
+#include <asm/processor.h>
+#include <asm/ftrace.h>
+#include <asm/atomic.h>
+
+struct thread_info {
+ struct task_struct *task; /* main task structure */
+ struct exec_domain *exec_domain; /* execution domain */
+ __u32 flags; /* low level flags */
+ __u32 status; /* thread synchronous flags */
+ __u32 cpu; /* current CPU */
+ int preempt_count; /* 0 => preemptable,
+ <0 => BUG */
+ mm_segment_t addr_limit;
+ struct restart_block restart_block;
+ void __user *sysenter_return;
+#ifdef CONFIG_X86_32
+ unsigned long previous_esp; /* ESP of the previous stack in
+ case of nested (IRQ) stacks
+ */
+ __u8 supervisor_stack[0];
+#endif
+};
+
+#define INIT_THREAD_INFO(tsk) \
+{ \
+ .task = &tsk, \
+ .exec_domain = &default_exec_domain, \
+ .flags = 0, \
+ .cpu = 0, \
+ .preempt_count = 1, \
+ .addr_limit = KERNEL_DS, \
+ .restart_block = { \
+ .fn = do_no_restart_syscall, \
+ }, \
+}
+
+#define init_thread_info (init_thread_union.thread_info)
+#define init_stack (init_thread_union.stack)
+
+#else /* !__ASSEMBLY__ */
+
+#include <asm/asm-offsets.h>
+
+#endif
+
+/*
+ * thread information flags
+ * - these are process state flags that various assembly files
+ * may need to access
+ * - pending work-to-be-done flags are in LSW
+ * - other flags in MSW
+ * Warning: layout of LSW is hardcoded in entry.S
+ */
+#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
+#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
+#define TIF_SIGPENDING 2 /* signal pending */
+#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
+#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
+#define TIF_IRET 5 /* force IRET */
+#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
+#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
+#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_NOTSC 16 /* TSC is not accessible in userland */
+#define TIF_IA32 17 /* 32bit process */
+#define TIF_FORK 18 /* ret_from_fork */
+#define TIF_ABI_PENDING 19
+#define TIF_MEMDIE 20
+#define TIF_DEBUG 21 /* uses debug registers */
+#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_FREEZE 23 /* is freezing for suspend */
+#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
+#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
+#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
+
+#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
+#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
+#define _TIF_IRET (1 << TIF_IRET)
+#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
+#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_NOTSC (1 << TIF_NOTSC)
+#define _TIF_IA32 (1 << TIF_IA32)
+#define _TIF_FORK (1 << TIF_FORK)
+#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
+#define _TIF_DEBUG (1 << TIF_DEBUG)
+#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
+#define _TIF_FREEZE (1 << TIF_FREEZE)
+#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
+#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
+#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+
+/* work to do in syscall_trace_enter() */
+#define _TIF_WORK_SYSCALL_ENTRY \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
+ _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+
+/* work to do in syscall_trace_leave() */
+#define _TIF_WORK_SYSCALL_EXIT \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
+
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK \
+ (0x0000FFFF & \
+ ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
+ _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
+
+/* work to do on any return to user space */
+#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
+
+/* Only used for 64 bit */
+#define _TIF_DO_NOTIFY_MASK \
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+
+/* flags to check in __switch_to() */
+#define _TIF_WORK_CTXSW \
+ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
+
+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+
+#define PREEMPT_ACTIVE 0x10000000
+
+/* thread information allocation */
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
+#else
+#define THREAD_FLAGS GFP_KERNEL
+#endif
+
+#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+
+#define alloc_thread_info(tsk) \
+ ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
+
+#ifdef CONFIG_X86_32
+
+#define STACK_WARN (THREAD_SIZE/8)
+/*
+ * macros/functions for gaining access to the thread information structure
+ *
+ * preempt_count needs to be 1 initially, until the scheduler is functional.
+ */
+#ifndef __ASSEMBLY__
+
+
+/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __used;
+
+/* how to get the thread information struct from C */
+static inline struct thread_info *current_thread_info(void)
+{
+ return (struct thread_info *)
+ (current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+#else /* !__ASSEMBLY__ */
+
+/* how to get the thread information struct from ASM */
+#define GET_THREAD_INFO(reg) \
+ movl $-THREAD_SIZE, reg; \
+ andl %esp, reg
+
+/* use this one if reg already contains %esp */
+#define GET_THREAD_INFO_WITH_ESP(reg) \
+ andl $-THREAD_SIZE, reg
+
+#endif
+
+#else /* X86_32 */
+
+#include <asm/pda.h>
+
+/*
+ * macros/functions for gaining access to the thread information structure
+ * preempt_count needs to be 1 initially, until the scheduler is functional.
+ */
+#ifndef __ASSEMBLY__
+static inline struct thread_info *current_thread_info(void)
+{
+ struct thread_info *ti;
+ ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
+ return ti;
+}
+
+/* do not use in interrupt context */
+static inline struct thread_info *stack_thread_info(void)
+{
+ struct thread_info *ti;
+ asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
+ return ti;
+}
+
+#else /* !__ASSEMBLY__ */
+
+/* how to get the thread information struct from ASM */
+#define GET_THREAD_INFO(reg) \
+ movq %gs:pda_kernelstack,reg ; \
+ subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+
+#endif
+
+#endif /* !X86_32 */
+
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_USEDFPU 0x0001 /* FPU was used by this task
+ this quantum (SMP) */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+#define TS_POLLING 0x0004 /* true if in idle loop
+ and not sleeping */
+#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
+#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
+
+#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
+
+#ifndef __ASSEMBLY__
+#define HAVE_SET_RESTORE_SIGMASK 1
+static inline void set_restore_sigmask(void)
+{
+ struct thread_info *ti = current_thread_info();
+ ti->status |= TS_RESTORE_SIGMASK;
+ set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
+}
+#endif /* !__ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+extern void arch_task_cache_init(void);
+extern void free_thread_info(struct thread_info *ti);
+extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
+#define arch_task_cache_init arch_task_cache_init
+#endif
+#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
new file mode 100644
index 000000000000..50c733aac421
--- /dev/null
+++ b/arch/x86/include/asm/time.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_TIME_H
+#define _ASM_X86_TIME_H
+
+extern void hpet_time_init(void);
+
+#include <asm/mc146818rtc.h>
+#ifdef CONFIG_X86_32
+#include <linux/efi.h>
+
+static inline unsigned long native_get_wallclock(void)
+{
+ unsigned long retval;
+
+ if (efi_enabled)
+ retval = efi_get_time();
+ else
+ retval = mach_get_cmos_time();
+
+ return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+ int retval;
+
+ if (efi_enabled)
+ retval = efi_set_rtc_mmss(nowtime);
+ else
+ retval = mach_set_rtc_mmss(nowtime);
+
+ return retval;
+}
+
+#else
+extern void native_time_init_hook(void);
+
+static inline unsigned long native_get_wallclock(void)
+{
+ return mach_get_cmos_time();
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+ return mach_set_rtc_mmss(nowtime);
+}
+
+#endif
+
+extern void time_init(void);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define choose_time_init() hpet_time_init
+
+#endif /* CONFIG_PARAVIRT */
+
+extern unsigned long __init calibrate_cpu(void);
+
+#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
new file mode 100644
index 000000000000..2bb6a835c453
--- /dev/null
+++ b/arch/x86/include/asm/timer.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_X86_TIMER_H
+#define _ASM_X86_TIMER_H
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/percpu.h>
+
+#define TICK_SIZE (tick_nsec / 1000)
+
+unsigned long long native_sched_clock(void);
+unsigned long native_calibrate_tsc(void);
+
+#ifdef CONFIG_X86_32
+extern int timer_ack;
+extern int recalibrate_cpu_khz(void);
+#endif /* CONFIG_X86_32 */
+
+extern int no_timer_check;
+
+#ifndef CONFIG_PARAVIRT
+#define calibrate_tsc() native_calibrate_tsc()
+#endif
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+DECLARE_PER_CPU(unsigned long, cyc2ns);
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ unsigned long long ns;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ns = __cycles_2_ns(cyc);
+ local_irq_restore(flags);
+
+ return ns;
+}
+
+#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
new file mode 100644
index 000000000000..1287dc1347d6
--- /dev/null
+++ b/arch/x86/include/asm/timex.h
@@ -0,0 +1,19 @@
+/* x86 architecture timex specifications */
+#ifndef _ASM_X86_TIMEX_H
+#define _ASM_X86_TIMEX_H
+
+#include <asm/processor.h>
+#include <asm/tsc.h>
+
+#ifdef CONFIG_X86_ELAN
+# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
+#elif defined(CONFIG_X86_RDC321X)
+# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
+#else
+# define PIT_TICK_RATE 1193182 /* Underlying HZ */
+#endif
+#define CLOCK_TICK_RATE PIT_TICK_RATE
+
+#define ARCH_HAS_READ_CURRENT_TIMER
+
+#endif /* _ASM_X86_TIMEX_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
new file mode 100644
index 000000000000..829215fef9ee
--- /dev/null
+++ b/arch/x86/include/asm/tlb.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_TLB_H
+#define _ASM_X86_TLB_H
+
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)
+#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+#include <asm-generic/tlb.h>
+
+#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
new file mode 100644
index 000000000000..0e7bbb549116
--- /dev/null
+++ b/arch/x86/include/asm/tlbflush.h
@@ -0,0 +1,178 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+static inline void __native_flush_tlb(void)
+{
+ write_cr3(read_cr3());
+}
+
+static inline void __native_flush_tlb_global(void)
+{
+ unsigned long flags;
+ unsigned long cr4;
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = read_cr4();
+ /* clear PGE */
+ write_cr4(cr4 & ~X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+static inline void __native_flush_tlb_single(unsigned long addr)
+{
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+}
+
+static inline void __flush_tlb_all(void)
+{
+ if (cpu_has_pge)
+ __flush_tlb_global();
+ else
+ __flush_tlb();
+}
+
+static inline void __flush_tlb_one(unsigned long addr)
+{
+ if (cpu_has_invlpg)
+ __flush_tlb_single(addr);
+ else
+ __flush_tlb();
+}
+
+#ifdef CONFIG_X86_32
+# define TLB_FLUSH_ALL 0xffffffff
+#else
+# define TLB_FLUSH_ALL -1ULL
+#endif
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb() flushes the current mm struct TLBs
+ * - flush_tlb_all() flushes all processes TLBs
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. Might be worth trying if for a small
+ * range a few INVLPGs in a row are a win.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+ if (mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_mm == current->active_mm)
+ __flush_tlb();
+}
+
+static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+ struct mm_struct *mm,
+ unsigned long va)
+{
+}
+
+static inline void reset_lazy_tlbstate(void)
+{
+}
+
+#else /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+extern void flush_tlb_all(void);
+extern void flush_tlb_current_task(void);
+extern void flush_tlb_mm(struct mm_struct *);
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+
+#define flush_tlb() flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ flush_tlb_mm(vma->vm_mm);
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
+ unsigned long va);
+
+#define TLBSTATE_OK 1
+#define TLBSTATE_LAZY 2
+
+#ifdef CONFIG_X86_32
+struct tlb_state {
+ struct mm_struct *active_mm;
+ int state;
+ char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+
+void reset_lazy_tlbstate(void);
+#else
+static inline void reset_lazy_tlbstate(void)
+{
+}
+#endif
+
+#endif /* SMP */
+
+#ifndef CONFIG_PARAVIRT
+#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
+#endif
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+ unsigned long end)
+{
+ flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
new file mode 100644
index 000000000000..ff386ff50ed7
--- /dev/null
+++ b/arch/x86/include/asm/topology.h
@@ -0,0 +1,259 @@
+/*
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+# define ENABLE_TOPO_DEFINES
+# endif
+#else
+# ifdef CONFIG_SMP
+# define ENABLE_TOPO_DEFINES
+# endif
+#endif
+
+/* Node not present */
+#define NUMA_NO_NODE (-1)
+
+#ifdef CONFIG_NUMA
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+
+#ifdef CONFIG_X86_32
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t node_to_cpumask_map[];
+
+/* Mappings between logical cpu number and node number */
+extern int cpu_to_node_map[];
+
+/* Returns the number of the node containing CPU 'cpu' */
+static inline int cpu_to_node(int cpu)
+{
+ return cpu_to_node_map[cpu];
+}
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)
+
+/* Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return node_to_cpumask_map[node];
+}
+
+#else /* CONFIG_X86_64 */
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_t *node_to_cpumask_map;
+
+/* Mappings between logical cpu number and node number */
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
+
+/* Returns the number of the current Node. */
+#define numa_node_id() read_pda(nodenumber)
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern int cpu_to_node(int cpu);
+extern int early_cpu_to_node(int cpu);
+extern const cpumask_t *_node_to_cpumask_ptr(int node);
+extern cpumask_t node_to_cpumask(int node);
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Returns the number of the node containing CPU 'cpu' */
+static inline int cpu_to_node(int cpu)
+{
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/* Same function but used if called before per_cpu areas are setup */
+static inline int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &node_to_cpumask_map[node];
+}
+
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return node_to_cpumask_map[node];
+}
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node) \
+ const cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node) \
+ v = _node_to_cpumask_ptr(node)
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Returns the number of the node containing Node 'node'. This
+ * architecture is flat, so it is a pretty simple function!
+ */
+#define parent_node(node) (node)
+
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
+
+#ifdef CONFIG_X86_32
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+extern unsigned long node_remap_size[];
+#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
+
+# define SD_CACHE_NICE_TRIES 1
+# define SD_IDLE_IDX 1
+# define SD_NEWIDLE_IDX 2
+# define SD_FORKEXEC_IDX 0
+
+#else
+
+# define SD_CACHE_NICE_TRIES 2
+# define SD_IDLE_IDX 2
+# define SD_NEWIDLE_IDX 2
+# define SD_FORKEXEC_IDX 1
+
+#endif
+
+/* sched_domains SD_NODE_INIT for NUMA machines */
+#define SD_NODE_INIT (struct sched_domain) { \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_nice_tries = SD_CACHE_NICE_TRIES, \
+ .busy_idx = 3, \
+ .idle_idx = SD_IDLE_IDX, \
+ .newidle_idx = SD_NEWIDLE_IDX, \
+ .wake_idx = 1, \
+ .forkexec_idx = SD_FORKEXEC_IDX, \
+ .flags = SD_LOAD_BALANCE \
+ | SD_BALANCE_EXEC \
+ | SD_BALANCE_FORK \
+ | SD_WAKE_AFFINE \
+ | SD_WAKE_BALANCE \
+ | SD_SERIALIZE, \
+ .last_balance = jiffies, \
+ .balance_interval = 1, \
+}
+
+#ifdef CONFIG_X86_64_ACPI_NUMA
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+#endif
+
+#else /* !CONFIG_NUMA */
+
+#define numa_node_id() 0
+#define cpu_to_node(cpu) 0
+#define early_cpu_to_node(cpu) 0
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+static inline int node_to_first_cpu(int node)
+{
+ return first_cpu(cpu_online_map);
+}
+
+/* Replace default node_to_cpumask_ptr with optimized version */
+#define node_to_cpumask_ptr(v, node) \
+ const cpumask_t *v = _node_to_cpumask_ptr(node)
+
+#define node_to_cpumask_ptr_next(v, node) \
+ v = _node_to_cpumask_ptr(node)
+#endif
+
+#include <asm-generic/topology.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+#endif
+
+extern cpumask_t cpu_coregroup_map(int cpu);
+
+#ifdef ENABLE_TOPO_DEFINES
+#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
+#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
+
+/* indicates that pointers to the topology cpumask_t maps are valid */
+#define arch_provides_topology_pointers yes
+#endif
+
+static inline void arch_fix_phys_package_id(int num, u32 slot)
+{
+}
+
+struct pci_bus;
+void set_pci_bus_resources_arch_default(struct pci_bus *b);
+
+#ifdef CONFIG_SMP
+#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids)
+#define smt_capable() (smp_num_siblings > 1)
+#endif
+
+#ifdef CONFIG_NUMA
+extern int get_mp_bus_to_node(int busnum);
+extern void set_mp_bus_to_node(int busnum, int node);
+#else
+static inline int get_mp_bus_to_node(int busnum)
+{
+ return 0;
+}
+static inline void set_mp_bus_to_node(int busnum, int node)
+{
+}
+#endif
+
+#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
new file mode 100644
index 000000000000..780ba0ab94f9
--- /dev/null
+++ b/arch/x86/include/asm/trampoline.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_TRAMPOLINE_H
+#define _ASM_X86_TRAMPOLINE_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_TRAMPOLINE
+/*
+ * Trampoline 80x86 program as an array.
+ */
+extern const unsigned char trampoline_data [];
+extern const unsigned char trampoline_end [];
+extern unsigned char *trampoline_base;
+
+extern unsigned long init_rsp;
+extern unsigned long initial_code;
+
+#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
+#define TRAMPOLINE_BASE 0x6000
+
+extern unsigned long setup_trampoline(void);
+extern void __init reserve_trampoline_memory(void);
+#else
+static inline void reserve_trampoline_memory(void) {};
+#endif /* CONFIG_X86_TRAMPOLINE */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
new file mode 100644
index 000000000000..2ee0a3bceedf
--- /dev/null
+++ b/arch/x86/include/asm/traps.h
@@ -0,0 +1,88 @@
+#ifndef _ASM_X86_TRAPS_H
+#define _ASM_X86_TRAPS_H
+
+#include <asm/debugreg.h>
+
+#ifdef CONFIG_X86_32
+#define dotraplinkage
+#else
+#define dotraplinkage asmlinkage
+#endif
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+#ifdef CONFIG_X86_64
+asmlinkage void double_fault(void);
+#endif
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void simd_coprocessor_error(void);
+
+dotraplinkage void do_divide_error(struct pt_regs *, long);
+dotraplinkage void do_debug(struct pt_regs *, long);
+dotraplinkage void do_nmi(struct pt_regs *, long);
+dotraplinkage void do_int3(struct pt_regs *, long);
+dotraplinkage void do_overflow(struct pt_regs *, long);
+dotraplinkage void do_bounds(struct pt_regs *, long);
+dotraplinkage void do_invalid_op(struct pt_regs *, long);
+dotraplinkage void do_device_not_available(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
+dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
+dotraplinkage void do_segment_not_present(struct pt_regs *, long);
+dotraplinkage void do_stack_segment(struct pt_regs *, long);
+#ifdef CONFIG_X86_64
+dotraplinkage void do_double_fault(struct pt_regs *, long);
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
+#endif
+dotraplinkage void do_general_protection(struct pt_regs *, long);
+dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_alignment_check(struct pt_regs *, long);
+#ifdef CONFIG_X86_MCE
+dotraplinkage void do_machine_check(struct pt_regs *, long);
+#endif
+dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *, long);
+#endif
+
+static inline int get_si_code(unsigned long condition)
+{
+ if (condition & DR_STEP)
+ return TRAP_TRACE;
+ else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+ return TRAP_HWBKPT;
+ else
+ return TRAP_BRKPT;
+}
+
+extern int panic_on_unrecovered_nmi;
+extern int kstack_depth_to_print;
+
+void math_error(void __user *);
+asmlinkage void math_emulate(long);
+#ifdef CONFIG_X86_32
+unsigned long patch_espfix_desc(unsigned long, unsigned long);
+#else
+asmlinkage void smp_thermal_interrupt(void);
+asmlinkage void mce_threshold_interrupt(void);
+#endif
+
+#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
new file mode 100644
index 000000000000..38ae163cc91b
--- /dev/null
+++ b/arch/x86/include/asm/tsc.h
@@ -0,0 +1,62 @@
+/*
+ * x86 TSC related functions
+ */
+#ifndef _ASM_X86_TSC_H
+#define _ASM_X86_TSC_H
+
+#include <asm/processor.h>
+
+#define NS_SCALE 10 /* 2^10, carefully chosen */
+#define US_SCALE 32 /* 2^32, arbitralrily chosen */
+
+/*
+ * Standard way to access the cycle counter.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+extern void disable_TSC(void);
+
+static inline cycles_t get_cycles(void)
+{
+ unsigned long long ret = 0;
+
+#ifndef CONFIG_X86_TSC
+ if (!cpu_has_tsc)
+ return 0;
+#endif
+ rdtscll(ret);
+
+ return ret;
+}
+
+static __always_inline cycles_t vget_cycles(void)
+{
+ /*
+ * We only do VDSOs on TSC capable CPUs, so this shouldnt
+ * access boot_cpu_data (which is not VDSO-safe):
+ */
+#ifndef CONFIG_X86_TSC
+ if (!cpu_has_tsc)
+ return 0;
+#endif
+ return (cycles_t)__native_read_tsc();
+}
+
+extern void tsc_init(void);
+extern void mark_tsc_unstable(char *reason);
+extern int unsynchronized_tsc(void);
+int check_tsc_unstable(void);
+
+/*
+ * Boot-time check whether the TSCs are synchronized across
+ * all CPUs/cores:
+ */
+extern void check_tsc_sync_source(int cpu);
+extern void check_tsc_sync_target(void);
+
+extern int notsc_setup(char *);
+
+#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
new file mode 100644
index 000000000000..e6f736320077
--- /dev/null
+++ b/arch/x86/include/asm/types.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_TYPES_H
+#define _ASM_X86_TYPES_H
+
+#include <asm-generic/int-ll64.h>
+
+#ifndef __ASSEMBLY__
+
+typedef unsigned short umode_t;
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * These aren't exported outside the kernel to avoid name space clashes
+ */
+#ifdef __KERNEL__
+
+#ifdef CONFIG_X86_32
+# define BITS_PER_LONG 32
+#else
+# define BITS_PER_LONG 64
+#endif
+
+#ifndef __ASSEMBLY__
+
+typedef u64 dma64_addr_t;
+#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
+/* DMA addresses come in 32-bit and 64-bit flavours. */
+typedef u64 dma_addr_t;
+#else
+typedef u32 dma_addr_t;
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
new file mode 100644
index 000000000000..4340055b7559
--- /dev/null
+++ b/arch/x86/include/asm/uaccess.h
@@ -0,0 +1,456 @@
+#ifndef _ASM_X86_UACCESS_H
+#define _ASM_X86_UACCESS_H
+/*
+ * User space memory access functions
+ */
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/thread_info.h>
+#include <linux/prefetch.h>
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
+#define VERIFY_READ 0
+#define VERIFY_WRITE 1
+
+/*
+ * The fs value determines whether argument validity checking should be
+ * performed or not. If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons, these macros are grossly misnamed.
+ */
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+
+#define KERNEL_DS MAKE_MM_SEG(-1UL)
+#define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
+
+#define get_ds() (KERNEL_DS)
+#define get_fs() (current_thread_info()->addr_limit)
+#define set_fs(x) (current_thread_info()->addr_limit = (x))
+
+#define segment_eq(a, b) ((a).seg == (b).seg)
+
+#define __addr_ok(addr) \
+ ((unsigned long __force)(addr) < \
+ (current_thread_info()->addr_limit.seg))
+
+/*
+ * Test whether a block of memory is a valid user space address.
+ * Returns 0 if the range is valid, nonzero otherwise.
+ *
+ * This is equivalent to the following test:
+ * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ *
+ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
+ */
+
+#define __range_not_ok(addr, size) \
+({ \
+ unsigned long flag, roksum; \
+ __chk_user_ptr(addr); \
+ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
+ : "=&r" (flag), "=r" (roksum) \
+ : "1" (addr), "g" ((long)(size)), \
+ "rm" (current_thread_info()->addr_limit.seg)); \
+ flag; \
+})
+
+/**
+ * access_ok: - Checks if a user space pointer is valid
+ * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
+ * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
+ * to write to a block, it is always safe to read from it.
+ * @addr: User space pointer to start of block to check
+ * @size: Size of block to check
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Checks if a pointer to a block of memory in user space is valid.
+ *
+ * Returns true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
+ *
+ * Note that, depending on architecture, this function probably just
+ * checks that the pointer is in the user space range - after calling
+ * this function, memory access functions may still return -EFAULT.
+ */
+#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
+
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue. No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+ unsigned long insn, fixup;
+};
+
+extern int fixup_exception(struct pt_regs *regs);
+
+/*
+ * These are the main single-value transfer routines. They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * This gets kind of ugly. We want to return _two_ values in "get_user()"
+ * and yet we don't want to do any pointers, because that is too much
+ * of a performance impact. Thus we have a few rather ugly macros here,
+ * and hide all the ugliness from the user.
+ *
+ * The "__xxx" versions of the user access functions are versions that
+ * do not verify the address space, that must have been done previously
+ * with a separate "access_ok()" call (this is used when we do multiple
+ * accesses to the same area of user memory).
+ */
+
+extern int __get_user_1(void);
+extern int __get_user_2(void);
+extern int __get_user_4(void);
+extern int __get_user_8(void);
+extern int __get_user_bad(void);
+
+#define __get_user_x(size, ret, x, ptr) \
+ asm volatile("call __get_user_" #size \
+ : "=a" (ret),"=d" (x) \
+ : "0" (ptr)) \
+
+/* Careful: we have to cast the result to the type of the pointer
+ * for sign reasons */
+
+/**
+ * get_user: - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+#ifdef CONFIG_X86_32
+#define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(X, __ret_gu, __val_gu, ptr)
+#else
+#define __get_user_8(__ret_gu, __val_gu, ptr) \
+ __get_user_x(8, __ret_gu, __val_gu, ptr)
+#endif
+
+#define get_user(x, ptr) \
+({ \
+ int __ret_gu; \
+ unsigned long __val_gu; \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __get_user_x(1, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 2: \
+ __get_user_x(2, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 4: \
+ __get_user_x(4, __ret_gu, __val_gu, ptr); \
+ break; \
+ case 8: \
+ __get_user_8(__ret_gu, __val_gu, ptr); \
+ break; \
+ default: \
+ __get_user_x(X, __ret_gu, __val_gu, ptr); \
+ break; \
+ } \
+ (x) = (__typeof__(*(ptr)))__val_gu; \
+ __ret_gu; \
+})
+
+#define __put_user_x(size, x, ptr, __ret_pu) \
+ asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
+ :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+
+
+
+#ifdef CONFIG_X86_32
+#define __put_user_u64(x, addr, err) \
+ asm volatile("1: movl %%eax,0(%2)\n" \
+ "2: movl %%edx,4(%2)\n" \
+ "3:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "4: movl %3,%0\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (err) \
+ : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
+
+#define __put_user_x8(x, ptr, __ret_pu) \
+ asm volatile("call __put_user_8" : "=a" (__ret_pu) \
+ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+#else
+#define __put_user_u64(x, ptr, retval) \
+ __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
+#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
+#endif
+
+extern void __put_user_bad(void);
+
+/*
+ * Strange magic calling convention: pointer in %ecx,
+ * value in %eax(:%edx), return value in %eax. clobbers %rbx
+ */
+extern void __put_user_1(void);
+extern void __put_user_2(void);
+extern void __put_user_4(void);
+extern void __put_user_8(void);
+
+#ifdef CONFIG_X86_WP_WORKS_OK
+
+/**
+ * put_user: - Write a simple value into user space.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+#define put_user(x, ptr) \
+({ \
+ int __ret_pu; \
+ __typeof__(*(ptr)) __pu_val; \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ __pu_val = x; \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __put_user_x(1, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 2: \
+ __put_user_x(2, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 4: \
+ __put_user_x(4, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 8: \
+ __put_user_x8(__pu_val, ptr, __ret_pu); \
+ break; \
+ default: \
+ __put_user_x(X, __pu_val, ptr, __ret_pu); \
+ break; \
+ } \
+ __ret_pu; \
+})
+
+#define __put_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
+ break; \
+ case 2: \
+ __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
+ break; \
+ case 4: \
+ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
+ break; \
+ case 8: \
+ __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+} while (0)
+
+#else
+
+#define __put_user_size(x, ptr, size, retval, errret) \
+do { \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ retval = 0; \
+ \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
+ retval = errret; \
+} while (0)
+
+#define put_user(x, ptr) \
+({ \
+ int __ret_pu; \
+ __typeof__(*(ptr))__pus_tmp = x; \
+ __ret_pu = 0; \
+ if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
+ sizeof(*(ptr))) != 0)) \
+ __ret_pu = -EFAULT; \
+ __ret_pu; \
+})
+#endif
+
+#ifdef CONFIG_X86_32
+#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
+#else
+#define __get_user_asm_u64(x, ptr, retval, errret) \
+ __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
+#endif
+
+#define __get_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
+ break; \
+ case 2: \
+ __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
+ break; \
+ case 4: \
+ __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
+ break; \
+ case 8: \
+ __get_user_asm_u64(x, ptr, retval, errret); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+} while (0)
+
+#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " xor"itype" %"rtype"1,%"rtype"1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __put_user_nocheck(x, ptr, size) \
+({ \
+ int __pu_err; \
+ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
+ __pu_err; \
+})
+
+#define __get_user_nocheck(x, ptr, size) \
+({ \
+ int __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __gu_err; \
+})
+
+/* FIXME: this hack is definitely wrong -AK */
+struct __large_struct { unsigned long buf[100]; };
+#define __m(x) (*(struct __large_struct __user *)(x))
+
+/*
+ * Tell gcc we read from memory instead of writing: this is because
+ * we do not write to any memory gcc knows about, so there are no
+ * aliasing issues.
+ */
+#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("1: mov"itype" %"rtype"1,%2\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+/**
+ * __get_user: - Get a simple variable from user space, with less checking.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+
+#define __get_user(x, ptr) \
+ __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+/**
+ * __put_user: - Write a simple value into user space, with less checking.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+
+#define __put_user(x, ptr) \
+ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+#define __get_user_unaligned __get_user
+#define __put_user_unaligned __put_user
+
+/*
+ * movsl can be slow when source and dest are not both 8-byte aligned
+ */
+#ifdef CONFIG_X86_INTEL_USERCOPY
+extern struct movsl_mask {
+ int mask;
+} ____cacheline_aligned_in_smp movsl_mask;
+#endif
+
+#define ARCH_HAS_NOCACHE_UACCESS 1
+
+#ifdef CONFIG_X86_32
+# include "uaccess_32.h"
+#else
+# define ARCH_HAS_SEARCH_EXTABLE
+# include "uaccess_64.h"
+#endif
+
+#endif /* _ASM_X86_UACCESS_H */
+
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
new file mode 100644
index 000000000000..5e06259e90e5
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -0,0 +1,218 @@
+#ifndef _ASM_X86_UACCESS_32_H
+#define _ASM_X86_UACCESS_32_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/errno.h>
+#include <linux/thread_info.h>
+#include <linux/prefetch.h>
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
+unsigned long __must_check __copy_to_user_ll
+ (void __user *to, const void *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nozero
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache
+ (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache_nozero
+ (void *to, const void __user *from, unsigned long n);
+
+/**
+ * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ * The caller should also make sure he pins the user space address
+ * so that the we don't result in page fault and sleep.
+ *
+ * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
+ * we return the initial request size (1, 2 or 4), as copy_*_user should do.
+ * If a store crosses a page boundary and gets a fault, the x86 will not write
+ * anything, so this is accurate.
+ */
+
+static __always_inline unsigned long __must_check
+__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+{
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __put_user_size(*(u8 *)from, (u8 __user *)to,
+ 1, ret, 1);
+ return ret;
+ case 2:
+ __put_user_size(*(u16 *)from, (u16 __user *)to,
+ 2, ret, 2);
+ return ret;
+ case 4:
+ __put_user_size(*(u32 *)from, (u32 __user *)to,
+ 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_to_user_ll(to, from, n);
+}
+
+/**
+ * __copy_to_user: - Copy a block of data into user space, with less checking.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from kernel space to user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+static __always_inline unsigned long __must_check
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ might_fault();
+ return __copy_to_user_inatomic(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+{
+ /* Avoid zeroing the tail if the copy fails..
+ * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
+ * but as the zeroing behaviour is only significant when n is not
+ * constant, that shouldn't be a problem.
+ */
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nozero(to, from, n);
+}
+
+/**
+ * __copy_from_user: - Copy a block of data from user space, with less checking.
+ * @to: Destination address, in kernel space.
+ * @from: Source address, in user space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Copy data from user space to kernel space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ *
+ * An alternate version - __copy_from_user_inatomic() - may be called from
+ * atomic context and will fail rather than sleep. In this case the
+ * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
+ * for explanation of why this is needed.
+ */
+static __always_inline unsigned long
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll(to, from, n);
+}
+
+static __always_inline unsigned long __copy_from_user_nocache(void *to,
+ const void __user *from, unsigned long n)
+{
+ might_fault();
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ __get_user_size(*(u8 *)to, from, 1, ret, 1);
+ return ret;
+ case 2:
+ __get_user_size(*(u16 *)to, from, 2, ret, 2);
+ return ret;
+ case 4:
+ __get_user_size(*(u32 *)to, from, 4, ret, 4);
+ return ret;
+ }
+ }
+ return __copy_from_user_ll_nocache(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user_inatomic_nocache(void *to, const void __user *from,
+ unsigned long n)
+{
+ return __copy_from_user_ll_nocache_nozero(to, from, n);
+}
+
+unsigned long __must_check copy_to_user(void __user *to,
+ const void *from, unsigned long n);
+unsigned long __must_check copy_from_user(void *to,
+ const void __user *from,
+ unsigned long n);
+long __must_check strncpy_from_user(char *dst, const char __user *src,
+ long count);
+long __must_check __strncpy_from_user(char *dst,
+ const char __user *src, long count);
+
+/**
+ * strlen_user: - Get the size of a string in user space.
+ * @str: The string to measure.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * Get the size of a NUL-terminated string in user space.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ * On exception, returns 0.
+ *
+ * If there is a limit on the length of a valid string, you may wish to
+ * consider using strnlen_user() instead.
+ */
+#define strlen_user(str) strnlen_user(str, LONG_MAX)
+
+long strnlen_user(const char __user *str, long n);
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
+#endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
new file mode 100644
index 000000000000..84210c479fca
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -0,0 +1,208 @@
+#ifndef _ASM_X86_UACCESS_64_H
+#define _ASM_X86_UACCESS_64_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/prefetch.h>
+#include <linux/lockdep.h>
+#include <asm/page.h>
+
+/*
+ * Copy To/From Userspace
+ */
+
+/* Handles exceptions in both to and from, but doesn't do access_ok */
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned len);
+
+__must_check unsigned long
+copy_to_user(void __user *to, const void *from, unsigned len);
+__must_check unsigned long
+copy_from_user(void *to, const void __user *from, unsigned len);
+__must_check unsigned long
+copy_in_user(void __user *to, const void __user *from, unsigned len);
+
+static __always_inline __must_check
+int __copy_from_user(void *dst, const void __user *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic(dst, (__force void *)src, size);
+ switch (size) {
+ case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ return ret;
+ case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ return ret;
+ case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ return ret;
+ case 10:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 10);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ return ret;
+ case 16:
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 16);
+ if (unlikely(ret))
+ return ret;
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ return ret;
+ default:
+ return copy_user_generic(dst, (__force void *)src, size);
+ }
+}
+
+static __always_inline __must_check
+int __copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst, src, size);
+ switch (size) {
+ case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ case 10:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 10);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ case 16:
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "ir", 16);
+ if (unlikely(ret))
+ return ret;
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ default:
+ return copy_user_generic((__force void *)dst, src, size);
+ }
+}
+
+static __always_inline __must_check
+int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
+{
+ int ret = 0;
+
+ might_fault();
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ switch (size) {
+ case 1: {
+ u8 tmp;
+ __get_user_asm(tmp, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ return ret;
+ }
+ case 2: {
+ u16 tmp;
+ __get_user_asm(tmp, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ return ret;
+ }
+
+ case 4: {
+ u32 tmp;
+ __get_user_asm(tmp, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ return ret;
+ }
+ case 8: {
+ u64 tmp;
+ __get_user_asm(tmp, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __put_user_asm(tmp, (u64 __user *)dst,
+ ret, "q", "", "ir", 8);
+ return ret;
+ }
+ default:
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+ }
+}
+
+__must_check long
+strncpy_from_user(char *dst, const char __user *src, long count);
+__must_check long
+__strncpy_from_user(char *dst, const char __user *src, long count);
+__must_check long strnlen_user(const char __user *str, long n);
+__must_check long __strnlen_user(const char __user *str, long n);
+__must_check long strlen_user(const char __user *str);
+__must_check unsigned long clear_user(void __user *mem, unsigned long len);
+__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
+
+__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
+ unsigned size);
+
+static __must_check __always_inline int
+__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+{
+ return copy_user_generic((__force void *)dst, src, size);
+}
+
+extern long __copy_user_nocache(void *dst, const void __user *src,
+ unsigned size, int zerorest);
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src,
+ unsigned size)
+{
+ might_sleep();
+ return __copy_user_nocache(dst, src, size, 1);
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst,
+ const void __user *src,
+ unsigned size)
+{
+ return __copy_user_nocache(dst, src, size, 0);
+}
+
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+
+#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
new file mode 100644
index 000000000000..87324cf439d9
--- /dev/null
+++ b/arch/x86/include/asm/ucontext.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_UCONTEXT_H
+#define _ASM_X86_UCONTEXT_H
+
+#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state
+ * information in the memory layout pointed
+ * by the fpstate pointer in the ucontext's
+ * sigcontext struct (uc_mcontext).
+ */
+
+struct ucontext {
+ unsigned long uc_flags;
+ struct ucontext *uc_link;
+ stack_t uc_stack;
+ struct sigcontext uc_mcontext;
+ sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
new file mode 100644
index 000000000000..a7bd416b4763
--- /dev/null
+++ b/arch/x86/include/asm/unaligned.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_X86_UNALIGNED_H
+#define _ASM_X86_UNALIGNED_H
+
+/*
+ * The x86 can do unaligned accesses itself.
+ */
+
+#include <linux/unaligned/access_ok.h>
+#include <linux/unaligned/generic.h>
+
+#define get_unaligned __get_unaligned_le
+#define put_unaligned __put_unaligned_le
+
+#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
new file mode 100644
index 000000000000..2a58ed3e51d8
--- /dev/null
+++ b/arch/x86/include/asm/unistd.h
@@ -0,0 +1,13 @@
+#ifdef __KERNEL__
+# ifdef CONFIG_X86_32
+# include "unistd_32.h"
+# else
+# include "unistd_64.h"
+# endif
+#else
+# ifdef __i386__
+# include "unistd_32.h"
+# else
+# include "unistd_64.h"
+# endif
+#endif
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
new file mode 100644
index 000000000000..f2bba78430a4
--- /dev/null
+++ b/arch/x86/include/asm/unistd_32.h
@@ -0,0 +1,379 @@
+#ifndef _ASM_X86_UNISTD_32_H
+#define _ASM_X86_UNISTD_32_H
+
+/*
+ * This file contains the system call numbers.
+ */
+
+#define __NR_restart_syscall 0
+#define __NR_exit 1
+#define __NR_fork 2
+#define __NR_read 3
+#define __NR_write 4
+#define __NR_open 5
+#define __NR_close 6
+#define __NR_waitpid 7
+#define __NR_creat 8
+#define __NR_link 9
+#define __NR_unlink 10
+#define __NR_execve 11
+#define __NR_chdir 12
+#define __NR_time 13
+#define __NR_mknod 14
+#define __NR_chmod 15
+#define __NR_lchown 16
+#define __NR_break 17
+#define __NR_oldstat 18
+#define __NR_lseek 19
+#define __NR_getpid 20
+#define __NR_mount 21
+#define __NR_umount 22
+#define __NR_setuid 23
+#define __NR_getuid 24
+#define __NR_stime 25
+#define __NR_ptrace 26
+#define __NR_alarm 27
+#define __NR_oldfstat 28
+#define __NR_pause 29
+#define __NR_utime 30
+#define __NR_stty 31
+#define __NR_gtty 32
+#define __NR_access 33
+#define __NR_nice 34
+#define __NR_ftime 35
+#define __NR_sync 36
+#define __NR_kill 37
+#define __NR_rename 38
+#define __NR_mkdir 39
+#define __NR_rmdir 40
+#define __NR_dup 41
+#define __NR_pipe 42
+#define __NR_times 43
+#define __NR_prof 44
+#define __NR_brk 45
+#define __NR_setgid 46
+#define __NR_getgid 47
+#define __NR_signal 48
+#define __NR_geteuid 49
+#define __NR_getegid 50
+#define __NR_acct 51
+#define __NR_umount2 52
+#define __NR_lock 53
+#define __NR_ioctl 54
+#define __NR_fcntl 55
+#define __NR_mpx 56
+#define __NR_setpgid 57
+#define __NR_ulimit 58
+#define __NR_oldolduname 59
+#define __NR_umask 60
+#define __NR_chroot 61
+#define __NR_ustat 62
+#define __NR_dup2 63
+#define __NR_getppid 64
+#define __NR_getpgrp 65
+#define __NR_setsid 66
+#define __NR_sigaction 67
+#define __NR_sgetmask 68
+#define __NR_ssetmask 69
+#define __NR_setreuid 70
+#define __NR_setregid 71
+#define __NR_sigsuspend 72
+#define __NR_sigpending 73
+#define __NR_sethostname 74
+#define __NR_setrlimit 75
+#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
+#define __NR_getrusage 77
+#define __NR_gettimeofday 78
+#define __NR_settimeofday 79
+#define __NR_getgroups 80
+#define __NR_setgroups 81
+#define __NR_select 82
+#define __NR_symlink 83
+#define __NR_oldlstat 84
+#define __NR_readlink 85
+#define __NR_uselib 86
+#define __NR_swapon 87
+#define __NR_reboot 88
+#define __NR_readdir 89
+#define __NR_mmap 90
+#define __NR_munmap 91
+#define __NR_truncate 92
+#define __NR_ftruncate 93
+#define __NR_fchmod 94
+#define __NR_fchown 95
+#define __NR_getpriority 96
+#define __NR_setpriority 97
+#define __NR_profil 98
+#define __NR_statfs 99
+#define __NR_fstatfs 100
+#define __NR_ioperm 101
+#define __NR_socketcall 102
+#define __NR_syslog 103
+#define __NR_setitimer 104
+#define __NR_getitimer 105
+#define __NR_stat 106
+#define __NR_lstat 107
+#define __NR_fstat 108
+#define __NR_olduname 109
+#define __NR_iopl 110
+#define __NR_vhangup 111
+#define __NR_idle 112
+#define __NR_vm86old 113
+#define __NR_wait4 114
+#define __NR_swapoff 115
+#define __NR_sysinfo 116
+#define __NR_ipc 117
+#define __NR_fsync 118
+#define __NR_sigreturn 119
+#define __NR_clone 120
+#define __NR_setdomainname 121
+#define __NR_uname 122
+#define __NR_modify_ldt 123
+#define __NR_adjtimex 124
+#define __NR_mprotect 125
+#define __NR_sigprocmask 126
+#define __NR_create_module 127
+#define __NR_init_module 128
+#define __NR_delete_module 129
+#define __NR_get_kernel_syms 130
+#define __NR_quotactl 131
+#define __NR_getpgid 132
+#define __NR_fchdir 133
+#define __NR_bdflush 134
+#define __NR_sysfs 135
+#define __NR_personality 136
+#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
+#define __NR_setfsuid 138
+#define __NR_setfsgid 139
+#define __NR__llseek 140
+#define __NR_getdents 141
+#define __NR__newselect 142
+#define __NR_flock 143
+#define __NR_msync 144
+#define __NR_readv 145
+#define __NR_writev 146
+#define __NR_getsid 147
+#define __NR_fdatasync 148
+#define __NR__sysctl 149
+#define __NR_mlock 150
+#define __NR_munlock 151
+#define __NR_mlockall 152
+#define __NR_munlockall 153
+#define __NR_sched_setparam 154
+#define __NR_sched_getparam 155
+#define __NR_sched_setscheduler 156
+#define __NR_sched_getscheduler 157
+#define __NR_sched_yield 158
+#define __NR_sched_get_priority_max 159
+#define __NR_sched_get_priority_min 160
+#define __NR_sched_rr_get_interval 161
+#define __NR_nanosleep 162
+#define __NR_mremap 163
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_vm86 166
+#define __NR_query_module 167
+#define __NR_poll 168
+#define __NR_nfsservctl 169
+#define __NR_setresgid 170
+#define __NR_getresgid 171
+#define __NR_prctl 172
+#define __NR_rt_sigreturn 173
+#define __NR_rt_sigaction 174
+#define __NR_rt_sigprocmask 175
+#define __NR_rt_sigpending 176
+#define __NR_rt_sigtimedwait 177
+#define __NR_rt_sigqueueinfo 178
+#define __NR_rt_sigsuspend 179
+#define __NR_pread64 180
+#define __NR_pwrite64 181
+#define __NR_chown 182
+#define __NR_getcwd 183
+#define __NR_capget 184
+#define __NR_capset 185
+#define __NR_sigaltstack 186
+#define __NR_sendfile 187
+#define __NR_getpmsg 188 /* some people actually want streams */
+#define __NR_putpmsg 189 /* some people actually want streams */
+#define __NR_vfork 190
+#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
+#define __NR_mmap2 192
+#define __NR_truncate64 193
+#define __NR_ftruncate64 194
+#define __NR_stat64 195
+#define __NR_lstat64 196
+#define __NR_fstat64 197
+#define __NR_lchown32 198
+#define __NR_getuid32 199
+#define __NR_getgid32 200
+#define __NR_geteuid32 201
+#define __NR_getegid32 202
+#define __NR_setreuid32 203
+#define __NR_setregid32 204
+#define __NR_getgroups32 205
+#define __NR_setgroups32 206
+#define __NR_fchown32 207
+#define __NR_setresuid32 208
+#define __NR_getresuid32 209
+#define __NR_setresgid32 210
+#define __NR_getresgid32 211
+#define __NR_chown32 212
+#define __NR_setuid32 213
+#define __NR_setgid32 214
+#define __NR_setfsuid32 215
+#define __NR_setfsgid32 216
+#define __NR_pivot_root 217
+#define __NR_mincore 218
+#define __NR_madvise 219
+#define __NR_madvise1 219 /* delete when C lib stub is removed */
+#define __NR_getdents64 220
+#define __NR_fcntl64 221
+/* 223 is unused */
+#define __NR_gettid 224
+#define __NR_readahead 225
+#define __NR_setxattr 226
+#define __NR_lsetxattr 227
+#define __NR_fsetxattr 228
+#define __NR_getxattr 229
+#define __NR_lgetxattr 230
+#define __NR_fgetxattr 231
+#define __NR_listxattr 232
+#define __NR_llistxattr 233
+#define __NR_flistxattr 234
+#define __NR_removexattr 235
+#define __NR_lremovexattr 236
+#define __NR_fremovexattr 237
+#define __NR_tkill 238
+#define __NR_sendfile64 239
+#define __NR_futex 240
+#define __NR_sched_setaffinity 241
+#define __NR_sched_getaffinity 242
+#define __NR_set_thread_area 243
+#define __NR_get_thread_area 244
+#define __NR_io_setup 245
+#define __NR_io_destroy 246
+#define __NR_io_getevents 247
+#define __NR_io_submit 248
+#define __NR_io_cancel 249
+#define __NR_fadvise64 250
+/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
+#define __NR_exit_group 252
+#define __NR_lookup_dcookie 253
+#define __NR_epoll_create 254
+#define __NR_epoll_ctl 255
+#define __NR_epoll_wait 256
+#define __NR_remap_file_pages 257
+#define __NR_set_tid_address 258
+#define __NR_timer_create 259
+#define __NR_timer_settime (__NR_timer_create+1)
+#define __NR_timer_gettime (__NR_timer_create+2)
+#define __NR_timer_getoverrun (__NR_timer_create+3)
+#define __NR_timer_delete (__NR_timer_create+4)
+#define __NR_clock_settime (__NR_timer_create+5)
+#define __NR_clock_gettime (__NR_timer_create+6)
+#define __NR_clock_getres (__NR_timer_create+7)
+#define __NR_clock_nanosleep (__NR_timer_create+8)
+#define __NR_statfs64 268
+#define __NR_fstatfs64 269
+#define __NR_tgkill 270
+#define __NR_utimes 271
+#define __NR_fadvise64_64 272
+#define __NR_vserver 273
+#define __NR_mbind 274
+#define __NR_get_mempolicy 275
+#define __NR_set_mempolicy 276
+#define __NR_mq_open 277
+#define __NR_mq_unlink (__NR_mq_open+1)
+#define __NR_mq_timedsend (__NR_mq_open+2)
+#define __NR_mq_timedreceive (__NR_mq_open+3)
+#define __NR_mq_notify (__NR_mq_open+4)
+#define __NR_mq_getsetattr (__NR_mq_open+5)
+#define __NR_kexec_load 283
+#define __NR_waitid 284
+/* #define __NR_sys_setaltroot 285 */
+#define __NR_add_key 286
+#define __NR_request_key 287
+#define __NR_keyctl 288
+#define __NR_ioprio_set 289
+#define __NR_ioprio_get 290
+#define __NR_inotify_init 291
+#define __NR_inotify_add_watch 292
+#define __NR_inotify_rm_watch 293
+#define __NR_migrate_pages 294
+#define __NR_openat 295
+#define __NR_mkdirat 296
+#define __NR_mknodat 297
+#define __NR_fchownat 298
+#define __NR_futimesat 299
+#define __NR_fstatat64 300
+#define __NR_unlinkat 301
+#define __NR_renameat 302
+#define __NR_linkat 303
+#define __NR_symlinkat 304
+#define __NR_readlinkat 305
+#define __NR_fchmodat 306
+#define __NR_faccessat 307
+#define __NR_pselect6 308
+#define __NR_ppoll 309
+#define __NR_unshare 310
+#define __NR_set_robust_list 311
+#define __NR_get_robust_list 312
+#define __NR_splice 313
+#define __NR_sync_file_range 314
+#define __NR_tee 315
+#define __NR_vmsplice 316
+#define __NR_move_pages 317
+#define __NR_getcpu 318
+#define __NR_epoll_pwait 319
+#define __NR_utimensat 320
+#define __NR_signalfd 321
+#define __NR_timerfd_create 322
+#define __NR_eventfd 323
+#define __NR_fallocate 324
+#define __NR_timerfd_settime 325
+#define __NR_timerfd_gettime 326
+#define __NR_signalfd4 327
+#define __NR_eventfd2 328
+#define __NR_epoll_create1 329
+#define __NR_dup3 330
+#define __NR_pipe2 331
+#define __NR_inotify_init1 332
+
+#ifdef __KERNEL__
+
+#define __ARCH_WANT_IPC_PARSE_VERSION
+#define __ARCH_WANT_OLD_READDIR
+#define __ARCH_WANT_OLD_STAT
+#define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_PAUSE
+#define __ARCH_WANT_SYS_SGETMASK
+#define __ARCH_WANT_SYS_SIGNAL
+#define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_SYS_UTIME
+#define __ARCH_WANT_SYS_WAITPID
+#define __ARCH_WANT_SYS_SOCKETCALL
+#define __ARCH_WANT_SYS_FADVISE64
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
+#define __ARCH_WANT_SYS_NICE
+#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLDUMOUNT
+#define __ARCH_WANT_SYS_SIGPENDING
+#define __ARCH_WANT_SYS_SIGPROCMASK
+#define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+#ifndef cond_syscall
+#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
new file mode 100644
index 000000000000..d2e415e6666f
--- /dev/null
+++ b/arch/x86/include/asm/unistd_64.h
@@ -0,0 +1,693 @@
+#ifndef _ASM_X86_UNISTD_64_H
+#define _ASM_X86_UNISTD_64_H
+
+#ifndef __SYSCALL
+#define __SYSCALL(a, b)
+#endif
+
+/*
+ * This file contains the system call numbers.
+ *
+ * Note: holes are not allowed.
+ */
+
+/* at least 8 syscall per cacheline */
+#define __NR_read 0
+__SYSCALL(__NR_read, sys_read)
+#define __NR_write 1
+__SYSCALL(__NR_write, sys_write)
+#define __NR_open 2
+__SYSCALL(__NR_open, sys_open)
+#define __NR_close 3
+__SYSCALL(__NR_close, sys_close)
+#define __NR_stat 4
+__SYSCALL(__NR_stat, sys_newstat)
+#define __NR_fstat 5
+__SYSCALL(__NR_fstat, sys_newfstat)
+#define __NR_lstat 6
+__SYSCALL(__NR_lstat, sys_newlstat)
+#define __NR_poll 7
+__SYSCALL(__NR_poll, sys_poll)
+
+#define __NR_lseek 8
+__SYSCALL(__NR_lseek, sys_lseek)
+#define __NR_mmap 9
+__SYSCALL(__NR_mmap, sys_mmap)
+#define __NR_mprotect 10
+__SYSCALL(__NR_mprotect, sys_mprotect)
+#define __NR_munmap 11
+__SYSCALL(__NR_munmap, sys_munmap)
+#define __NR_brk 12
+__SYSCALL(__NR_brk, sys_brk)
+#define __NR_rt_sigaction 13
+__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
+#define __NR_rt_sigprocmask 14
+__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
+#define __NR_rt_sigreturn 15
+__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
+
+#define __NR_ioctl 16
+__SYSCALL(__NR_ioctl, sys_ioctl)
+#define __NR_pread64 17
+__SYSCALL(__NR_pread64, sys_pread64)
+#define __NR_pwrite64 18
+__SYSCALL(__NR_pwrite64, sys_pwrite64)
+#define __NR_readv 19
+__SYSCALL(__NR_readv, sys_readv)
+#define __NR_writev 20
+__SYSCALL(__NR_writev, sys_writev)
+#define __NR_access 21
+__SYSCALL(__NR_access, sys_access)
+#define __NR_pipe 22
+__SYSCALL(__NR_pipe, sys_pipe)
+#define __NR_select 23
+__SYSCALL(__NR_select, sys_select)
+
+#define __NR_sched_yield 24
+__SYSCALL(__NR_sched_yield, sys_sched_yield)
+#define __NR_mremap 25
+__SYSCALL(__NR_mremap, sys_mremap)
+#define __NR_msync 26
+__SYSCALL(__NR_msync, sys_msync)
+#define __NR_mincore 27
+__SYSCALL(__NR_mincore, sys_mincore)
+#define __NR_madvise 28
+__SYSCALL(__NR_madvise, sys_madvise)
+#define __NR_shmget 29
+__SYSCALL(__NR_shmget, sys_shmget)
+#define __NR_shmat 30
+__SYSCALL(__NR_shmat, sys_shmat)
+#define __NR_shmctl 31
+__SYSCALL(__NR_shmctl, sys_shmctl)
+
+#define __NR_dup 32
+__SYSCALL(__NR_dup, sys_dup)
+#define __NR_dup2 33
+__SYSCALL(__NR_dup2, sys_dup2)
+#define __NR_pause 34
+__SYSCALL(__NR_pause, sys_pause)
+#define __NR_nanosleep 35
+__SYSCALL(__NR_nanosleep, sys_nanosleep)
+#define __NR_getitimer 36
+__SYSCALL(__NR_getitimer, sys_getitimer)
+#define __NR_alarm 37
+__SYSCALL(__NR_alarm, sys_alarm)
+#define __NR_setitimer 38
+__SYSCALL(__NR_setitimer, sys_setitimer)
+#define __NR_getpid 39
+__SYSCALL(__NR_getpid, sys_getpid)
+
+#define __NR_sendfile 40
+__SYSCALL(__NR_sendfile, sys_sendfile64)
+#define __NR_socket 41
+__SYSCALL(__NR_socket, sys_socket)
+#define __NR_connect 42
+__SYSCALL(__NR_connect, sys_connect)
+#define __NR_accept 43
+__SYSCALL(__NR_accept, sys_accept)
+#define __NR_sendto 44
+__SYSCALL(__NR_sendto, sys_sendto)
+#define __NR_recvfrom 45
+__SYSCALL(__NR_recvfrom, sys_recvfrom)
+#define __NR_sendmsg 46
+__SYSCALL(__NR_sendmsg, sys_sendmsg)
+#define __NR_recvmsg 47
+__SYSCALL(__NR_recvmsg, sys_recvmsg)
+
+#define __NR_shutdown 48
+__SYSCALL(__NR_shutdown, sys_shutdown)
+#define __NR_bind 49
+__SYSCALL(__NR_bind, sys_bind)
+#define __NR_listen 50
+__SYSCALL(__NR_listen, sys_listen)
+#define __NR_getsockname 51
+__SYSCALL(__NR_getsockname, sys_getsockname)
+#define __NR_getpeername 52
+__SYSCALL(__NR_getpeername, sys_getpeername)
+#define __NR_socketpair 53
+__SYSCALL(__NR_socketpair, sys_socketpair)
+#define __NR_setsockopt 54
+__SYSCALL(__NR_setsockopt, sys_setsockopt)
+#define __NR_getsockopt 55
+__SYSCALL(__NR_getsockopt, sys_getsockopt)
+
+#define __NR_clone 56
+__SYSCALL(__NR_clone, stub_clone)
+#define __NR_fork 57
+__SYSCALL(__NR_fork, stub_fork)
+#define __NR_vfork 58
+__SYSCALL(__NR_vfork, stub_vfork)
+#define __NR_execve 59
+__SYSCALL(__NR_execve, stub_execve)
+#define __NR_exit 60
+__SYSCALL(__NR_exit, sys_exit)
+#define __NR_wait4 61
+__SYSCALL(__NR_wait4, sys_wait4)
+#define __NR_kill 62
+__SYSCALL(__NR_kill, sys_kill)
+#define __NR_uname 63
+__SYSCALL(__NR_uname, sys_uname)
+
+#define __NR_semget 64
+__SYSCALL(__NR_semget, sys_semget)
+#define __NR_semop 65
+__SYSCALL(__NR_semop, sys_semop)
+#define __NR_semctl 66
+__SYSCALL(__NR_semctl, sys_semctl)
+#define __NR_shmdt 67
+__SYSCALL(__NR_shmdt, sys_shmdt)
+#define __NR_msgget 68
+__SYSCALL(__NR_msgget, sys_msgget)
+#define __NR_msgsnd 69
+__SYSCALL(__NR_msgsnd, sys_msgsnd)
+#define __NR_msgrcv 70
+__SYSCALL(__NR_msgrcv, sys_msgrcv)
+#define __NR_msgctl 71
+__SYSCALL(__NR_msgctl, sys_msgctl)
+
+#define __NR_fcntl 72
+__SYSCALL(__NR_fcntl, sys_fcntl)
+#define __NR_flock 73
+__SYSCALL(__NR_flock, sys_flock)
+#define __NR_fsync 74
+__SYSCALL(__NR_fsync, sys_fsync)
+#define __NR_fdatasync 75
+__SYSCALL(__NR_fdatasync, sys_fdatasync)
+#define __NR_truncate 76
+__SYSCALL(__NR_truncate, sys_truncate)
+#define __NR_ftruncate 77
+__SYSCALL(__NR_ftruncate, sys_ftruncate)
+#define __NR_getdents 78
+__SYSCALL(__NR_getdents, sys_getdents)
+#define __NR_getcwd 79
+__SYSCALL(__NR_getcwd, sys_getcwd)
+
+#define __NR_chdir 80
+__SYSCALL(__NR_chdir, sys_chdir)
+#define __NR_fchdir 81
+__SYSCALL(__NR_fchdir, sys_fchdir)
+#define __NR_rename 82
+__SYSCALL(__NR_rename, sys_rename)
+#define __NR_mkdir 83
+__SYSCALL(__NR_mkdir, sys_mkdir)
+#define __NR_rmdir 84
+__SYSCALL(__NR_rmdir, sys_rmdir)
+#define __NR_creat 85
+__SYSCALL(__NR_creat, sys_creat)
+#define __NR_link 86
+__SYSCALL(__NR_link, sys_link)
+#define __NR_unlink 87
+__SYSCALL(__NR_unlink, sys_unlink)
+
+#define __NR_symlink 88
+__SYSCALL(__NR_symlink, sys_symlink)
+#define __NR_readlink 89
+__SYSCALL(__NR_readlink, sys_readlink)
+#define __NR_chmod 90
+__SYSCALL(__NR_chmod, sys_chmod)
+#define __NR_fchmod 91
+__SYSCALL(__NR_fchmod, sys_fchmod)
+#define __NR_chown 92
+__SYSCALL(__NR_chown, sys_chown)
+#define __NR_fchown 93
+__SYSCALL(__NR_fchown, sys_fchown)
+#define __NR_lchown 94
+__SYSCALL(__NR_lchown, sys_lchown)
+#define __NR_umask 95
+__SYSCALL(__NR_umask, sys_umask)
+
+#define __NR_gettimeofday 96
+__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
+#define __NR_getrlimit 97
+__SYSCALL(__NR_getrlimit, sys_getrlimit)
+#define __NR_getrusage 98
+__SYSCALL(__NR_getrusage, sys_getrusage)
+#define __NR_sysinfo 99
+__SYSCALL(__NR_sysinfo, sys_sysinfo)
+#define __NR_times 100
+__SYSCALL(__NR_times, sys_times)
+#define __NR_ptrace 101
+__SYSCALL(__NR_ptrace, sys_ptrace)
+#define __NR_getuid 102
+__SYSCALL(__NR_getuid, sys_getuid)
+#define __NR_syslog 103
+__SYSCALL(__NR_syslog, sys_syslog)
+
+/* at the very end the stuff that never runs during the benchmarks */
+#define __NR_getgid 104
+__SYSCALL(__NR_getgid, sys_getgid)
+#define __NR_setuid 105
+__SYSCALL(__NR_setuid, sys_setuid)
+#define __NR_setgid 106
+__SYSCALL(__NR_setgid, sys_setgid)
+#define __NR_geteuid 107
+__SYSCALL(__NR_geteuid, sys_geteuid)
+#define __NR_getegid 108
+__SYSCALL(__NR_getegid, sys_getegid)
+#define __NR_setpgid 109
+__SYSCALL(__NR_setpgid, sys_setpgid)
+#define __NR_getppid 110
+__SYSCALL(__NR_getppid, sys_getppid)
+#define __NR_getpgrp 111
+__SYSCALL(__NR_getpgrp, sys_getpgrp)
+
+#define __NR_setsid 112
+__SYSCALL(__NR_setsid, sys_setsid)
+#define __NR_setreuid 113
+__SYSCALL(__NR_setreuid, sys_setreuid)
+#define __NR_setregid 114
+__SYSCALL(__NR_setregid, sys_setregid)
+#define __NR_getgroups 115
+__SYSCALL(__NR_getgroups, sys_getgroups)
+#define __NR_setgroups 116
+__SYSCALL(__NR_setgroups, sys_setgroups)
+#define __NR_setresuid 117
+__SYSCALL(__NR_setresuid, sys_setresuid)
+#define __NR_getresuid 118
+__SYSCALL(__NR_getresuid, sys_getresuid)
+#define __NR_setresgid 119
+__SYSCALL(__NR_setresgid, sys_setresgid)
+
+#define __NR_getresgid 120
+__SYSCALL(__NR_getresgid, sys_getresgid)
+#define __NR_getpgid 121
+__SYSCALL(__NR_getpgid, sys_getpgid)
+#define __NR_setfsuid 122
+__SYSCALL(__NR_setfsuid, sys_setfsuid)
+#define __NR_setfsgid 123
+__SYSCALL(__NR_setfsgid, sys_setfsgid)
+#define __NR_getsid 124
+__SYSCALL(__NR_getsid, sys_getsid)
+#define __NR_capget 125
+__SYSCALL(__NR_capget, sys_capget)
+#define __NR_capset 126
+__SYSCALL(__NR_capset, sys_capset)
+
+#define __NR_rt_sigpending 127
+__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
+#define __NR_rt_sigtimedwait 128
+__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
+#define __NR_rt_sigqueueinfo 129
+__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
+#define __NR_rt_sigsuspend 130
+__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
+#define __NR_sigaltstack 131
+__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
+#define __NR_utime 132
+__SYSCALL(__NR_utime, sys_utime)
+#define __NR_mknod 133
+__SYSCALL(__NR_mknod, sys_mknod)
+
+/* Only needed for a.out */
+#define __NR_uselib 134
+__SYSCALL(__NR_uselib, sys_ni_syscall)
+#define __NR_personality 135
+__SYSCALL(__NR_personality, sys_personality)
+
+#define __NR_ustat 136
+__SYSCALL(__NR_ustat, sys_ustat)
+#define __NR_statfs 137
+__SYSCALL(__NR_statfs, sys_statfs)
+#define __NR_fstatfs 138
+__SYSCALL(__NR_fstatfs, sys_fstatfs)
+#define __NR_sysfs 139
+__SYSCALL(__NR_sysfs, sys_sysfs)
+
+#define __NR_getpriority 140
+__SYSCALL(__NR_getpriority, sys_getpriority)
+#define __NR_setpriority 141
+__SYSCALL(__NR_setpriority, sys_setpriority)
+#define __NR_sched_setparam 142
+__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
+#define __NR_sched_getparam 143
+__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
+#define __NR_sched_setscheduler 144
+__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
+#define __NR_sched_getscheduler 145
+__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
+#define __NR_sched_get_priority_max 146
+__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
+#define __NR_sched_get_priority_min 147
+__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
+#define __NR_sched_rr_get_interval 148
+__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
+
+#define __NR_mlock 149
+__SYSCALL(__NR_mlock, sys_mlock)
+#define __NR_munlock 150
+__SYSCALL(__NR_munlock, sys_munlock)
+#define __NR_mlockall 151
+__SYSCALL(__NR_mlockall, sys_mlockall)
+#define __NR_munlockall 152
+__SYSCALL(__NR_munlockall, sys_munlockall)
+
+#define __NR_vhangup 153
+__SYSCALL(__NR_vhangup, sys_vhangup)
+
+#define __NR_modify_ldt 154
+__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
+
+#define __NR_pivot_root 155
+__SYSCALL(__NR_pivot_root, sys_pivot_root)
+
+#define __NR__sysctl 156
+__SYSCALL(__NR__sysctl, sys_sysctl)
+
+#define __NR_prctl 157
+__SYSCALL(__NR_prctl, sys_prctl)
+#define __NR_arch_prctl 158
+__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
+
+#define __NR_adjtimex 159
+__SYSCALL(__NR_adjtimex, sys_adjtimex)
+
+#define __NR_setrlimit 160
+__SYSCALL(__NR_setrlimit, sys_setrlimit)
+
+#define __NR_chroot 161
+__SYSCALL(__NR_chroot, sys_chroot)
+
+#define __NR_sync 162
+__SYSCALL(__NR_sync, sys_sync)
+
+#define __NR_acct 163
+__SYSCALL(__NR_acct, sys_acct)
+
+#define __NR_settimeofday 164
+__SYSCALL(__NR_settimeofday, sys_settimeofday)
+
+#define __NR_mount 165
+__SYSCALL(__NR_mount, sys_mount)
+#define __NR_umount2 166
+__SYSCALL(__NR_umount2, sys_umount)
+
+#define __NR_swapon 167
+__SYSCALL(__NR_swapon, sys_swapon)
+#define __NR_swapoff 168
+__SYSCALL(__NR_swapoff, sys_swapoff)
+
+#define __NR_reboot 169
+__SYSCALL(__NR_reboot, sys_reboot)
+
+#define __NR_sethostname 170
+__SYSCALL(__NR_sethostname, sys_sethostname)
+#define __NR_setdomainname 171
+__SYSCALL(__NR_setdomainname, sys_setdomainname)
+
+#define __NR_iopl 172
+__SYSCALL(__NR_iopl, stub_iopl)
+#define __NR_ioperm 173
+__SYSCALL(__NR_ioperm, sys_ioperm)
+
+#define __NR_create_module 174
+__SYSCALL(__NR_create_module, sys_ni_syscall)
+#define __NR_init_module 175
+__SYSCALL(__NR_init_module, sys_init_module)
+#define __NR_delete_module 176
+__SYSCALL(__NR_delete_module, sys_delete_module)
+#define __NR_get_kernel_syms 177
+__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
+#define __NR_query_module 178
+__SYSCALL(__NR_query_module, sys_ni_syscall)
+
+#define __NR_quotactl 179
+__SYSCALL(__NR_quotactl, sys_quotactl)
+
+#define __NR_nfsservctl 180
+__SYSCALL(__NR_nfsservctl, sys_nfsservctl)
+
+/* reserved for LiS/STREAMS */
+#define __NR_getpmsg 181
+__SYSCALL(__NR_getpmsg, sys_ni_syscall)
+#define __NR_putpmsg 182
+__SYSCALL(__NR_putpmsg, sys_ni_syscall)
+
+/* reserved for AFS */
+#define __NR_afs_syscall 183
+__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
+
+/* reserved for tux */
+#define __NR_tuxcall 184
+__SYSCALL(__NR_tuxcall, sys_ni_syscall)
+
+#define __NR_security 185
+__SYSCALL(__NR_security, sys_ni_syscall)
+
+#define __NR_gettid 186
+__SYSCALL(__NR_gettid, sys_gettid)
+
+#define __NR_readahead 187
+__SYSCALL(__NR_readahead, sys_readahead)
+#define __NR_setxattr 188
+__SYSCALL(__NR_setxattr, sys_setxattr)
+#define __NR_lsetxattr 189
+__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
+#define __NR_fsetxattr 190
+__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
+#define __NR_getxattr 191
+__SYSCALL(__NR_getxattr, sys_getxattr)
+#define __NR_lgetxattr 192
+__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
+#define __NR_fgetxattr 193
+__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
+#define __NR_listxattr 194
+__SYSCALL(__NR_listxattr, sys_listxattr)
+#define __NR_llistxattr 195
+__SYSCALL(__NR_llistxattr, sys_llistxattr)
+#define __NR_flistxattr 196
+__SYSCALL(__NR_flistxattr, sys_flistxattr)
+#define __NR_removexattr 197
+__SYSCALL(__NR_removexattr, sys_removexattr)
+#define __NR_lremovexattr 198
+__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
+#define __NR_fremovexattr 199
+__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
+#define __NR_tkill 200
+__SYSCALL(__NR_tkill, sys_tkill)
+#define __NR_time 201
+__SYSCALL(__NR_time, sys_time)
+#define __NR_futex 202
+__SYSCALL(__NR_futex, sys_futex)
+#define __NR_sched_setaffinity 203
+__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
+#define __NR_sched_getaffinity 204
+__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
+#define __NR_set_thread_area 205
+__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
+#define __NR_io_setup 206
+__SYSCALL(__NR_io_setup, sys_io_setup)
+#define __NR_io_destroy 207
+__SYSCALL(__NR_io_destroy, sys_io_destroy)
+#define __NR_io_getevents 208
+__SYSCALL(__NR_io_getevents, sys_io_getevents)
+#define __NR_io_submit 209
+__SYSCALL(__NR_io_submit, sys_io_submit)
+#define __NR_io_cancel 210
+__SYSCALL(__NR_io_cancel, sys_io_cancel)
+#define __NR_get_thread_area 211
+__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
+#define __NR_lookup_dcookie 212
+__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
+#define __NR_epoll_create 213
+__SYSCALL(__NR_epoll_create, sys_epoll_create)
+#define __NR_epoll_ctl_old 214
+__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
+#define __NR_epoll_wait_old 215
+__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
+#define __NR_remap_file_pages 216
+__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
+#define __NR_getdents64 217
+__SYSCALL(__NR_getdents64, sys_getdents64)
+#define __NR_set_tid_address 218
+__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
+#define __NR_restart_syscall 219
+__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
+#define __NR_semtimedop 220
+__SYSCALL(__NR_semtimedop, sys_semtimedop)
+#define __NR_fadvise64 221
+__SYSCALL(__NR_fadvise64, sys_fadvise64)
+#define __NR_timer_create 222
+__SYSCALL(__NR_timer_create, sys_timer_create)
+#define __NR_timer_settime 223
+__SYSCALL(__NR_timer_settime, sys_timer_settime)
+#define __NR_timer_gettime 224
+__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
+#define __NR_timer_getoverrun 225
+__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
+#define __NR_timer_delete 226
+__SYSCALL(__NR_timer_delete, sys_timer_delete)
+#define __NR_clock_settime 227
+__SYSCALL(__NR_clock_settime, sys_clock_settime)
+#define __NR_clock_gettime 228
+__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
+#define __NR_clock_getres 229
+__SYSCALL(__NR_clock_getres, sys_clock_getres)
+#define __NR_clock_nanosleep 230
+__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
+#define __NR_exit_group 231
+__SYSCALL(__NR_exit_group, sys_exit_group)
+#define __NR_epoll_wait 232
+__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
+#define __NR_epoll_ctl 233
+__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
+#define __NR_tgkill 234
+__SYSCALL(__NR_tgkill, sys_tgkill)
+#define __NR_utimes 235
+__SYSCALL(__NR_utimes, sys_utimes)
+#define __NR_vserver 236
+__SYSCALL(__NR_vserver, sys_ni_syscall)
+#define __NR_mbind 237
+__SYSCALL(__NR_mbind, sys_mbind)
+#define __NR_set_mempolicy 238
+__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
+#define __NR_get_mempolicy 239
+__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
+#define __NR_mq_open 240
+__SYSCALL(__NR_mq_open, sys_mq_open)
+#define __NR_mq_unlink 241
+__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
+#define __NR_mq_timedsend 242
+__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
+#define __NR_mq_timedreceive 243
+__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
+#define __NR_mq_notify 244
+__SYSCALL(__NR_mq_notify, sys_mq_notify)
+#define __NR_mq_getsetattr 245
+__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
+#define __NR_kexec_load 246
+__SYSCALL(__NR_kexec_load, sys_kexec_load)
+#define __NR_waitid 247
+__SYSCALL(__NR_waitid, sys_waitid)
+#define __NR_add_key 248
+__SYSCALL(__NR_add_key, sys_add_key)
+#define __NR_request_key 249
+__SYSCALL(__NR_request_key, sys_request_key)
+#define __NR_keyctl 250
+__SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_ioprio_set 251
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get 252
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
+#define __NR_inotify_init 253
+__SYSCALL(__NR_inotify_init, sys_inotify_init)
+#define __NR_inotify_add_watch 254
+__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
+#define __NR_inotify_rm_watch 255
+__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_migrate_pages 256
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
+#define __NR_openat 257
+__SYSCALL(__NR_openat, sys_openat)
+#define __NR_mkdirat 258
+__SYSCALL(__NR_mkdirat, sys_mkdirat)
+#define __NR_mknodat 259
+__SYSCALL(__NR_mknodat, sys_mknodat)
+#define __NR_fchownat 260
+__SYSCALL(__NR_fchownat, sys_fchownat)
+#define __NR_futimesat 261
+__SYSCALL(__NR_futimesat, sys_futimesat)
+#define __NR_newfstatat 262
+__SYSCALL(__NR_newfstatat, sys_newfstatat)
+#define __NR_unlinkat 263
+__SYSCALL(__NR_unlinkat, sys_unlinkat)
+#define __NR_renameat 264
+__SYSCALL(__NR_renameat, sys_renameat)
+#define __NR_linkat 265
+__SYSCALL(__NR_linkat, sys_linkat)
+#define __NR_symlinkat 266
+__SYSCALL(__NR_symlinkat, sys_symlinkat)
+#define __NR_readlinkat 267
+__SYSCALL(__NR_readlinkat, sys_readlinkat)
+#define __NR_fchmodat 268
+__SYSCALL(__NR_fchmodat, sys_fchmodat)
+#define __NR_faccessat 269
+__SYSCALL(__NR_faccessat, sys_faccessat)
+#define __NR_pselect6 270
+__SYSCALL(__NR_pselect6, sys_pselect6)
+#define __NR_ppoll 271
+__SYSCALL(__NR_ppoll, sys_ppoll)
+#define __NR_unshare 272
+__SYSCALL(__NR_unshare, sys_unshare)
+#define __NR_set_robust_list 273
+__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
+#define __NR_get_robust_list 274
+__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice 275
+__SYSCALL(__NR_splice, sys_splice)
+#define __NR_tee 276
+__SYSCALL(__NR_tee, sys_tee)
+#define __NR_sync_file_range 277
+__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice 278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
+#define __NR_move_pages 279
+__SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_utimensat 280
+__SYSCALL(__NR_utimensat, sys_utimensat)
+#define __IGNORE_getcpu /* implemented as a vsyscall */
+#define __NR_epoll_pwait 281
+__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
+#define __NR_signalfd 282
+__SYSCALL(__NR_signalfd, sys_signalfd)
+#define __NR_timerfd_create 283
+__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
+#define __NR_eventfd 284
+__SYSCALL(__NR_eventfd, sys_eventfd)
+#define __NR_fallocate 285
+__SYSCALL(__NR_fallocate, sys_fallocate)
+#define __NR_timerfd_settime 286
+__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
+#define __NR_timerfd_gettime 287
+__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
+#define __NR_accept4 288
+__SYSCALL(__NR_accept4, sys_accept4)
+#define __NR_signalfd4 289
+__SYSCALL(__NR_signalfd4, sys_signalfd4)
+#define __NR_eventfd2 290
+__SYSCALL(__NR_eventfd2, sys_eventfd2)
+#define __NR_epoll_create1 291
+__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
+#define __NR_dup3 292
+__SYSCALL(__NR_dup3, sys_dup3)
+#define __NR_pipe2 293
+__SYSCALL(__NR_pipe2, sys_pipe2)
+#define __NR_inotify_init1 294
+__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+
+
+#ifndef __NO_STUBS
+#define __ARCH_WANT_OLD_READDIR
+#define __ARCH_WANT_OLD_STAT
+#define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_PAUSE
+#define __ARCH_WANT_SYS_SGETMASK
+#define __ARCH_WANT_SYS_SIGNAL
+#define __ARCH_WANT_SYS_UTIME
+#define __ARCH_WANT_SYS_WAITPID
+#define __ARCH_WANT_SYS_SOCKETCALL
+#define __ARCH_WANT_SYS_FADVISE64
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
+#define __ARCH_WANT_SYS_NICE
+#define __ARCH_WANT_SYS_OLD_GETRLIMIT
+#define __ARCH_WANT_SYS_OLDUMOUNT
+#define __ARCH_WANT_SYS_SIGPENDING
+#define __ARCH_WANT_SYS_SIGPROCMASK
+#define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_COMPAT_SYS_TIME
+#endif /* __NO_STUBS */
+
+#ifdef __KERNEL__
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000000..8b064bd9c553
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#define UNW_PC(frame) ((void)(frame), 0UL)
+#define UNW_SP(frame) ((void)(frame), 0UL)
+#define UNW_FP(frame) ((void)(frame), 0UL)
+
+static inline int arch_unw_user_mode(const void *info)
+{
+ return 0;
+}
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
new file mode 100644
index 000000000000..999873b22e7f
--- /dev/null
+++ b/arch/x86/include/asm/user.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "user_32.h"
+#else
+# include "user_64.h"
+#endif
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
new file mode 100644
index 000000000000..14cbb73ebcba
--- /dev/null
+++ b/arch/x86/include/asm/user32.h
@@ -0,0 +1,70 @@
+#ifndef _ASM_X86_USER32_H
+#define _ASM_X86_USER32_H
+
+/* IA32 compatible user structures for ptrace.
+ * These should be used for 32bit coredumps too. */
+
+struct user_i387_ia32_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/* FSAVE frame with extensions */
+struct user32_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* not compatible to 64bit twd */
+ unsigned short fop;
+ int fip;
+ int fcs;
+ int foo;
+ int fos;
+ int mxcsr;
+ int reserved;
+ int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ int padding[56];
+};
+
+struct user_regs_struct32 {
+ __u32 ebx, ecx, edx, esi, edi, ebp, eax;
+ unsigned short ds, __ds, es, __es;
+ unsigned short fs, __fs, gs, __gs;
+ __u32 orig_eax, eip;
+ unsigned short cs, __cs;
+ __u32 eflags, esp;
+ unsigned short ss, __ss;
+};
+
+struct user32 {
+ struct user_regs_struct32 regs; /* Where the registers are actually stored */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_ia32_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ __u32 u_tsize; /* Text segment size (pages). */
+ __u32 u_dsize; /* Data segment size (pages). */
+ __u32 u_ssize; /* Stack segment size (pages). */
+ __u32 start_code; /* Starting virtual address of text. */
+ __u32 start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ __u32 signal; /* Signal that caused the core dump. */
+ int reserved; /* No __u32er used */
+ __u32 u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ __u32 u_fpstate; /* Math Co-processor pointer. */
+ __u32 magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+
+
+#endif /* _ASM_X86_USER32_H */
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
new file mode 100644
index 000000000000..bebfd8644016
--- /dev/null
+++ b/arch/x86/include/asm/user_32.h
@@ -0,0 +1,131 @@
+#ifndef _ASM_X86_USER_32_H
+#define _ASM_X86_USER_32_H
+
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user (under
+ linux we use the 'trad-core' bfd). There are quite a number of
+ obstacles to being able to view the contents of the floating point
+ registers, and until these are solved you will not be able to view the
+ contents of them. Actually, you can read in the core file and look at
+ the contents of the user struct to find out what the floating point
+ registers contain.
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes.
+*/
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ */
+
+struct user_i387_struct {
+ long cwd;
+ long swd;
+ long twd;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+struct user_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd;
+ unsigned short fop;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long mxcsr;
+ long reserved;
+ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ long padding[56];
+};
+
+/*
+ * This is the old layout of "struct pt_regs", and
+ * is still the layout used by user mode (the new
+ * pt_regs doesn't have all registers as the kernel
+ * doesn't use the extra segment registers)
+ */
+struct user_regs_struct {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+struct user{
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
+#endif /* _ASM_X86_USER_32_H */
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
new file mode 100644
index 000000000000..faf2cd3e0d76
--- /dev/null
+++ b/arch/x86/include/asm/user_64.h
@@ -0,0 +1,137 @@
+#ifndef _ASM_X86_USER_64_H
+#define _ASM_X86_USER_64_H
+
+#include <asm/types.h>
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user.
+ There are quite a number of obstacles to being able to view the
+ contents of the floating point registers, and until these are
+ solved you will not be able to view the contents of them.
+ Actually, you can read in the core file and look at the contents of
+ the user struct to find out what the floating point registers
+ contain.
+
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes. */
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ *
+ * x86-64 support by Andi Kleen.
+ */
+
+/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
+ as the 32bit format defined by Intel, except that the selector:offset pairs
+ for data and eip are replaced with flat 64bit pointers. */
+struct user_i387_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ __u32 padding[24];
+};
+
+/*
+ * Segment register layout in coredumps.
+ */
+struct user_regs_struct {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+ unsigned long fs_base;
+ unsigned long gs_base;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+
+struct user {
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ int pad0;
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ int pad1;
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ unsigned long u_debugreg[8];
+ unsigned long error_code; /* CPU error code or 0 */
+ unsigned long fault_address; /* CR3 or 0 */
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
+#endif /* _ASM_X86_USER_64_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
new file mode 100644
index 000000000000..7ed17ff502b9
--- /dev/null
+++ b/arch/x86/include/asm/uv/bios.h
@@ -0,0 +1,120 @@
+#ifndef _ASM_X86_UV_BIOS_H
+#define _ASM_X86_UV_BIOS_H
+
+/*
+ * UV BIOS layer definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/rtc.h>
+
+/*
+ * Values for the BIOS calls. It is passed as the first * argument in the
+ * BIOS call. Passing any other value in the first argument will result
+ * in a BIOS_STATUS_UNIMPLEMENTED return status.
+ */
+enum uv_bios_cmd {
+ UV_BIOS_COMMON,
+ UV_BIOS_GET_SN_INFO,
+ UV_BIOS_FREQ_BASE,
+ UV_BIOS_WATCHLIST_ALLOC,
+ UV_BIOS_WATCHLIST_FREE,
+ UV_BIOS_MEMPROTECT,
+ UV_BIOS_GET_PARTITION_ADDR
+};
+
+/*
+ * Status values returned from a BIOS call.
+ */
+enum {
+ BIOS_STATUS_MORE_PASSES = 1,
+ BIOS_STATUS_SUCCESS = 0,
+ BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
+ BIOS_STATUS_EINVAL = -EINVAL,
+ BIOS_STATUS_UNAVAIL = -EBUSY
+};
+
+/*
+ * The UV system table describes specific firmware
+ * capabilities available to the Linux kernel at runtime.
+ */
+struct uv_systab {
+ char signature[4]; /* must be "UVST" */
+ u32 revision; /* distinguish different firmware revs */
+ u64 function; /* BIOS runtime callback function ptr */
+};
+
+enum {
+ BIOS_FREQ_BASE_PLATFORM = 0,
+ BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
+ BIOS_FREQ_BASE_REALTIME_CLOCK = 2
+};
+
+union partition_info_u {
+ u64 val;
+ struct {
+ u64 hub_version : 8,
+ partition_id : 16,
+ coherence_id : 16,
+ region_size : 24;
+ };
+};
+
+union uv_watchlist_u {
+ u64 val;
+ struct {
+ u64 blade : 16,
+ size : 32,
+ filler : 16;
+ };
+};
+
+enum uv_memprotect {
+ UV_MEMPROT_RESTRICT_ACCESS,
+ UV_MEMPROT_ALLOW_AMO,
+ UV_MEMPROT_ALLOW_RW
+};
+
+/*
+ * bios calls have 6 parameters
+ */
+extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
+extern s64 uv_bios_freq_base(u64, u64 *);
+extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
+ unsigned long *);
+extern int uv_bios_mq_watchlist_free(int, int);
+extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
+extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
+
+extern void uv_bios_init(void);
+
+extern unsigned long sn_rtc_cycles_per_second;
+extern int uv_type;
+extern long sn_partition_id;
+extern long sn_coherency_id;
+extern long sn_region_size;
+#define partition_coherence_id() (sn_coherency_id)
+
+extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+
+#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
new file mode 100644
index 000000000000..e2363253bbbf
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -0,0 +1,332 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV Broadcast Assist Unit definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_BAU_H
+#define _ASM_X86_UV_UV_BAU_H
+
+#include <linux/bitmap.h>
+#define BITSPERBYTE 8
+
+/*
+ * Broadcast Assist Unit messaging structures
+ *
+ * Selective Broadcast activations are induced by software action
+ * specifying a particular 8-descriptor "set" via a 6-bit index written
+ * to an MMR.
+ * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
+ * each 6-bit index value. These descriptor sets are mapped in sequence
+ * starting with set 0 located at the address specified in the
+ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
+ * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
+ *
+ * We will use 31 sets, one for sending BAU messages from each of the 32
+ * cpu's on the node.
+ *
+ * TLB shootdown will use the first of the 8 descriptors of each set.
+ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
+ */
+
+#define UV_ITEMS_PER_DESCRIPTOR 8
+#define UV_CPUS_PER_ACT_STATUS 32
+#define UV_ACT_STATUS_MASK 0x3
+#define UV_ACT_STATUS_SIZE 2
+#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
+#define UV_DISTRIBUTION_SIZE 256
+#define UV_SW_ACK_NPENDING 8
+#define UV_NET_ENDPOINT_INTD 0x38
+#define UV_DESC_BASE_PNODE_SHIFT 49
+#define UV_PAYLOADQ_PNODE_SHIFT 49
+#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
+
+/*
+ * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
+ */
+#define DESC_STATUS_IDLE 0
+#define DESC_STATUS_ACTIVE 1
+#define DESC_STATUS_DESTINATION_TIMEOUT 2
+#define DESC_STATUS_SOURCE_TIMEOUT 3
+
+/*
+ * source side threshholds at which message retries print a warning
+ */
+#define SOURCE_TIMEOUT_LIMIT 20
+#define DESTINATION_TIMEOUT_LIMIT 20
+
+/*
+ * number of entries in the destination side payload queue
+ */
+#define DEST_Q_SIZE 17
+/*
+ * number of destination side software ack resources
+ */
+#define DEST_NUM_RESOURCES 8
+#define MAX_CPUS_PER_NODE 32
+/*
+ * completion statuses for sending a TLB flush message
+ */
+#define FLUSH_RETRY 1
+#define FLUSH_GIVEUP 2
+#define FLUSH_COMPLETE 3
+
+/*
+ * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
+ * If the 'multilevel' flag in the header portion of the descriptor
+ * has been set to 0, then endpoint multi-unicast mode is selected.
+ * The distribution specification (32 bytes) is interpreted as a 256-bit
+ * distribution vector. Adjacent bits correspond to consecutive even numbered
+ * nodeIDs. The result of adding the index of a given bit to the 15-bit
+ * 'base_dest_nodeid' field of the header corresponds to the
+ * destination nodeID associated with that specified bit.
+ */
+struct bau_target_nodemask {
+ unsigned long bits[BITS_TO_LONGS(256)];
+};
+
+/*
+ * mask of cpu's on a node
+ * (during initialization we need to check that unsigned long has
+ * enough bits for max. cpu's per node)
+ */
+struct bau_local_cpumask {
+ unsigned long bits;
+};
+
+/*
+ * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
+ * only 12 bytes (96 bits) of the payload area are usable.
+ * An additional 3 bytes (bits 27:4) of the header address are carried
+ * to the next bytes of the destination payload queue.
+ * And an additional 2 bytes of the header Suppl_A field are also
+ * carried to the destination payload queue.
+ * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
+ * of the destination payload queue, which is written by the hardware
+ * with the s/w ack resource bit vector.
+ * [ effective message contents (16 bytes (128 bits) maximum), not counting
+ * the s/w ack bit vector ]
+ */
+
+/*
+ * The payload is software-defined for INTD transactions
+ */
+struct bau_msg_payload {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
+ /* 64 bits */
+ unsigned short sending_cpu; /* filled in by sender */
+ /* 16 bits */
+ unsigned short acknowledge_count;/* filled in by destination */
+ /* 16 bits */
+ unsigned int reserved1:32; /* not usable */
+};
+
+
+/*
+ * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see table 4.2.3.0.1 in broacast_assist spec.
+ */
+struct bau_msg_header {
+ int dest_subnodeid:6; /* must be zero */
+ /* bits 5:0 */
+ int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
+ /* bits 20:6 */
+ int command:8; /* message type */
+ /* bits 28:21 */
+ /* 0x38: SN3net EndPoint Message */
+ int rsvd_1:3; /* must be zero */
+ /* bits 31:29 */
+ /* int will align on 32 bits */
+ int rsvd_2:9; /* must be zero */
+ /* bits 40:32 */
+ /* Suppl_A is 56-41 */
+ int payload_2a:8; /* becomes byte 16 of msg */
+ /* bits 48:41 */ /* not currently using */
+ int payload_2b:8; /* becomes byte 17 of msg */
+ /* bits 56:49 */ /* not currently using */
+ /* Address field (96:57) is never used as an
+ address (these are address bits 42:3) */
+ int rsvd_3:1; /* must be zero */
+ /* bit 57 */
+ /* address bits 27:4 are payload */
+ /* these 24 bits become bytes 12-14 of msg */
+ int replied_to:1; /* sent as 0 by the source to byte 12 */
+ /* bit 58 */
+
+ int payload_1a:5; /* not currently used */
+ /* bits 63:59 */
+ int payload_1b:8; /* not currently used */
+ /* bits 71:64 */
+ int payload_1c:8; /* not currently used */
+ /* bits 79:72 */
+ int payload_1d:2; /* not currently used */
+ /* bits 81:80 */
+
+ int rsvd_4:7; /* must be zero */
+ /* bits 88:82 */
+ int sw_ack_flag:1; /* software acknowledge flag */
+ /* bit 89 */
+ /* INTD trasactions at destination are to
+ wait for software acknowledge */
+ int rsvd_5:6; /* must be zero */
+ /* bits 95:90 */
+ int rsvd_6:5; /* must be zero */
+ /* bits 100:96 */
+ int int_both:1; /* if 1, interrupt both sockets on the blade */
+ /* bit 101*/
+ int fairness:3; /* usually zero */
+ /* bits 104:102 */
+ int multilevel:1; /* multi-level multicast format */
+ /* bit 105 */
+ /* 0 for TLB: endpoint multi-unicast messages */
+ int chaining:1; /* next descriptor is part of this activation*/
+ /* bit 106 */
+ int rsvd_7:21; /* must be zero */
+ /* bits 127:107 */
+};
+
+/*
+ * The activation descriptor:
+ * The format of the message to send, plus all accompanying control
+ * Should be 64 bytes
+ */
+struct bau_desc {
+ struct bau_target_nodemask distribution;
+ /*
+ * message template, consisting of header and payload:
+ */
+ struct bau_msg_header header;
+ struct bau_msg_payload payload;
+};
+/*
+ * -payload-- ---------header------
+ * bytes 0-11 bits 41-56 bits 58-81
+ * A B (2) C (3)
+ *
+ * A/B/C are moved to:
+ * A C B
+ * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
+ * ------------payload queue-----------
+ */
+
+/*
+ * The payload queue on the destination side is an array of these.
+ * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
+ * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
+ * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
+ * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
+ * sw_ack_vector and payload_2)
+ * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
+ * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
+ * operation."
+ */
+struct bau_payload_queue_entry {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
+ /* 64 bits, bytes 0-7 */
+
+ unsigned short sending_cpu; /* cpu that sent the message */
+ /* 16 bits, bytes 8-9 */
+
+ unsigned short acknowledge_count; /* filled in by destination */
+ /* 16 bits, bytes 10-11 */
+
+ unsigned short replied_to:1; /* sent as 0 by the source */
+ /* 1 bit */
+ unsigned short unused1:7; /* not currently using */
+ /* 7 bits: byte 12) */
+
+ unsigned char unused2[2]; /* not currently using */
+ /* bytes 13-14 */
+
+ unsigned char sw_ack_vector; /* filled in by the hardware */
+ /* byte 15 (bits 127:120) */
+
+ unsigned char unused4[3]; /* not currently using bytes 17-19 */
+ /* bytes 17-19 */
+
+ int number_of_cpus; /* filled in at destination */
+ /* 32 bits, bytes 20-23 (aligned) */
+
+ unsigned char unused5[8]; /* not using */
+ /* bytes 24-31 */
+};
+
+/*
+ * one for every slot in the destination payload queue
+ */
+struct bau_msg_status {
+ struct bau_local_cpumask seen_by; /* map of cpu's */
+};
+
+/*
+ * one for every slot in the destination software ack resources
+ */
+struct bau_sw_ack_status {
+ struct bau_payload_queue_entry *msg; /* associated message */
+ int watcher; /* cpu monitoring, or -1 */
+};
+
+/*
+ * one on every node and per-cpu; to locate the software tables
+ */
+struct bau_control {
+ struct bau_desc *descriptor_base;
+ struct bau_payload_queue_entry *bau_msg_head;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+ struct bau_msg_status *msg_statuses;
+ int *watching; /* pointer to array */
+};
+
+/*
+ * This structure is allocated per_cpu for UV TLB shootdown statistics.
+ */
+struct ptc_stats {
+ unsigned long ptc_i; /* number of IPI-style flushes */
+ unsigned long requestor; /* number of nodes this cpu sent to */
+ unsigned long requestee; /* times cpu was remotely requested */
+ unsigned long alltlb; /* times all tlb's on this cpu were flushed */
+ unsigned long onetlb; /* times just one tlb on this cpu was flushed */
+ unsigned long s_retry; /* retries on source side timeouts */
+ unsigned long d_retry; /* retries on destination side timeouts */
+ unsigned long sflush; /* cycles spent in uv_flush_tlb_others */
+ unsigned long dflush; /* cycles spent on destination side */
+ unsigned long retriesok; /* successes on retries */
+ unsigned long nomsg; /* interrupts with no message */
+ unsigned long multmsg; /* interrupts with multiple messages */
+ unsigned long ntargeted;/* nodes targeted */
+};
+
+static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+{
+ return constant_test_bit(node, &dstp->bits[0]);
+}
+static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+{
+ __set_bit(node, &dstp->bits[0]);
+}
+static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+{
+ bitmap_zero(&dstp->bits[0], nbits);
+}
+
+static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
+{
+ bitmap_zero(&dstp->bits, nbits);
+}
+
+#define cpubit_isset(cpu, bau_local_cpumask) \
+ test_bit((cpu), (bau_local_cpumask).bits)
+
+extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
+extern void uv_bau_message_intr1(void);
+extern void uv_bau_timeout_intr1(void);
+
+#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
new file mode 100644
index 000000000000..777327ef05c1
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -0,0 +1,404 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV architectural definitions
+ *
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_HUB_H
+#define _ASM_X86_UV_UV_HUB_H
+
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <asm/types.h>
+#include <asm/percpu.h>
+
+
+/*
+ * Addressing Terminology
+ *
+ * M - The low M bits of a physical address represent the offset
+ * into the blade local memory. RAM memory on a blade is physically
+ * contiguous (although various IO spaces may punch holes in
+ * it)..
+ *
+ * N - Number of bits in the node portion of a socket physical
+ * address.
+ *
+ * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * routers always have low bit of 1, C/MBricks have low bit
+ * equal to 0. Most addressing macros that target UV hub chips
+ * right shift the NASID by 1 to exclude the always-zero bit.
+ * NASIDs contain up to 15 bits.
+ *
+ * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
+ * of nasids.
+ *
+ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
+ * of the nasid for socket usage.
+ *
+ *
+ * NumaLink Global Physical Address Format:
+ * +--------------------------------+---------------------+
+ * |00..000| GNODE | NodeOffset |
+ * +--------------------------------+---------------------+
+ * |<-------53 - M bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ *
+ *
+ * Memory/UV-HUB Processor Socket Address Format:
+ * +----------------+---------------+---------------------+
+ * |00..000000000000| PNODE | NodeOffset |
+ * +----------------+---------------+---------------------+
+ * <--- N bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ * N - number of PNODE bits (0 .. 10)
+ *
+ * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
+ * The actual values are configuration dependent and are set at
+ * boot time. M & N values are set by the hardware/BIOS at boot.
+ *
+ *
+ * APICID format
+ * NOTE!!!!!! This is the current format of the APICID. However, code
+ * should assume that this will change in the future. Use functions
+ * in this file for all APICID bit manipulations and conversion.
+ *
+ * 1111110000000000
+ * 5432109876543210
+ * pppppppppplc0cch
+ * sssssssssss
+ *
+ * p = pnode bits
+ * l = socket number on board
+ * c = core
+ * h = hyperthread
+ * s = bits that are in the SOCKET_ID CSR
+ *
+ * Note: Processor only supports 12 bits in the APICID register. The ACPI
+ * tables hold all 16 bits. Software needs to be aware of this.
+ *
+ * Unless otherwise specified, all references to APICID refer to
+ * the FULL value contained in ACPI tables, not the subset in the
+ * processor APICID register.
+ */
+
+
+/*
+ * Maximum number of bricks in all partitions and in all coherency domains.
+ * This is the total number of bricks accessible in the numalink fabric. It
+ * includes all C & M bricks. Routers are NOT included.
+ *
+ * This value is also the value of the maximum number of non-router NASIDs
+ * in the numalink fabric.
+ *
+ * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
+ */
+#define UV_MAX_NUMALINK_BLADES 16384
+
+/*
+ * Maximum number of C/Mbricks within a software SSI (hardware may support
+ * more).
+ */
+#define UV_MAX_SSI_BLADES 256
+
+/*
+ * The largest possible NASID of a C or M brick (+ 2)
+ */
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
+
+struct uv_scir_s {
+ struct timer_list timer;
+ unsigned long offset;
+ unsigned long last;
+ unsigned long idle_on;
+ unsigned long idle_off;
+ unsigned char state;
+ unsigned char enabled;
+};
+
+/*
+ * The following defines attributes of the HUB chip. These attributes are
+ * frequently referenced and are kept in the per-cpu data areas of each cpu.
+ * They are kept together in a struct to minimize cache misses.
+ */
+struct uv_hub_info_s {
+ unsigned long global_mmr_base;
+ unsigned long gpa_mask;
+ unsigned long gnode_upper;
+ unsigned long lowmem_remap_top;
+ unsigned long lowmem_remap_base;
+ unsigned short pnode;
+ unsigned short pnode_mask;
+ unsigned short coherency_domain_number;
+ unsigned short numa_blade_id;
+ unsigned char blade_processor_id;
+ unsigned char m_val;
+ unsigned char n_val;
+ struct uv_scir_s scir;
+};
+
+DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
+#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
+#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+
+/*
+ * Local & Global MMR space macros.
+ * Note: macros are intended to be used ONLY by inline functions
+ * in this file - not by other kernel code.
+ * n - NASID (full 15-bit global nasid)
+ * g - GNODE (full 15-bit global nasid, right shifted 1)
+ * p - PNODE (local part of nsids, right shifted 1)
+ */
+#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
+#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
+
+#define UV_LOCAL_MMR_BASE 0xf4000000UL
+#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
+#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
+#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
+#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
+
+#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
+
+#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
+ ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+
+#define UV_APIC_PNODE_SHIFT 6
+
+/* Local Bus from cpu's perspective */
+#define LOCAL_BUS_BASE 0x1c00000
+#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
+
+/*
+ * System Controller Interface Reg
+ *
+ * Note there are NO leds on a UV system. This register is only
+ * used by the system controller to monitor system-wide operation.
+ * There are 64 regs per node. With Nahelem cpus (2 cores per node,
+ * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
+ * a node.
+ *
+ * The window is located at top of ACPI MMR space
+ */
+#define SCIR_WINDOW_COUNT 64
+#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \
+ LOCAL_BUS_SIZE - \
+ SCIR_WINDOW_COUNT)
+
+#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */
+#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
+#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
+
+/*
+ * Macros for converting between kernel virtual addresses, socket local physical
+ * addresses, and UV global physical addresses.
+ * Note: use the standard __pa() & __va() macros for converting
+ * between socket virtual and socket physical addresses.
+ */
+
+/* socket phys RAM --> UV global physical address */
+static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
+{
+ if (paddr < uv_hub_info->lowmem_remap_top)
+ paddr |= uv_hub_info->lowmem_remap_base;
+ return paddr | uv_hub_info->gnode_upper;
+}
+
+
+/* socket virtual --> UV global physical address */
+static inline unsigned long uv_gpa(void *v)
+{
+ return uv_soc_phys_ram_to_gpa(__pa(v));
+}
+
+/* pnode, offset --> socket virtual */
+static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
+{
+ return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
+}
+
+
+/*
+ * Extract a PNODE from an APICID (full apicid, not processor subset)
+ */
+static inline int uv_apicid_to_pnode(int apicid)
+{
+ return (apicid >> UV_APIC_PNODE_SHIFT);
+}
+
+/*
+ * Access global MMRs using the low memory MMR32 space. This region supports
+ * faster MMR access but not all MMRs are accessible in this space.
+ */
+static inline unsigned long *uv_global_mmr32_address(int pnode,
+ unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR32_BASE |
+ UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
+ unsigned long val)
+{
+ *uv_global_mmr32_address(pnode, offset) = val;
+}
+
+static inline unsigned long uv_read_global_mmr32(int pnode,
+ unsigned long offset)
+{
+ return *uv_global_mmr32_address(pnode, offset);
+}
+
+/*
+ * Access Global MMR space using the MMR space located at the top of physical
+ * memory.
+ */
+static inline unsigned long *uv_global_mmr64_address(int pnode,
+ unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR64_BASE |
+ UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
+ unsigned long val)
+{
+ *uv_global_mmr64_address(pnode, offset) = val;
+}
+
+static inline unsigned long uv_read_global_mmr64(int pnode,
+ unsigned long offset)
+{
+ return *uv_global_mmr64_address(pnode, offset);
+}
+
+/*
+ * Access hub local MMRs. Faster than using global space but only local MMRs
+ * are accessible.
+ */
+static inline unsigned long *uv_local_mmr_address(unsigned long offset)
+{
+ return __va(UV_LOCAL_MMR_BASE | offset);
+}
+
+static inline unsigned long uv_read_local_mmr(unsigned long offset)
+{
+ return *uv_local_mmr_address(offset);
+}
+
+static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
+{
+ *uv_local_mmr_address(offset) = val;
+}
+
+static inline unsigned char uv_read_local_mmr8(unsigned long offset)
+{
+ return *((unsigned char *)uv_local_mmr_address(offset));
+}
+
+static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
+{
+ *((unsigned char *)uv_local_mmr_address(offset)) = val;
+}
+
+/*
+ * Structures and definitions for converting between cpu, node, pnode, and blade
+ * numbers.
+ */
+struct uv_blade_info {
+ unsigned short nr_possible_cpus;
+ unsigned short nr_online_cpus;
+ unsigned short pnode;
+};
+extern struct uv_blade_info *uv_blade_info;
+extern short *uv_node_to_blade;
+extern short *uv_cpu_to_blade;
+extern short uv_possible_blades;
+
+/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
+static inline int uv_blade_processor_id(void)
+{
+ return uv_hub_info->blade_processor_id;
+}
+
+/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
+static inline int uv_numa_blade_id(void)
+{
+ return uv_hub_info->numa_blade_id;
+}
+
+/* Convert a cpu number to the the UV blade number */
+static inline int uv_cpu_to_blade_id(int cpu)
+{
+ return uv_cpu_to_blade[cpu];
+}
+
+/* Convert linux node number to the UV blade number */
+static inline int uv_node_to_blade_id(int nid)
+{
+ return uv_node_to_blade[nid];
+}
+
+/* Convert a blade id to the PNODE of the blade */
+static inline int uv_blade_to_pnode(int bid)
+{
+ return uv_blade_info[bid].pnode;
+}
+
+/* Determine the number of possible cpus on a blade */
+static inline int uv_blade_nr_possible_cpus(int bid)
+{
+ return uv_blade_info[bid].nr_possible_cpus;
+}
+
+/* Determine the number of online cpus on a blade */
+static inline int uv_blade_nr_online_cpus(int bid)
+{
+ return uv_blade_info[bid].nr_online_cpus;
+}
+
+/* Convert a cpu id to the PNODE of the blade containing the cpu */
+static inline int uv_cpu_to_pnode(int cpu)
+{
+ return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
+}
+
+/* Convert a linux node number to the PNODE of the blade */
+static inline int uv_node_to_pnode(int nid)
+{
+ return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
+}
+
+/* Maximum possible number of blades */
+static inline int uv_num_possible_blades(void)
+{
+ return uv_possible_blades;
+}
+
+/* Update SCIR state */
+static inline void uv_set_scir_bits(unsigned char value)
+{
+ if (uv_hub_info->scir.state != value) {
+ uv_hub_info->scir.state = value;
+ uv_write_local_mmr8(uv_hub_info->scir.offset, value);
+ }
+}
+static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
+{
+ if (uv_cpu_hub_info(cpu)->scir.state != value) {
+ uv_cpu_hub_info(cpu)->scir.state = value;
+ uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
+ }
+}
+
+#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
new file mode 100644
index 000000000000..9613c8c0b647
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -0,0 +1,36 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_IRQ_H
+#define _ASM_X86_UV_UV_IRQ_H
+
+/* If a generic version of this structure gets defined, eliminate this one. */
+struct uv_IO_APIC_route_entry {
+ __u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode : 1,
+ delivery_status : 1,
+ polarity : 1,
+ __reserved_1 : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15,
+ dest : 32;
+};
+
+extern struct irq_chip uv_irq_chip;
+
+extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
+extern void arch_disable_uv_irq(int, unsigned long);
+
+extern int uv_setup_irq(char *, int, int, unsigned long);
+extern void uv_teardown_irq(unsigned int, int, unsigned long);
+
+#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
new file mode 100644
index 000000000000..dd627793a234
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -0,0 +1,1295 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV MMR definitions
+ *
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_MMRS_H
+#define _ASM_X86_UV_UV_MMRS_H
+
+#define UV_MMR_ENABLE (1UL << 63)
+
+/* ========================================================================= */
+/* UVH_BAU_DATA_CONFIG */
+/* ========================================================================= */
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x0438
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_bau_data_config_u {
+ unsigned long v;
+ struct uvh_bau_data_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0 */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x005e8
+
+#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+union uvh_event_occurred0_u {
+ unsigned long v;
+ struct uvh_event_occurred0_s {
+ unsigned long lb_hcerr : 1; /* RW, W1C */
+ unsigned long gr0_hcerr : 1; /* RW, W1C */
+ unsigned long gr1_hcerr : 1; /* RW, W1C */
+ unsigned long lh_hcerr : 1; /* RW, W1C */
+ unsigned long rh_hcerr : 1; /* RW, W1C */
+ unsigned long xn_hcerr : 1; /* RW, W1C */
+ unsigned long si_hcerr : 1; /* RW, W1C */
+ unsigned long lb_aoerr0 : 1; /* RW, W1C */
+ unsigned long gr0_aoerr0 : 1; /* RW, W1C */
+ unsigned long gr1_aoerr0 : 1; /* RW, W1C */
+ unsigned long lh_aoerr0 : 1; /* RW, W1C */
+ unsigned long rh_aoerr0 : 1; /* RW, W1C */
+ unsigned long xn_aoerr0 : 1; /* RW, W1C */
+ unsigned long si_aoerr0 : 1; /* RW, W1C */
+ unsigned long lb_aoerr1 : 1; /* RW, W1C */
+ unsigned long gr0_aoerr1 : 1; /* RW, W1C */
+ unsigned long gr1_aoerr1 : 1; /* RW, W1C */
+ unsigned long lh_aoerr1 : 1; /* RW, W1C */
+ unsigned long rh_aoerr1 : 1; /* RW, W1C */
+ unsigned long xn_aoerr1 : 1; /* RW, W1C */
+ unsigned long si_aoerr1 : 1; /* RW, W1C */
+ unsigned long rh_vpi_int : 1; /* RW, W1C */
+ unsigned long system_shutdown_int : 1; /* RW, W1C */
+ unsigned long lb_irq_int_0 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_1 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_2 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_3 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_4 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_5 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_6 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_7 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_8 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_9 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_10 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_11 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_12 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_13 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_14 : 1; /* RW, W1C */
+ unsigned long lb_irq_int_15 : 1; /* RW, W1C */
+ unsigned long l1_nmi_int : 1; /* RW, W1C */
+ unsigned long stop_clock : 1; /* RW, W1C */
+ unsigned long asic_to_l1 : 1; /* RW, W1C */
+ unsigned long l1_to_asic : 1; /* RW, W1C */
+ unsigned long ltc_int : 1; /* RW, W1C */
+ unsigned long la_seq_trigger : 1; /* RW, W1C */
+ unsigned long ipi_int : 1; /* RW, W1C */
+ unsigned long extio_int0 : 1; /* RW, W1C */
+ unsigned long extio_int1 : 1; /* RW, W1C */
+ unsigned long extio_int2 : 1; /* RW, W1C */
+ unsigned long extio_int3 : 1; /* RW, W1C */
+ unsigned long profile_int : 1; /* RW, W1C */
+ unsigned long rtc0 : 1; /* RW, W1C */
+ unsigned long rtc1 : 1; /* RW, W1C */
+ unsigned long rtc2 : 1; /* RW, W1C */
+ unsigned long rtc3 : 1; /* RW, W1C */
+ unsigned long bau_data : 1; /* RW, W1C */
+ unsigned long power_management_req : 1; /* RW, W1C */
+ unsigned long rsvd_57_63 : 7; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0_ALIAS */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+
+/* ========================================================================= */
+/* UVH_INT_CMPB */
+/* ========================================================================= */
+#define UVH_INT_CMPB 0x22080UL
+
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpb_u {
+ unsigned long v;
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPC */
+/* ========================================================================= */
+#define UVH_INT_CMPC 0x22100UL
+
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpc_u {
+ unsigned long v;
+ struct uvh_int_cmpc_s {
+ unsigned long real_time_cmpc : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPD */
+/* ========================================================================= */
+#define UVH_INT_CMPD 0x22180UL
+
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+
+union uvh_int_cmpd_u {
+ unsigned long v;
+ struct uvh_int_cmpd_s {
+ unsigned long real_time_cmpd : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_IPI_INT */
+/* ========================================================================= */
+#define UVH_IPI_INT 0x60500UL
+#define UVH_IPI_INT_32 0x0348
+
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+
+union uvh_ipi_int_u {
+ unsigned long v;
+ struct uvh_ipi_int_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long delivery_mode : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long rsvd_12_15 : 4; /* */
+ unsigned long apic_id : 32; /* RW */
+ unsigned long rsvd_48_62 : 15; /* */
+ unsigned long send : 1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+
+union uvh_lb_bau_intd_payload_queue_first_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_48: 6; /* */
+ unsigned long node_id : 14; /* RW */
+ unsigned long rsvd_63 : 1; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+
+union uvh_lb_bau_intd_payload_queue_last_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_63: 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+
+union uvh_lb_bau_intd_payload_queue_tail_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3: 4; /* */
+ unsigned long address : 39; /* RW */
+ unsigned long rsvd_43_63: 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
+
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+union uvh_lb_bau_intd_software_acknowledge_u {
+ unsigned long v;
+ struct uvh_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0 : 1; /* RW, W1C */
+ unsigned long pending_1 : 1; /* RW, W1C */
+ unsigned long pending_2 : 1; /* RW, W1C */
+ unsigned long pending_3 : 1; /* RW, W1C */
+ unsigned long pending_4 : 1; /* RW, W1C */
+ unsigned long pending_5 : 1; /* RW, W1C */
+ unsigned long pending_6 : 1; /* RW, W1C */
+ unsigned long pending_7 : 1; /* RW, W1C */
+ unsigned long timeout_0 : 1; /* RW, W1C */
+ unsigned long timeout_1 : 1; /* RW, W1C */
+ unsigned long timeout_2 : 1; /* RW, W1C */
+ unsigned long timeout_3 : 1; /* RW, W1C */
+ unsigned long timeout_4 : 1; /* RW, W1C */
+ unsigned long timeout_5 : 1; /* RW, W1C */
+ unsigned long timeout_6 : 1; /* RW, W1C */
+ unsigned long timeout_7 : 1; /* RW, W1C */
+ unsigned long rsvd_16_63: 48; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
+/* ========================================================================= */
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+
+union uvh_lb_bau_sb_activation_control_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_control_s {
+ unsigned long index : 6; /* RW */
+ unsigned long rsvd_6_61: 56; /* */
+ unsigned long push : 1; /* WP */
+ unsigned long init : 1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+
+union uvh_lb_bau_sb_activation_status_0_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_0_s {
+ unsigned long status : 64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+
+union uvh_lb_bau_sb_activation_status_1_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_1_s {
+ unsigned long status : 64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
+/* ========================================================================= */
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+union uvh_lb_bau_sb_descriptor_base_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_descriptor_base_s {
+ unsigned long rsvd_0_11 : 12; /* */
+ unsigned long page_address : 31; /* RW */
+ unsigned long rsvd_43_48 : 6; /* */
+ unsigned long node_id : 14; /* RW */
+ unsigned long rsvd_63 : 1; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
+/* ========================================================================= */
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
+
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
+#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
+
+union uvh_lb_mcast_aoerr0_rpt_enable_u {
+ unsigned long v;
+ struct uvh_lb_mcast_aoerr0_rpt_enable_s {
+ unsigned long mcast_obese_msg : 1; /* RW */
+ unsigned long mcast_data_sb_err : 1; /* RW */
+ unsigned long mcast_nack_buff_parity : 1; /* RW */
+ unsigned long mcast_timeout : 1; /* RW */
+ unsigned long mcast_inactive_reply : 1; /* RW */
+ unsigned long mcast_upgrade_error : 1; /* RW */
+ unsigned long mcast_reg_count_underflow : 1; /* RW */
+ unsigned long mcast_rep_obese_msg : 1; /* RW */
+ unsigned long ucache_req_runt_msg : 1; /* RW */
+ unsigned long ucache_req_obese_msg : 1; /* RW */
+ unsigned long ucache_req_data_sb_err : 1; /* RW */
+ unsigned long ucache_rep_runt_msg : 1; /* RW */
+ unsigned long ucache_rep_obese_msg : 1; /* RW */
+ unsigned long ucache_rep_data_sb_err : 1; /* RW */
+ unsigned long ucache_rep_command_err : 1; /* RW */
+ unsigned long ucache_pend_timeout : 1; /* RW */
+ unsigned long macc_req_runt_msg : 1; /* RW */
+ unsigned long macc_req_obese_msg : 1; /* RW */
+ unsigned long macc_req_data_sb_err : 1; /* RW */
+ unsigned long macc_rep_runt_msg : 1; /* RW */
+ unsigned long macc_rep_obese_msg : 1; /* RW */
+ unsigned long macc_rep_data_sb_err : 1; /* RW */
+ unsigned long macc_amo_timeout : 1; /* RW */
+ unsigned long macc_put_timeout : 1; /* RW */
+ unsigned long macc_spurious_event : 1; /* RW */
+ unsigned long ioh_destination_table_parity : 1; /* RW */
+ unsigned long get_had_error_reply : 1; /* RW */
+ unsigned long get_timeout : 1; /* RW */
+ unsigned long lock_manager_had_error_reply : 1; /* RW */
+ unsigned long put_had_error_reply : 1; /* RW */
+ unsigned long put_timeout : 1; /* RW */
+ unsigned long sb_activation_overrun : 1; /* RW */
+ unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
+ unsigned long completed_gb_activation_timeout : 1; /* RW */
+ unsigned long descriptor_buffer_0_parity : 1; /* RW */
+ unsigned long descriptor_buffer_1_parity : 1; /* RW */
+ unsigned long socket_destination_table_parity : 1; /* RW */
+ unsigned long bau_reply_payload_corruption : 1; /* RW */
+ unsigned long io_port_destination_table_parity : 1; /* RW */
+ unsigned long intd_soft_ack_timeout : 1; /* RW */
+ unsigned long int_rep_obese_msg : 1; /* RW */
+ unsigned long int_rep_command_err : 1; /* RW */
+ unsigned long int_timeout : 1; /* RW */
+ unsigned long rsvd_43_63 : 21; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LOCAL_INT0_CONFIG */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_CONFIG 0x61000UL
+
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
+#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
+#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
+#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
+#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_local_int0_config_u {
+ unsigned long v;
+ struct uvh_local_int0_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LOCAL_INT0_ENABLE */
+/* ========================================================================= */
+#define UVH_LOCAL_INT0_ENABLE 0x65000UL
+
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
+#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
+#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
+#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
+#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
+#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
+#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
+#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
+#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
+#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
+#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
+#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
+#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
+#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
+#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
+#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
+#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
+#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
+#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
+#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
+#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
+#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
+#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+
+union uvh_local_int0_enable_u {
+ unsigned long v;
+ struct uvh_local_int0_enable_s {
+ unsigned long lb_hcerr : 1; /* RW */
+ unsigned long gr0_hcerr : 1; /* RW */
+ unsigned long gr1_hcerr : 1; /* RW */
+ unsigned long lh_hcerr : 1; /* RW */
+ unsigned long rh_hcerr : 1; /* RW */
+ unsigned long xn_hcerr : 1; /* RW */
+ unsigned long si_hcerr : 1; /* RW */
+ unsigned long lb_aoerr0 : 1; /* RW */
+ unsigned long gr0_aoerr0 : 1; /* RW */
+ unsigned long gr1_aoerr0 : 1; /* RW */
+ unsigned long lh_aoerr0 : 1; /* RW */
+ unsigned long rh_aoerr0 : 1; /* RW */
+ unsigned long xn_aoerr0 : 1; /* RW */
+ unsigned long si_aoerr0 : 1; /* RW */
+ unsigned long lb_aoerr1 : 1; /* RW */
+ unsigned long gr0_aoerr1 : 1; /* RW */
+ unsigned long gr1_aoerr1 : 1; /* RW */
+ unsigned long lh_aoerr1 : 1; /* RW */
+ unsigned long rh_aoerr1 : 1; /* RW */
+ unsigned long xn_aoerr1 : 1; /* RW */
+ unsigned long si_aoerr1 : 1; /* RW */
+ unsigned long rh_vpi_int : 1; /* RW */
+ unsigned long system_shutdown_int : 1; /* RW */
+ unsigned long lb_irq_int_0 : 1; /* RW */
+ unsigned long lb_irq_int_1 : 1; /* RW */
+ unsigned long lb_irq_int_2 : 1; /* RW */
+ unsigned long lb_irq_int_3 : 1; /* RW */
+ unsigned long lb_irq_int_4 : 1; /* RW */
+ unsigned long lb_irq_int_5 : 1; /* RW */
+ unsigned long lb_irq_int_6 : 1; /* RW */
+ unsigned long lb_irq_int_7 : 1; /* RW */
+ unsigned long lb_irq_int_8 : 1; /* RW */
+ unsigned long lb_irq_int_9 : 1; /* RW */
+ unsigned long lb_irq_int_10 : 1; /* RW */
+ unsigned long lb_irq_int_11 : 1; /* RW */
+ unsigned long lb_irq_int_12 : 1; /* RW */
+ unsigned long lb_irq_int_13 : 1; /* RW */
+ unsigned long lb_irq_int_14 : 1; /* RW */
+ unsigned long lb_irq_int_15 : 1; /* RW */
+ unsigned long l1_nmi_int : 1; /* RW */
+ unsigned long stop_clock : 1; /* RW */
+ unsigned long asic_to_l1 : 1; /* RW */
+ unsigned long l1_to_asic : 1; /* RW */
+ unsigned long ltc_int : 1; /* RW */
+ unsigned long la_seq_trigger : 1; /* RW */
+ unsigned long rsvd_45_63 : 19; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_NODE_ID */
+/* ========================================================================= */
+#define UVH_NODE_ID 0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UVH_NODE_ID_NI_PORT_SHFT 56
+#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+union uvh_node_id_u {
+ unsigned long v;
+ struct uvh_node_id_s {
+ unsigned long force1 : 1; /* RO */
+ unsigned long manufacturer : 11; /* RO */
+ unsigned long part_number : 16; /* RO */
+ unsigned long revision : 4; /* RO */
+ unsigned long node_id : 15; /* RW */
+ unsigned long rsvd_47 : 1; /* */
+ unsigned long nodes_per_bit : 7; /* RW */
+ unsigned long rsvd_55 : 1; /* */
+ unsigned long ni_port : 4; /* RO */
+ unsigned long rsvd_60_63 : 4; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_NODE_PRESENT_TABLE */
+/* ========================================================================= */
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+
+union uvh_node_present_table_u {
+ unsigned long v;
+ struct uvh_node_present_table_s {
+ unsigned long nodes : 64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23 : 24; /* */
+ unsigned long dest_base : 22; /* RW */
+ unsigned long rsvd_46_63: 18; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
+
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_cfg_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_cfg_overlay_config_mmr_s {
+ unsigned long rsvd_0_25: 26; /* */
+ unsigned long base : 20; /* RW */
+ unsigned long rsvd_46_62: 17; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_gru_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27: 28; /* */
+ unsigned long base : 18; /* RW */
+ unsigned long rsvd_46_47: 2; /* */
+ unsigned long gr4 : 1; /* RW */
+ unsigned long rsvd_49_51: 3; /* */
+ unsigned long n_gru : 4; /* RW */
+ unsigned long rsvd_56_62: 7; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_mmioh_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_29: 30; /* */
+ unsigned long base : 16; /* RW */
+ unsigned long m_io : 6; /* RW */
+ unsigned long n_io : 4; /* RW */
+ unsigned long rsvd_56_62: 7; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_mmr_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25: 26; /* */
+ unsigned long base : 20; /* RW */
+ unsigned long dual_hub : 1; /* RW */
+ unsigned long rsvd_47_62: 16; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC */
+/* ========================================================================= */
+#define UVH_RTC 0x340000UL
+
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+
+union uvh_rtc_u {
+ unsigned long v;
+ struct uvh_rtc_s {
+ unsigned long real_time_clock : 56; /* RW */
+ unsigned long rsvd_56_63 : 8; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC1_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc1_int_config_u {
+ unsigned long v;
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC2_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC2_INT_CONFIG 0x61600UL
+
+#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC2_INT_CONFIG_P_SHFT 13
+#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC2_INT_CONFIG_T_SHFT 15
+#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC2_INT_CONFIG_M_SHFT 16
+#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc2_int_config_u {
+ unsigned long v;
+ struct uvh_rtc2_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC3_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC3_INT_CONFIG 0x61640UL
+
+#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC3_INT_CONFIG_P_SHFT 13
+#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC3_INT_CONFIG_T_SHFT 15
+#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC3_INT_CONFIG_M_SHFT 16
+#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+union uvh_rtc3_int_config_u {
+ unsigned long v;
+ struct uvh_rtc3_int_config_s {
+ unsigned long vector_ : 8; /* RW */
+ unsigned long dm : 3; /* RW */
+ unsigned long destmode : 1; /* RW */
+ unsigned long status : 1; /* RO */
+ unsigned long p : 1; /* RO */
+ unsigned long rsvd_14 : 1; /* */
+ unsigned long t : 1; /* RO */
+ unsigned long m : 1; /* RW */
+ unsigned long rsvd_17_31: 15; /* */
+ unsigned long apic_id : 32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC_INC_RATIO */
+/* ========================================================================= */
+#define UVH_RTC_INC_RATIO 0x350000UL
+
+#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
+#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
+#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
+#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
+
+union uvh_rtc_inc_ratio_u {
+ unsigned long v;
+ struct uvh_rtc_inc_ratio_s {
+ unsigned long fraction : 20; /* RW */
+ unsigned long ratio : 3; /* RW */
+ unsigned long rsvd_23_63: 41; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ADDR_MAP_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
+
+#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
+#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
+#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
+
+union uvh_si_addr_map_config_u {
+ unsigned long v;
+ struct uvh_si_addr_map_config_s {
+ unsigned long m_skt : 6; /* RW */
+ unsigned long rsvd_6_7: 2; /* */
+ unsigned long n_skt : 4; /* RW */
+ unsigned long rsvd_12_63: 52; /* */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
+
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias0_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias0_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
+
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias1_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias1_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
+/* ========================================================================= */
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
+
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
+#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_si_alias2_overlay_config_u {
+ unsigned long v;
+ struct uvh_si_alias2_overlay_config_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
new file mode 100644
index 000000000000..9064052b73de
--- /dev/null
+++ b/arch/x86/include/asm/vdso.h
@@ -0,0 +1,47 @@
+#ifndef _ASM_X86_VDSO_H
+#define _ASM_X86_VDSO_H
+
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO64_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO64_SYMBOL(base, name) \
+({ \
+ extern const char VDSO64_##name[]; \
+ (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+extern const char VDSO32_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO32_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO32_SYMBOL(base, name) \
+({ \
+ extern const char VDSO32_##name[]; \
+ (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
+/*
+ * These symbols are defined with the addresses in the vsyscall page.
+ * See vsyscall-sigreturn.S.
+ */
+extern void __user __kernel_sigreturn;
+extern void __user __kernel_rt_sigreturn;
+
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_int80_start, vdso32_int80_end;
+extern const char vdso32_syscall_start, vdso32_syscall_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+
+#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
new file mode 100644
index 000000000000..c4b9dc2f67c5
--- /dev/null
+++ b/arch/x86/include/asm/vga.h
@@ -0,0 +1,20 @@
+/*
+ * Access to VGA videoram
+ *
+ * (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _ASM_X86_VGA_H
+#define _ASM_X86_VGA_H
+
+/*
+ * On the PC, we can just recalculate addresses and then
+ * access the videoram directly without any black magic.
+ */
+
+#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
+
+#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
new file mode 100644
index 000000000000..dc27a69e5d2a
--- /dev/null
+++ b/arch/x86/include/asm/vgtod.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_VGTOD_H
+#define _ASM_X86_VGTOD_H
+
+#include <asm/vsyscall.h>
+#include <linux/clocksource.h>
+
+struct vsyscall_gtod_data {
+ seqlock_t lock;
+
+ /* open coded 'struct timespec' */
+ time_t wall_time_sec;
+ u32 wall_time_nsec;
+
+ int sysctl_enabled;
+ struct timezone sys_tz;
+ struct { /* extract of a clocksource struct */
+ cycle_t (*vread)(void);
+ cycle_t cycle_last;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+ struct timespec wall_to_monotonic;
+};
+extern struct vsyscall_gtod_data __vsyscall_gtod_data
+__section_vsyscall_gtod_data;
+extern struct vsyscall_gtod_data vsyscall_gtod_data;
+
+#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h
new file mode 100644
index 000000000000..53100f353612
--- /dev/null
+++ b/arch/x86/include/asm/vic.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 1999,2001
+ *
+ * Author: J.E.J.Bottomley@HansenPartnership.com
+ *
+ * Standard include definitions for the NCR Voyager Interrupt Controller */
+
+/* The eight CPI vectors. To activate a CPI, you write a bit mask
+ * corresponding to the processor set to be interrupted into the
+ * relevant register. That set of CPUs will then be interrupted with
+ * the CPI */
+static const int VIC_CPI_Registers[] =
+ {0xFC00, 0xFC01, 0xFC08, 0xFC09,
+ 0xFC10, 0xFC11, 0xFC18, 0xFC19 };
+
+#define VIC_PROC_WHO_AM_I 0xfc29
+# define QUAD_IDENTIFIER 0xC0
+# define EIGHT_SLOT_IDENTIFIER 0xE0
+#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72
+#define VIC_CPI_BASE_REGISTER 0xFC41
+#define VIC_PROCESSOR_ID 0xFC21
+# define VIC_CPU_MASQUERADE_ENABLE 0x8
+
+#define VIC_CLAIM_REGISTER_0 0xFC38
+#define VIC_CLAIM_REGISTER_1 0xFC39
+#define VIC_REDIRECT_REGISTER_0 0xFC60
+#define VIC_REDIRECT_REGISTER_1 0xFC61
+#define VIC_PRIORITY_REGISTER 0xFC20
+
+#define VIC_PRIMARY_MC_BASE 0xFC48
+#define VIC_SECONDARY_MC_BASE 0xFC49
+
+#define QIC_PROCESSOR_ID 0xFC71
+# define QIC_CPUID_ENABLE 0x08
+
+#define QIC_VIC_CPI_BASE_REGISTER 0xFC79
+#define QIC_CPI_BASE_REGISTER 0xFC7A
+
+#define QIC_MASK_REGISTER0 0xFC80
+/* NOTE: these are masked high, enabled low */
+# define QIC_PERF_TIMER 0x01
+# define QIC_LPE 0x02
+# define QIC_SYS_INT 0x04
+# define QIC_CMN_INT 0x08
+/* at the moment, just enable CMN_INT, disable SYS_INT */
+# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */))
+#define QIC_MASK_REGISTER1 0xFC81
+# define QIC_BOOT_CPI_MASK 0xFE
+/* Enable CPI's 1-6 inclusive */
+# define QIC_CPI_ENABLE 0x81
+
+#define QIC_INTERRUPT_CLEAR0 0xFC8A
+#define QIC_INTERRUPT_CLEAR1 0xFC8B
+
+/* this is where we place the CPI vectors */
+#define VIC_DEFAULT_CPI_BASE 0xC0
+/* this is where we place the QIC CPI vectors */
+#define QIC_DEFAULT_CPI_BASE 0xD0
+
+#define VIC_BOOT_INTERRUPT_MASK 0xfe
+
+extern void smp_vic_timer_interrupt(void);
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
new file mode 100644
index 000000000000..166adf61e770
--- /dev/null
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -0,0 +1,125 @@
+#ifndef _ASM_X86_VISWS_COBALT_H
+#define _ASM_X86_VISWS_COBALT_H
+
+#include <asm/fixmap.h>
+
+/*
+ * Cobalt SGI Visual Workstation system ASIC
+ */
+
+#define CO_CPU_NUM_PHYS 0x1e00
+#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
+
+#define CO_CPU_MAX 4
+
+#define CO_CPU_PHYS 0xc2000000
+#define CO_APIC_PHYS 0xc4000000
+
+/* see set_fixmap() and asm/fixmap.h */
+#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
+#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
+
+/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
+#define CO_CPU_REV 0x08
+#define CO_CPU_CTRL 0x10
+#define CO_CPU_STAT 0x20
+#define CO_CPU_TIMEVAL 0x30
+
+/* CO_CPU_CTRL bits */
+#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
+#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
+
+/* CO_CPU_STATUS bits */
+#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
+
+/* CO_CPU_TIMEVAL value */
+#define CO_TIME_HZ 100000000 /* Cobalt core rate */
+
+/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
+#define CO_APIC_HI(n) (((n) * 0x10) + 4)
+#define CO_APIC_LO(n) ((n) * 0x10)
+#define CO_APIC_ID 0x0ffc
+
+/* CO_APIC_ID bits */
+#define CO_APIC_ENABLE 0x00000100
+
+/* CO_APIC_LO bits */
+#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
+#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
+
+/*
+ * Where things are physically wired to Cobalt
+ * #defines with no board _<type>_<rev>_ are common to all (thus far)
+ */
+#define CO_APIC_IDE0 4
+#define CO_APIC_IDE1 2 /* Only on 320 */
+
+#define CO_APIC_8259 12 /* serial, floppy, par-l-l */
+
+/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
+#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
+#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
+
+#define CO_APIC_PIIX4_USB 7 /* this one is weird */
+
+/* Lithium PCI Bridge B -- "the one with PIIX4" */
+#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */
+#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */
+
+#define CO_APIC_VIDOUT0 16
+#define CO_APIC_VIDOUT1 17
+#define CO_APIC_VIDIN0 18
+#define CO_APIC_VIDIN1 19
+
+#define CO_APIC_LI_AUDIO 22
+
+#define CO_APIC_AS 24
+#define CO_APIC_RE 25
+
+#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
+#define CO_APIC_NMI 29
+#define CO_APIC_LAST CO_APIC_NMI
+
+/*
+ * This is how irqs are assigned on the Visual Workstation.
+ * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU).
+ * All other devices (including PCI) go to Cobalt and are irq's 16 on up.
+ */
+#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
+#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
+#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
+#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
+#define CO_IRQ_IDE0 14 /* knowledge of... */
+#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
+#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
+
+#ifdef CONFIG_X86_VISWS_APIC
+static inline void co_cpu_write(unsigned long reg, unsigned long v)
+{
+ *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
+}
+
+static inline unsigned long co_cpu_read(unsigned long reg)
+{
+ return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
+}
+
+static inline void co_apic_write(unsigned long reg, unsigned long v)
+{
+ *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
+}
+
+static inline unsigned long co_apic_read(unsigned long reg)
+{
+ return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
+}
+#endif
+
+extern char visws_board_type;
+
+#define VISWS_320 0
+#define VISWS_540 1
+
+extern char visws_board_rev;
+
+#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
new file mode 100644
index 000000000000..a10d89bc1270
--- /dev/null
+++ b/arch/x86/include/asm/visws/lithium.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_X86_VISWS_LITHIUM_H
+#define _ASM_X86_VISWS_LITHIUM_H
+
+#include <asm/fixmap.h>
+
+/*
+ * Lithium is the SGI Visual Workstation I/O ASIC
+ */
+
+#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
+#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
+
+/* see set_fixmap() and asm/fixmap.h */
+#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
+#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
+
+/* Not a standard PCI? (not in linux/pci.h) */
+#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
+#define LI_PCI_INTEN 0x46
+
+/* LI_PCI_INTENT bits */
+#define LI_INTA_0 0x0001
+#define LI_INTA_1 0x0002
+#define LI_INTA_2 0x0004
+#define LI_INTA_3 0x0008
+#define LI_INTA_4 0x0010
+#define LI_INTB 0x0020
+#define LI_INTC 0x0040
+#define LI_INTD 0x0080
+
+/* More special purpose macros... */
+static inline void li_pcia_write16(unsigned long reg, unsigned short v)
+{
+ *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
+}
+
+static inline unsigned short li_pcia_read16(unsigned long reg)
+{
+ return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
+}
+
+static inline void li_pcib_write16(unsigned long reg, unsigned short v)
+{
+ *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
+}
+
+static inline unsigned short li_pcib_read16(unsigned long reg)
+{
+ return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
+}
+
+#endif /* _ASM_X86_VISWS_LITHIUM_H */
+
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
new file mode 100644
index 000000000000..d0af4d338e7f
--- /dev/null
+++ b/arch/x86/include/asm/visws/piix4.h
@@ -0,0 +1,107 @@
+#ifndef _ASM_X86_VISWS_PIIX4_H
+#define _ASM_X86_VISWS_PIIX4_H
+
+/*
+ * PIIX4 as used on SGI Visual Workstations
+ */
+
+#define PIIX_PM_START 0x0F80
+
+#define SIO_GPIO_START 0x0FC0
+
+#define SIO_PM_START 0x0FC8
+
+#define PMBASE PIIX_PM_START
+#define GPIREG0 (PMBASE+0x30)
+#define GPIREG(x) (GPIREG0+((x)/8))
+#define GPIBIT(x) (1 << ((x)%8))
+
+#define PIIX_GPI_BD_ID1 18
+#define PIIX_GPI_BD_ID2 19
+#define PIIX_GPI_BD_ID3 20
+#define PIIX_GPI_BD_ID4 21
+#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
+#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
+ GPIBIT(PIIX_GPI_BD_ID2) | \
+ GPIBIT(PIIX_GPI_BD_ID3) | \
+ GPIBIT(PIIX_GPI_BD_ID4) )
+
+#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
+
+#define SIO_INDEX 0x2e
+#define SIO_DATA 0x2f
+
+#define SIO_DEV_SEL 0x7
+#define SIO_DEV_ENB 0x30
+#define SIO_DEV_MSB 0x60
+#define SIO_DEV_LSB 0x61
+
+#define SIO_GP_DEV 0x7
+
+#define SIO_GP_BASE SIO_GPIO_START
+#define SIO_GP_MSB (SIO_GP_BASE>>8)
+#define SIO_GP_LSB (SIO_GP_BASE&0xff)
+
+#define SIO_GP_DATA1 (SIO_GP_BASE+0)
+
+#define SIO_PM_DEV 0x8
+
+#define SIO_PM_BASE SIO_PM_START
+#define SIO_PM_MSB (SIO_PM_BASE>>8)
+#define SIO_PM_LSB (SIO_PM_BASE&0xff)
+#define SIO_PM_INDEX (SIO_PM_BASE+0)
+#define SIO_PM_DATA (SIO_PM_BASE+1)
+
+#define SIO_PM_FER2 0x1
+
+#define SIO_PM_GP_EN 0x80
+
+
+
+/*
+ * This is the dev/reg where generating a config cycle will
+ * result in a PCI special cycle.
+ */
+#define SPECIAL_DEV 0xff
+#define SPECIAL_REG 0x00
+
+/*
+ * PIIX4 needs to see a special cycle with the following data
+ * to be convinced the processor has gone into the stop grant
+ * state. PIIX4 insists on seeing this before it will power
+ * down a system.
+ */
+#define PIIX_SPECIAL_STOP 0x00120002
+
+#define PIIX4_RESET_PORT 0xcf9
+#define PIIX4_RESET_VAL 0x6
+
+#define PMSTS_PORT 0xf80 // 2 bytes PM Status
+#define PMEN_PORT 0xf82 // 2 bytes PM Enable
+#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
+
+#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
+
+/*
+ * PMSTS and PMEN I/O bit definitions.
+ * (Bits are the same in both registers)
+ */
+#define PM_STS_RSM (1<<15) // Resume Status
+#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
+#define PM_STS_RTC (1<<10) // RTC status
+#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
+#define PM_STS_GBL (1<<5) // Global Status
+#define PM_STS_BM (1<<4) // Bus Master Status
+#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
+
+/*
+ * Stop clock GPI register
+ */
+#define PIIX_GPIREG0 (0xf80 + 0x30)
+
+/*
+ * Stop clock GPI bit in GPIREG0
+ */
+#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
+
+#endif /* _ASM_X86_VISWS_PIIX4_H */
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
new file mode 100644
index 000000000000..5fbf63e1003c
--- /dev/null
+++ b/arch/x86/include/asm/visws/sgivw.h
@@ -0,0 +1,5 @@
+/*
+ * Frame buffer position and size:
+ */
+extern unsigned long sgivwfb_mem_phys;
+extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
new file mode 100644
index 000000000000..f9303602fbc0
--- /dev/null
+++ b/arch/x86/include/asm/vm86.h
@@ -0,0 +1,208 @@
+#ifndef _ASM_X86_VM86_H
+#define _ASM_X86_VM86_H
+
+/*
+ * I'm guessing at the VIF/VIP flag usage, but hope that this is how
+ * the Pentium uses them. Linux will return from vm86 mode when both
+ * VIF and VIP is set.
+ *
+ * On a Pentium, we could probably optimize the virtual flags directly
+ * in the eflags register instead of doing it "by hand" in vflags...
+ *
+ * Linus
+ */
+
+#include <asm/processor-flags.h>
+
+#define BIOSSEG 0x0f000
+
+#define CPU_086 0
+#define CPU_186 1
+#define CPU_286 2
+#define CPU_386 3
+#define CPU_486 4
+#define CPU_586 5
+
+/*
+ * Return values for the 'vm86()' system call
+ */
+#define VM86_TYPE(retval) ((retval) & 0xff)
+#define VM86_ARG(retval) ((retval) >> 8)
+
+#define VM86_SIGNAL 0 /* return due to signal */
+#define VM86_UNKNOWN 1 /* unhandled GP fault
+ - IO-instruction or similar */
+#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
+#define VM86_STI 3 /* sti/popf/iret instruction enabled
+ virtual interrupts */
+
+/*
+ * Additional return values when invoking new vm86()
+ */
+#define VM86_PICRETURN 4 /* return due to pending PIC request */
+#define VM86_TRAP 6 /* return due to DOS-debugger request */
+
+/*
+ * function codes when invoking new vm86()
+ */
+#define VM86_PLUS_INSTALL_CHECK 0
+#define VM86_ENTER 1
+#define VM86_ENTER_NO_BYPASS 2
+#define VM86_REQUEST_IRQ 3
+#define VM86_FREE_IRQ 4
+#define VM86_GET_IRQ_BITS 5
+#define VM86_GET_AND_RESET_IRQ 6
+
+/*
+ * This is the stack-layout seen by the user space program when we have
+ * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
+ * is 'kernel_vm86_regs' (see below).
+ */
+
+struct vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ long __null_ds;
+ long __null_es;
+ long __null_fs;
+ long __null_gs;
+ long orig_eax;
+ long eip;
+ unsigned short cs, __csh;
+ long eflags;
+ long esp;
+ unsigned short ss, __ssh;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct revectored_struct {
+ unsigned long __map[8]; /* 256 bits */
+};
+
+struct vm86_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+};
+
+/*
+ * flags masks
+ */
+#define VM86_SCREEN_BITMAP 0x0001
+
+struct vm86plus_info_struct {
+ unsigned long force_return_for_pic:1;
+ unsigned long vm86dbg_active:1; /* for debugger */
+ unsigned long vm86dbg_TFpendig:1; /* for debugger */
+ unsigned long unused:28;
+ unsigned long is_vm86pus:1; /* for vm86 internal use */
+ unsigned char vm86dbg_intxxtab[32]; /* for debugger */
+};
+struct vm86plus_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+#ifdef __KERNEL__
+
+#include <asm/ptrace.h>
+
+/*
+ * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
+ * mode - the main change is that the old segment descriptors aren't
+ * useful any more and are forced to be zero by the kernel (and the
+ * hardware when a trap occurs), and the real segment descriptors are
+ * at the end of the structure. Look at ptrace.h to see the "normal"
+ * setup. For user space layout see 'struct vm86_regs' above.
+ */
+
+struct kernel_vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ struct pt_regs pt;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct kernel_vm86_struct {
+ struct kernel_vm86_regs regs;
+/*
+ * the below part remains on the kernel stack while we are in VM86 mode.
+ * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
+ * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
+ * 'struct kernel_vm86_regs' with the then actual values.
+ * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
+ * in kernelspace, hence we need not reget the data from userspace.
+ */
+#define VM86_TSS_ESP0 flags
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+ struct pt_regs *regs32; /* here we save the pointer to the old regs */
+/*
+ * The below is not part of the structure, but the stack layout continues
+ * this way. In front of 'return-eip' may be some data, depending on
+ * compilation, so we don't rely on this and save the pointer to 'oldregs'
+ * in 'regs32' above.
+ * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
+
+ long return-eip; from call to vm86()
+ struct pt_regs oldregs; user space registers as saved by syscall
+ */
+};
+
+#ifdef CONFIG_VM86
+
+void handle_vm86_fault(struct kernel_vm86_regs *, long);
+int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
+struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+
+struct task_struct;
+void release_vm86_irqs(struct task_struct *);
+
+#else
+
+#define handle_vm86_fault(a, b)
+#define release_vm86_irqs(a)
+
+static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
+{
+ return 0;
+}
+
+#endif /* CONFIG_VM86 */
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
new file mode 100644
index 000000000000..61e08c0a2907
--- /dev/null
+++ b/arch/x86/include/asm/vmi.h
@@ -0,0 +1,269 @@
+/*
+ * VMI interface definition
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Maintained by: Zachary Amsden zach@vmware.com
+ *
+ */
+#include <linux/types.h>
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * VMI Option ROM API
+ *
+ *---------------------------------------------------------------------
+ */
+#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
+
+#define PCI_VENDOR_ID_VMWARE 0x15AD
+#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
+
+/*
+ * We use two version numbers for compatibility, with the major
+ * number signifying interface breakages, and the minor number
+ * interface extensions.
+ */
+#define VMI_API_REV_MAJOR 3
+#define VMI_API_REV_MINOR 0
+
+#define VMI_CALL_CPUID 0
+#define VMI_CALL_WRMSR 1
+#define VMI_CALL_RDMSR 2
+#define VMI_CALL_SetGDT 3
+#define VMI_CALL_SetLDT 4
+#define VMI_CALL_SetIDT 5
+#define VMI_CALL_SetTR 6
+#define VMI_CALL_GetGDT 7
+#define VMI_CALL_GetLDT 8
+#define VMI_CALL_GetIDT 9
+#define VMI_CALL_GetTR 10
+#define VMI_CALL_WriteGDTEntry 11
+#define VMI_CALL_WriteLDTEntry 12
+#define VMI_CALL_WriteIDTEntry 13
+#define VMI_CALL_UpdateKernelStack 14
+#define VMI_CALL_SetCR0 15
+#define VMI_CALL_SetCR2 16
+#define VMI_CALL_SetCR3 17
+#define VMI_CALL_SetCR4 18
+#define VMI_CALL_GetCR0 19
+#define VMI_CALL_GetCR2 20
+#define VMI_CALL_GetCR3 21
+#define VMI_CALL_GetCR4 22
+#define VMI_CALL_WBINVD 23
+#define VMI_CALL_SetDR 24
+#define VMI_CALL_GetDR 25
+#define VMI_CALL_RDPMC 26
+#define VMI_CALL_RDTSC 27
+#define VMI_CALL_CLTS 28
+#define VMI_CALL_EnableInterrupts 29
+#define VMI_CALL_DisableInterrupts 30
+#define VMI_CALL_GetInterruptMask 31
+#define VMI_CALL_SetInterruptMask 32
+#define VMI_CALL_IRET 33
+#define VMI_CALL_SYSEXIT 34
+#define VMI_CALL_Halt 35
+#define VMI_CALL_Reboot 36
+#define VMI_CALL_Shutdown 37
+#define VMI_CALL_SetPxE 38
+#define VMI_CALL_SetPxELong 39
+#define VMI_CALL_UpdatePxE 40
+#define VMI_CALL_UpdatePxELong 41
+#define VMI_CALL_MachineToPhysical 42
+#define VMI_CALL_PhysicalToMachine 43
+#define VMI_CALL_AllocatePage 44
+#define VMI_CALL_ReleasePage 45
+#define VMI_CALL_InvalPage 46
+#define VMI_CALL_FlushTLB 47
+#define VMI_CALL_SetLinearMapping 48
+
+#define VMI_CALL_SetIOPLMask 61
+#define VMI_CALL_SetInitialAPState 62
+#define VMI_CALL_APICWrite 63
+#define VMI_CALL_APICRead 64
+#define VMI_CALL_IODelay 65
+#define VMI_CALL_SetLazyMode 73
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * MMU operation flags
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* Flags used by VMI_{Allocate|Release}Page call */
+#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
+#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
+#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
+
+
+/* Flags shared by Allocate|Release Page and PTE updates */
+#define VMI_PAGE_PT 0x01
+#define VMI_PAGE_PD 0x02
+#define VMI_PAGE_PDP 0x04
+#define VMI_PAGE_PML4 0x08
+
+#define VMI_PAGE_NORMAL 0x00 /* for debugging */
+
+/* Flags used by PTE updates */
+#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
+#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
+#define VMI_PAGE_VA_MASK 0xfffff000
+
+#ifdef CONFIG_X86_PAE
+#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#else
+#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
+#endif
+
+/* Flags used by VMI_FlushTLB call */
+#define VMI_FLUSH_TLB 0x01
+#define VMI_FLUSH_GLOBAL 0x02
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * VMI relocation definitions for ROM call get_reloc
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* VMI Relocation types */
+#define VMI_RELOCATION_NONE 0
+#define VMI_RELOCATION_CALL_REL 1
+#define VMI_RELOCATION_JUMP_REL 2
+#define VMI_RELOCATION_NOP 3
+
+#ifndef __ASSEMBLY__
+struct vmi_relocation_info {
+ unsigned char *eip;
+ unsigned char type;
+ unsigned char reserved[3];
+};
+#endif
+
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * Generic ROM structures and definitions
+ *
+ *---------------------------------------------------------------------
+ */
+
+#ifndef __ASSEMBLY__
+
+struct vrom_header {
+ u16 rom_signature; /* option ROM signature */
+ u8 rom_length; /* ROM length in 512 byte chunks */
+ u8 rom_entry[4]; /* 16-bit code entry point */
+ u8 rom_pad0; /* 4-byte align pad */
+ u32 vrom_signature; /* VROM identification signature */
+ u8 api_version_min;/* Minor version of API */
+ u8 api_version_maj;/* Major version of API */
+ u8 jump_slots; /* Number of jump slots */
+ u8 reserved1; /* Reserved for expansion */
+ u32 virtual_top; /* Hypervisor virtual address start */
+ u16 reserved2; /* Reserved for expansion */
+ u16 license_offs; /* Offset to License string */
+ u16 pci_header_offs;/* Offset to PCI OPROM header */
+ u16 pnp_header_offs;/* Offset to PnP OPROM header */
+ u32 rom_pad3; /* PnP reserverd / VMI reserved */
+ u8 reserved[96]; /* Reserved for headers */
+ char vmi_init[8]; /* VMI_Init jump point */
+ char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
+} __attribute__((packed));
+
+struct pnp_header {
+ char sig[4];
+ char rev;
+ char size;
+ short next;
+ short res;
+ long devID;
+ unsigned short manufacturer_offset;
+ unsigned short product_offset;
+} __attribute__((packed));
+
+struct pci_header {
+ char sig[4];
+ short vendorID;
+ short deviceID;
+ short vpdData;
+ short size;
+ char rev;
+ char class;
+ char subclass;
+ char interface;
+ short chunks;
+ char rom_version_min;
+ char rom_version_maj;
+ char codetype;
+ char lastRom;
+ short reserved;
+} __attribute__((packed));
+
+/* Function prototypes for bootstrapping */
+#ifdef CONFIG_VMI
+extern void vmi_init(void);
+extern void vmi_activate(void);
+extern void vmi_bringup(void);
+#else
+static inline void vmi_init(void) {}
+static inline void vmi_activate(void) {}
+static inline void vmi_bringup(void) {}
+#endif
+
+/* State needed to start an application processor in an SMP system. */
+struct vmi_ap_state {
+ u32 cr0;
+ u32 cr2;
+ u32 cr3;
+ u32 cr4;
+
+ u64 efer;
+
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 es;
+ u16 fs;
+ u16 gs;
+ u16 ldtr;
+
+ u16 gdtr_limit;
+ u32 gdtr_base;
+ u32 idtr_base;
+ u16 idtr_limit;
+};
+
+#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
new file mode 100644
index 000000000000..c6e0bee93e3c
--- /dev/null
+++ b/arch/x86/include/asm/vmi_time.h
@@ -0,0 +1,98 @@
+/*
+ * VMI Time wrappers
+ *
+ * Copyright (C) 2006, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+#ifndef _ASM_X86_VMI_TIME_H
+#define _ASM_X86_VMI_TIME_H
+
+/*
+ * Raw VMI call indices for timer functions
+ */
+#define VMI_CALL_GetCycleFrequency 66
+#define VMI_CALL_GetCycleCounter 67
+#define VMI_CALL_SetAlarm 68
+#define VMI_CALL_CancelAlarm 69
+#define VMI_CALL_GetWallclockTime 70
+#define VMI_CALL_WallclockUpdated 71
+
+/* Cached VMI timer operations */
+extern struct vmi_timer_ops {
+ u64 (*get_cycle_frequency)(void);
+ u64 (*get_cycle_counter)(int);
+ u64 (*get_wallclock)(void);
+ int (*wallclock_updated)(void);
+ void (*set_alarm)(u32 flags, u64 expiry, u64 period);
+ void (*cancel_alarm)(u32 flags);
+} vmi_timer_ops;
+
+/* Prototypes */
+extern void __init vmi_time_init(void);
+extern unsigned long vmi_get_wallclock(void);
+extern int vmi_set_wallclock(unsigned long now);
+extern unsigned long long vmi_sched_clock(void);
+extern unsigned long vmi_tsc_khz(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void __devinit vmi_time_bsp_init(void);
+extern void __devinit vmi_time_ap_init(void);
+#endif
+
+/*
+ * When run under a hypervisor, a vcpu is always in one of three states:
+ * running, halted, or ready. The vcpu is in the 'running' state if it
+ * is executing. When the vcpu executes the halt interface, the vcpu
+ * enters the 'halted' state and remains halted until there is some work
+ * pending for the vcpu (e.g. an alarm expires, host I/O completes on
+ * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
+ * state (waiting for the hypervisor to reschedule it). Finally, at any
+ * time when the vcpu is not in the 'running' state nor the 'halted'
+ * state, it is in the 'ready' state.
+ *
+ * Real time is advances while the vcpu is 'running', 'ready', or
+ * 'halted'. Stolen time is the time in which the vcpu is in the
+ * 'ready' state. Available time is the remaining time -- the vcpu is
+ * either 'running' or 'halted'.
+ *
+ * All three views of time are accessible through the VMI cycle
+ * counters.
+ */
+
+/* The cycle counters. */
+#define VMI_CYCLES_REAL 0
+#define VMI_CYCLES_AVAILABLE 1
+#define VMI_CYCLES_STOLEN 2
+
+/* The alarm interface 'flags' bits */
+#define VMI_ALARM_COUNTERS 2
+
+#define VMI_ALARM_COUNTER_MASK 0x000000ff
+
+#define VMI_ALARM_WIRED_IRQ0 0x00000000
+#define VMI_ALARM_WIRED_LVTT 0x00010000
+
+#define VMI_ALARM_IS_ONESHOT 0x00000000
+#define VMI_ALARM_IS_PERIODIC 0x00000100
+
+#define CONFIG_VMI_ALARM_HZ 100
+
+#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
new file mode 100644
index 000000000000..c11b7e100d83
--- /dev/null
+++ b/arch/x86/include/asm/vmware.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2008, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef ASM_X86__VMWARE_H
+#define ASM_X86__VMWARE_H
+
+extern unsigned long vmware_get_tsc_khz(void);
+extern int vmware_platform(void);
+extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
new file mode 100644
index 000000000000..b3e647307625
--- /dev/null
+++ b/arch/x86/include/asm/voyager.h
@@ -0,0 +1,529 @@
+/* Copyright (C) 1999,2001
+ *
+ * Author: J.E.J.Bottomley@HansenPartnership.com
+ *
+ * Standard include definitions for the NCR Voyager system */
+
+#undef VOYAGER_DEBUG
+#undef VOYAGER_CAT_DEBUG
+
+#ifdef VOYAGER_DEBUG
+#define VDEBUG(x) printk x
+#else
+#define VDEBUG(x)
+#endif
+
+/* There are three levels of voyager machine: 3,4 and 5. The rule is
+ * if it's less than 3435 it's a Level 3 except for a 3360 which is
+ * a level 4. A 3435 or above is a Level 5 */
+#define VOYAGER_LEVEL5_AND_ABOVE 0x3435
+#define VOYAGER_LEVEL4 0x3360
+
+/* The L4 DINO ASIC */
+#define VOYAGER_DINO 0x43
+
+/* voyager ports in standard I/O space */
+#define VOYAGER_MC_SETUP 0x96
+
+
+#define VOYAGER_CAT_CONFIG_PORT 0x97
+# define VOYAGER_CAT_DESELECT 0xff
+#define VOYAGER_SSPB_RELOCATION_PORT 0x98
+
+/* Valid CAT controller commands */
+/* start instruction register cycle */
+#define VOYAGER_CAT_IRCYC 0x01
+/* start data register cycle */
+#define VOYAGER_CAT_DRCYC 0x02
+/* move to execute state */
+#define VOYAGER_CAT_RUN 0x0F
+/* end operation */
+#define VOYAGER_CAT_END 0x80
+/* hold in idle state */
+#define VOYAGER_CAT_HOLD 0x90
+/* single step an "intest" vector */
+#define VOYAGER_CAT_STEP 0xE0
+/* return cat controller to CLEMSON mode */
+#define VOYAGER_CAT_CLEMSON 0xFF
+
+/* the default cat command header */
+#define VOYAGER_CAT_HEADER 0x7F
+
+/* the range of possible CAT module ids in the system */
+#define VOYAGER_MIN_MODULE 0x10
+#define VOYAGER_MAX_MODULE 0x1f
+
+/* The voyager registers per asic */
+#define VOYAGER_ASIC_ID_REG 0x00
+#define VOYAGER_ASIC_TYPE_REG 0x01
+/* the sub address registers can be made auto incrementing on reads */
+#define VOYAGER_AUTO_INC_REG 0x02
+# define VOYAGER_AUTO_INC 0x04
+# define VOYAGER_NO_AUTO_INC 0xfb
+#define VOYAGER_SUBADDRDATA 0x03
+#define VOYAGER_SCANPATH 0x05
+# define VOYAGER_CONNECT_ASIC 0x01
+# define VOYAGER_DISCONNECT_ASIC 0xfe
+#define VOYAGER_SUBADDRLO 0x06
+#define VOYAGER_SUBADDRHI 0x07
+#define VOYAGER_SUBMODSELECT 0x08
+#define VOYAGER_SUBMODPRESENT 0x09
+
+#define VOYAGER_SUBADDR_LO 0xff
+#define VOYAGER_SUBADDR_HI 0xffff
+
+/* the maximum size of a scan path -- used to form instructions */
+#define VOYAGER_MAX_SCAN_PATH 0x100
+/* the biggest possible register size (in bytes) */
+#define VOYAGER_MAX_REG_SIZE 4
+
+/* Total number of possible modules (including submodules) */
+#define VOYAGER_MAX_MODULES 16
+/* Largest number of asics per module */
+#define VOYAGER_MAX_ASICS_PER_MODULE 7
+
+/* the CAT asic of each module is always the first one */
+#define VOYAGER_CAT_ID 0
+#define VOYAGER_PSI 0x1a
+
+/* voyager instruction operations and registers */
+#define VOYAGER_READ_CONFIG 0x1
+#define VOYAGER_WRITE_CONFIG 0x2
+#define VOYAGER_BYPASS 0xff
+
+typedef struct voyager_asic {
+ __u8 asic_addr; /* ASIC address; Level 4 */
+ __u8 asic_type; /* ASIC type */
+ __u8 asic_id; /* ASIC id */
+ __u8 jtag_id[4]; /* JTAG id */
+ __u8 asic_location; /* Location within scan path; start w/ 0 */
+ __u8 bit_location; /* Location within bit stream; start w/ 0 */
+ __u8 ireg_length; /* Instruction register length */
+ __u16 subaddr; /* Amount of sub address space */
+ struct voyager_asic *next; /* Next asic in linked list */
+} voyager_asic_t;
+
+typedef struct voyager_module {
+ __u8 module_addr; /* Module address */
+ __u8 scan_path_connected; /* Scan path connected */
+ __u16 ee_size; /* Size of the EEPROM */
+ __u16 num_asics; /* Number of Asics */
+ __u16 inst_bits; /* Instruction bits in the scan path */
+ __u16 largest_reg; /* Largest register in the scan path */
+ __u16 smallest_reg; /* Smallest register in the scan path */
+ voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */
+ struct voyager_module *submodule; /* Submodule pointer */
+ struct voyager_module *next; /* Next module in linked list */
+} voyager_module_t;
+
+typedef struct voyager_eeprom_hdr {
+ __u8 module_id[4];
+ __u8 version_id;
+ __u8 config_id;
+ __u16 boundry_id; /* boundary scan id */
+ __u16 ee_size; /* size of EEPROM */
+ __u8 assembly[11]; /* assembly # */
+ __u8 assembly_rev; /* assembly rev */
+ __u8 tracer[4]; /* tracer number */
+ __u16 assembly_cksum; /* asm checksum */
+ __u16 power_consump; /* pwr requirements */
+ __u16 num_asics; /* number of asics */
+ __u16 bist_time; /* min. bist time */
+ __u16 err_log_offset; /* error log offset */
+ __u16 scan_path_offset;/* scan path offset */
+ __u16 cct_offset;
+ __u16 log_length; /* length of err log */
+ __u16 xsum_end; /* offset to end of
+ checksum */
+ __u8 reserved[4];
+ __u8 sflag; /* starting sentinal */
+ __u8 part_number[13]; /* prom part number */
+ __u8 version[10]; /* version number */
+ __u8 signature[8];
+ __u16 eeprom_chksum;
+ __u32 data_stamp_offset;
+ __u8 eflag ; /* ending sentinal */
+} __attribute__((packed)) voyager_eprom_hdr_t;
+
+
+
+#define VOYAGER_EPROM_SIZE_OFFSET \
+ ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size)))
+#define VOYAGER_XSUM_END_OFFSET 0x2a
+
+/* the following three definitions are for internal table layouts
+ * in the module EPROMs. We really only care about the IDs and
+ * offsets */
+typedef struct voyager_sp_table {
+ __u8 asic_id;
+ __u8 bypass_flag;
+ __u16 asic_data_offset;
+ __u16 config_data_offset;
+} __attribute__((packed)) voyager_sp_table_t;
+
+typedef struct voyager_jtag_table {
+ __u8 icode[4];
+ __u8 runbist[4];
+ __u8 intest[4];
+ __u8 samp_preld[4];
+ __u8 ireg_len;
+} __attribute__((packed)) voyager_jtt_t;
+
+typedef struct voyager_asic_data_table {
+ __u8 jtag_id[4];
+ __u16 length_bsr;
+ __u16 length_bist_reg;
+ __u32 bist_clk;
+ __u16 subaddr_bits;
+ __u16 seed_bits;
+ __u16 sig_bits;
+ __u16 jtag_offset;
+} __attribute__((packed)) voyager_at_t;
+
+/* Voyager Interrupt Controller (VIC) registers */
+
+/* Base to add to Cross Processor Interrupts (CPIs) when triggering
+ * the CPU IRQ line */
+/* register defines for the WCBICs (one per processor) */
+#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */
+#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */
+#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */
+#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */
+#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */
+#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */
+#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */
+#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */
+
+
+/* top of memory registers */
+#define VOYAGER_WCBIC_TOM_L 0x4
+#define VOYAGER_WCBIC_TOM_H 0x5
+
+/* register defines for Voyager Memory Contol (VMC)
+ * these are present on L4 machines only */
+#define VOYAGER_VMC1 0x81
+#define VOYAGER_VMC2 0x91
+#define VOYAGER_VMC3 0xa1
+#define VOYAGER_VMC4 0xb1
+
+/* VMC Ports */
+#define VOYAGER_VMC_MEMORY_SETUP 0x9
+# define VMC_Interleaving 0x01
+# define VMC_4Way 0x02
+# define VMC_EvenCacheLines 0x04
+# define VMC_HighLine 0x08
+# define VMC_Start0_Enable 0x20
+# define VMC_Start1_Enable 0x40
+# define VMC_Vremap 0x80
+#define VOYAGER_VMC_BANK_DENSITY 0xa
+# define VMC_BANK_EMPTY 0
+# define VMC_BANK_4MB 1
+# define VMC_BANK_16MB 2
+# define VMC_BANK_64MB 3
+# define VMC_BANK0_MASK 0x03
+# define VMC_BANK1_MASK 0x0C
+# define VMC_BANK2_MASK 0x30
+# define VMC_BANK3_MASK 0xC0
+
+/* Magellan Memory Controller (MMC) defines - present on L5 */
+#define VOYAGER_MMC_ASIC_ID 1
+/* the two memory modules corresponding to memory cards in the system */
+#define VOYAGER_MMC_MEMORY0_MODULE 0x14
+#define VOYAGER_MMC_MEMORY1_MODULE 0x15
+/* the Magellan Memory Address (MMA) defines */
+#define VOYAGER_MMA_ASIC_ID 2
+
+/* Submodule number for the Quad Baseboard */
+#define VOYAGER_QUAD_BASEBOARD 1
+
+/* ASIC defines for the Quad Baseboard */
+#define VOYAGER_QUAD_QDATA0 1
+#define VOYAGER_QUAD_QDATA1 2
+#define VOYAGER_QUAD_QABC 3
+
+/* Useful areas in extended CMOS */
+#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a
+#define VOYAGER_MEMORY_CLICKMAP 0xa23
+#define VOYAGER_DUMP_LOCATION 0xb1a
+
+/* SUS In Control bit - used to tell SUS that we don't need to be
+ * babysat anymore */
+#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff
+# define VOYAGER_IN_CONTROL_FLAG 0x80
+
+/* Voyager PSI defines */
+#define VOYAGER_PSI_STATUS_REG 0x08
+# define PSI_DC_FAIL 0x01
+# define PSI_MON 0x02
+# define PSI_FAULT 0x04
+# define PSI_ALARM 0x08
+# define PSI_CURRENT 0x10
+# define PSI_DVM 0x20
+# define PSI_PSCFAULT 0x40
+# define PSI_STAT_CHG 0x80
+
+#define VOYAGER_PSI_SUPPLY_REG 0x8000
+ /* read */
+# define PSI_FAIL_DC 0x01
+# define PSI_FAIL_AC 0x02
+# define PSI_MON_INT 0x04
+# define PSI_SWITCH_OFF 0x08
+# define PSI_HX_OFF 0x10
+# define PSI_SECURITY 0x20
+# define PSI_CMOS_BATT_LOW 0x40
+# define PSI_CMOS_BATT_FAIL 0x80
+ /* write */
+# define PSI_CLR_SWITCH_OFF 0x13
+# define PSI_CLR_HX_OFF 0x14
+# define PSI_CLR_CMOS_BATT_FAIL 0x17
+
+#define VOYAGER_PSI_MASK 0x8001
+# define PSI_MASK_MASK 0x10
+
+#define VOYAGER_PSI_AC_FAIL_REG 0x8004
+#define AC_FAIL_STAT_CHANGE 0x80
+
+#define VOYAGER_PSI_GENERAL_REG 0x8007
+ /* read */
+# define PSI_SWITCH_ON 0x01
+# define PSI_SWITCH_ENABLED 0x02
+# define PSI_ALARM_ENABLED 0x08
+# define PSI_SECURE_ENABLED 0x10
+# define PSI_COLD_RESET 0x20
+# define PSI_COLD_START 0x80
+ /* write */
+# define PSI_POWER_DOWN 0x10
+# define PSI_SWITCH_DISABLE 0x01
+# define PSI_SWITCH_ENABLE 0x11
+# define PSI_CLEAR 0x12
+# define PSI_ALARM_DISABLE 0x03
+# define PSI_ALARM_ENABLE 0x13
+# define PSI_CLEAR_COLD_RESET 0x05
+# define PSI_SET_COLD_RESET 0x15
+# define PSI_CLEAR_COLD_START 0x07
+# define PSI_SET_COLD_START 0x17
+
+
+
+struct voyager_bios_info {
+ __u8 len;
+ __u8 major;
+ __u8 minor;
+ __u8 debug;
+ __u8 num_classes;
+ __u8 class_1;
+ __u8 class_2;
+};
+
+/* The following structures and definitions are for the Kernel/SUS
+ * interface these are needed to find out how SUS initialised any Quad
+ * boards in the system */
+
+#define NUMBER_OF_MC_BUSSES 2
+#define SLOTS_PER_MC_BUS 8
+#define MAX_CPUS 16 /* 16 way CPU system */
+#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */
+#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */
+#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */
+#define NUMBER_OF_POS_REGS 8
+
+typedef struct {
+ __u8 MC_Slot;
+ __u8 POS_Values[NUMBER_OF_POS_REGS];
+} __attribute__((packed)) MC_SlotInformation_t;
+
+struct QuadDescription {
+ __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields
+ * will be zero except for slot */
+ __u8 StructureVersion;
+ __u32 CPI_BaseAddress;
+ __u32 LARC_BankSize;
+ __u32 LocalMemoryStateBits;
+ __u8 Slot; /* Processor slots 1 - 4 */
+} __attribute__((packed));
+
+struct ProcBoardInfo {
+ __u8 Type;
+ __u8 StructureVersion;
+ __u8 NumberOfBoards;
+ struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS];
+} __attribute__((packed));
+
+struct CacheDescription {
+ __u8 Level;
+ __u32 TotalSize;
+ __u16 LineSize;
+ __u8 Associativity;
+ __u8 CacheType;
+ __u8 WriteType;
+ __u8 Number_CPUs_SharedBy;
+ __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS];
+
+} __attribute__((packed));
+
+struct CPU_Description {
+ __u8 CPU_HardwareId;
+ char *FRU_String;
+ __u8 NumberOfCacheLevels;
+ struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS];
+} __attribute__((packed));
+
+struct CPU_Info {
+ __u8 Type;
+ __u8 StructureVersion;
+ __u8 NumberOf_CPUs;
+ struct CPU_Description CPU_Data[MAX_CPUS];
+} __attribute__((packed));
+
+
+/*
+ * This structure will be used by SUS and the OS.
+ * The assumption about this structure is that no blank space is
+ * packed in it by our friend the compiler.
+ */
+typedef struct {
+ __u8 Mailbox_SUS; /* Written to by SUS to give
+ commands/response to the OS */
+ __u8 Mailbox_OS; /* Written to by the OS to give
+ commands/response to SUS */
+ __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the
+ interface SUS supports */
+ __u8 OS_MailboxVersion; /* Tells SUS which iteration of the
+ interface the OS supports */
+ __u32 OS_Flags; /* Flags set by the OS as info for
+ SUS */
+ __u32 SUS_Flags; /* Flags set by SUS as info
+ for the OS */
+ __u32 WatchDogPeriod; /* Watchdog period (in seconds) which
+ the DP uses to see if the OS
+ is dead */
+ __u32 WatchDogCount; /* Updated by the OS on every tic. */
+ __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS
+ where to stuff the SUS error log
+ on a dump */
+ MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS];
+ /* Storage for MCA POS data */
+ /* All new SECOND_PASS_INTERFACE fields added from this point */
+ struct ProcBoardInfo *BoardData;
+ struct CPU_Info *CPU_Data;
+ /* All new fields must be added from this point */
+} Voyager_KernelSUS_Mbox_t;
+
+/* structure for finding the right memory address to send a QIC CPI to */
+struct voyager_qic_cpi {
+ /* Each cache line (32 bytes) can trigger a cpi. The cpi
+ * read/write may occur anywhere in the cache line---pick the
+ * middle to be safe */
+ struct {
+ __u32 pad1[3];
+ __u32 cpi;
+ __u32 pad2[4];
+ } qic_cpi[8];
+};
+
+struct voyager_status {
+ __u32 power_fail:1;
+ __u32 switch_off:1;
+ __u32 request_from_kernel:1;
+};
+
+struct voyager_psi_regs {
+ __u8 cat_id;
+ __u8 cat_dev;
+ __u8 cat_control;
+ __u8 subaddr;
+ __u8 dummy4;
+ __u8 checkbit;
+ __u8 subaddr_low;
+ __u8 subaddr_high;
+ __u8 intstatus;
+ __u8 stat1;
+ __u8 stat3;
+ __u8 fault;
+ __u8 tms;
+ __u8 gen;
+ __u8 sysconf;
+ __u8 dummy15;
+};
+
+struct voyager_psi_subregs {
+ __u8 supply;
+ __u8 mask;
+ __u8 present;
+ __u8 DCfail;
+ __u8 ACfail;
+ __u8 fail;
+ __u8 UPSfail;
+ __u8 genstatus;
+};
+
+struct voyager_psi {
+ struct voyager_psi_regs regs;
+ struct voyager_psi_subregs subregs;
+};
+
+struct voyager_SUS {
+#define VOYAGER_DUMP_BUTTON_NMI 0x1
+#define VOYAGER_SUS_VALID 0x2
+#define VOYAGER_SYSINT_COMPLETE 0x3
+ __u8 SUS_mbox;
+#define VOYAGER_NO_COMMAND 0x0
+#define VOYAGER_IGNORE_DUMP 0x1
+#define VOYAGER_DO_DUMP 0x2
+#define VOYAGER_SYSINT_HANDSHAKE 0x3
+#define VOYAGER_DO_MEM_DUMP 0x4
+#define VOYAGER_SYSINT_WAS_RECOVERED 0x5
+ __u8 kernel_mbox;
+#define VOYAGER_MAILBOX_VERSION 0x10
+ __u8 SUS_version;
+ __u8 kernel_version;
+#define VOYAGER_OS_HAS_SYSINT 0x1
+#define VOYAGER_OS_IN_PROGRESS 0x2
+#define VOYAGER_UPDATING_WDPERIOD 0x4
+ __u32 kernel_flags;
+#define VOYAGER_SUS_BOOTING 0x1
+#define VOYAGER_SUS_IN_PROGRESS 0x2
+ __u32 SUS_flags;
+ __u32 watchdog_period;
+ __u32 watchdog_count;
+ __u32 SUS_errorlog;
+ /* lots of system configuration stuff under here */
+};
+
+/* Variables exported by voyager_smp */
+extern __u32 voyager_extended_vic_processors;
+extern __u32 voyager_allowed_boot_processors;
+extern __u32 voyager_quad_processors;
+extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS];
+extern struct voyager_SUS *voyager_SUS;
+
+/* variables exported always */
+extern struct task_struct *voyager_thread;
+extern int voyager_level;
+extern struct voyager_status voyager_status;
+
+/* functions exported by the voyager and voyager_smp modules */
+extern int voyager_cat_readb(__u8 module, __u8 asic, int reg);
+extern void voyager_cat_init(void);
+extern void voyager_detect(struct voyager_bios_info *);
+extern void voyager_trap_init(void);
+extern void voyager_setup_irqs(void);
+extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length);
+extern void voyager_smp_intr_init(void);
+extern __u8 voyager_extended_cmos_read(__u16 cmos_address);
+extern void voyager_smp_dump(void);
+extern void voyager_timer_interrupt(void);
+extern void smp_local_timer_interrupt(void);
+extern void voyager_power_off(void);
+extern void smp_voyager_power_off(void *dummy);
+extern void voyager_restart(void);
+extern void voyager_cat_power_off(void);
+extern void voyager_cat_do_common_interrupt(void);
+extern void voyager_handle_nmi(void);
+extern void voyager_smp_intr_init(void);
+/* Commands for the following are */
+#define VOYAGER_PSI_READ 0
+#define VOYAGER_PSI_WRITE 1
+#define VOYAGER_PSI_SUBREAD 2
+#define VOYAGER_PSI_SUBWRITE 3
+extern void voyager_cat_psi(__u8, __u16, __u8 *);
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
new file mode 100644
index 000000000000..d0983d255fbd
--- /dev/null
+++ b/arch/x86/include/asm/vsyscall.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_VSYSCALL_H
+#define _ASM_X86_VSYSCALL_H
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+ __NR_vgetcpu,
+};
+
+#define VSYSCALL_START (-10UL << 20)
+#define VSYSCALL_SIZE 1024
+#define VSYSCALL_END (-2UL << 20)
+#define VSYSCALL_MAPPED_PAGES 1
+#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
+
+#ifdef __KERNEL__
+#include <linux/seqlock.h>
+
+#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
+#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
+
+/* Definitions for CONFIG_GENERIC_TIME definitions */
+#define __section_vsyscall_gtod_data __attribute__ \
+ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
+#define __section_vsyscall_clock __attribute__ \
+ ((unused, __section__ (".vsyscall_clock"),aligned(16)))
+#define __vsyscall_fn \
+ __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
+
+#define VGETCPU_RDTSCP 1
+#define VGETCPU_LSL 2
+
+extern int __vgetcpu_mode;
+extern volatile unsigned long __jiffies;
+
+/* kernel space (writeable) */
+extern int vgetcpu_mode;
+extern struct timezone sys_tz;
+
+extern void map_vsyscall(void);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
new file mode 100644
index 000000000000..f2cba4e79a23
--- /dev/null
+++ b/arch/x86/include/asm/xcr.h
@@ -0,0 +1,49 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * asm-x86/xcr.h
+ *
+ * Definitions for the eXtended Control Register instructions
+ */
+
+#ifndef _ASM_X86_XCR_H
+#define _ASM_X86_XCR_H
+
+#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+
+#ifdef __KERNEL__
+# ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+static inline u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+
+ asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+ : "=a" (eax), "=d" (edx)
+ : "c" (index));
+ return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+ u32 eax = value;
+ u32 edx = value >> 32;
+
+ asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+ : : "a" (eax), "d" (edx), "c" (index));
+}
+
+# endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
new file mode 100644
index 000000000000..19144184983a
--- /dev/null
+++ b/arch/x86/include/asm/xen/events.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_XEN_EVENTS_H
+#define _ASM_X86_XEN_EVENTS_H
+
+enum ipi_vector {
+ XEN_RESCHEDULE_VECTOR,
+ XEN_CALL_FUNCTION_VECTOR,
+ XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ XEN_SPIN_UNLOCK_VECTOR,
+
+ XEN_NR_IPIS,
+};
+
+static inline int xen_irqs_disabled(struct pt_regs *regs)
+{
+ return raw_irqs_disabled_flags(regs->flags);
+}
+
+static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
+{
+ regs->orig_ax = ~irq;
+ do_IRQ(regs);
+}
+
+#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h
new file mode 100644
index 000000000000..fdbbb45767a6
--- /dev/null
+++ b/arch/x86/include/asm/xen/grant_table.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_XEN_GRANT_TABLE_H
+#define _ASM_X86_XEN_GRANT_TABLE_H
+
+#define xen_alloc_vm_area(size) alloc_vm_area(size)
+#define xen_free_vm_area(area) free_vm_area(area)
+
+#endif /* _ASM_X86_XEN_GRANT_TABLE_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
new file mode 100644
index 000000000000..5e79ca694326
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -0,0 +1,533 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERCALL_H
+#define _ASM_X86_XEN_HYPERCALL_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/physdev.h>
+
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ * The two architectures put their arguments in different sets of
+ * registers.
+ *
+ * - Work around asm syntax quirks
+ * It isn't possible to specify one of the rNN registers in a
+ * constraint, so we use explicit register variables to get the
+ * args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ * Even unused parameters can be clobbered by the hypervisor, so we
+ * need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ * This is the tricky part. Because x86_32 has such a constrained
+ * register set, gcc versions below 4.3 have trouble generating
+ * code when all the arg registers and memory are trashed by the
+ * asm. There are syntactically simpler ways of achieving the
+ * semantics below, but they cause the compiler to crash.
+ *
+ * The only combination I found which works is:
+ * - assign the __argX variables first
+ * - list all actually used parameters as "+r" (__argX)
+ * - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as as macro language. Sorry. (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
+extern struct { char _entry[32]; } hypercall_page[];
+
+#define __HYPERCALL "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x) \
+ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG "eax"
+#define __HYPERCALL_ARG1REG "ebx"
+#define __HYPERCALL_ARG2REG "ecx"
+#define __HYPERCALL_ARG3REG "edx"
+#define __HYPERCALL_ARG4REG "esi"
+#define __HYPERCALL_ARG5REG "edi"
+#else
+#define __HYPERCALL_RETREG "rax"
+#define __HYPERCALL_ARG1REG "rdi"
+#define __HYPERCALL_ARG2REG "rsi"
+#define __HYPERCALL_ARG3REG "rdx"
+#define __HYPERCALL_ARG4REG "r10"
+#define __HYPERCALL_ARG5REG "r8"
+#endif
+
+#define __HYPERCALL_DECLS \
+ register unsigned long __res asm(__HYPERCALL_RETREG); \
+ register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1) \
+ __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2) \
+ __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3) \
+ __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
+ __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
+ __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5 "memory"
+#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
+#define _hypercall0(type, name) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_0ARG(); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_0PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER0); \
+ (type)__res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_1ARG(a1); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_1PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER1); \
+ (type)__res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_2ARG(a1, a2); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_2PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER2); \
+ (type)__res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_3ARG(a1, a2, a3); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_3PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER3); \
+ (type)__res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_4ARG(a1, a2, a3, a4); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_4PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER4); \
+ (type)__res; \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_5PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER5); \
+ (type)__res; \
+})
+
+static inline int
+HYPERVISOR_set_trap_table(struct trap_info *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int
+HYPERVISOR_mmu_update(struct mmu_update *req, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int
+HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
+{
+ return _hypercall2(int, stack_switch, ss, esp);
+}
+
+#ifdef CONFIG_X86_32
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_selector,
+ unsigned long event_address,
+ unsigned long failsafe_selector,
+ unsigned long failsafe_address)
+{
+ return _hypercall4(int, set_callbacks,
+ event_selector, event_address,
+ failsafe_selector, failsafe_address);
+}
+#else /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+ unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address,
+ syscall_address);
+}
+#endif /* CONFIG_X86_{32,64} */
+
+static inline int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(int set)
+{
+ return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int
+HYPERVISOR_sched_op(int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op_new, cmd, arg);
+}
+
+static inline long
+HYPERVISOR_set_timer_op(u64 timeout)
+{
+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
+ unsigned long timeout_lo = (unsigned long)timeout;
+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
+}
+
+static inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+{
+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+}
+
+static inline int
+HYPERVISOR_memory_op(unsigned int cmd, void *arg)
+{
+ return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_multicall(void *call_list, int nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
+{
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall3(int, update_va_mapping, va,
+ new_val.pte, flags);
+ else
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte, new_val.pte >> 32, flags);
+}
+
+static inline int
+HYPERVISOR_event_channel_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS)) {
+ struct evtchn_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, event_channel_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+ return rc;
+}
+
+static inline int
+HYPERVISOR_xen_version(int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int
+HYPERVISOR_physdev_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS)) {
+ struct physdev_op op;
+ op.cmd = cmd;
+ memcpy(&op.u, arg, sizeof(op.u));
+ rc = _hypercall1(int, physdev_op_compat, &op);
+ memcpy(arg, &op.u, sizeof(op.u));
+ }
+ return rc;
+}
+
+static inline int
+HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
+ unsigned long flags, domid_t domid)
+{
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+ else
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte, new_val.pte >> 32,
+ flags, domid);
+}
+
+static inline int
+HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int
+HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
+static inline int
+HYPERVISOR_suspend(unsigned long srec)
+{
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown,
+ SHUTDOWN_suspend, srec);
+}
+
+static inline int
+HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
+{
+ return _hypercall2(int, nmi_op, op, arg);
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ }
+}
+
+static inline void
+MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
+ void *uop, unsigned int count)
+{
+ mcl->op = __HYPERVISOR_grant_table_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)uop;
+ mcl->args[2] = count;
+}
+
+static inline void
+MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags,
+ domid_t domid)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ mcl->args[3] = domid;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ mcl->args[4] = domid;
+ }
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ if (sizeof(maddr) == sizeof(long)) {
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+ } else {
+ mcl->args[0] = maddr;
+ mcl->args[1] = maddr >> 32;
+ mcl->args[2] = desc.a;
+ mcl->args[3] = desc.b;
+ }
+}
+
+static inline void
+MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
+{
+ mcl->op = __HYPERVISOR_memory_op;
+ mcl->args[0] = cmd;
+ mcl->args[1] = (unsigned long)arg;
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+{
+ mcl->op = __HYPERVISOR_set_gdt;
+ mcl->args[0] = (unsigned long)frames;
+ mcl->args[1] = entries;
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+}
+
+#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
new file mode 100644
index 000000000000..81fbd735aec4
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERVISOR_H
+#define _ASM_X86_XEN_HYPERVISOR_H
+
+/* arch/i386/kernel/setup.c */
+extern struct shared_info *HYPERVISOR_shared_info;
+extern struct start_info *xen_start_info;
+
+enum xen_domain_type {
+ XEN_NATIVE,
+ XEN_PV_DOMAIN,
+ XEN_HVM_DOMAIN,
+};
+
+extern enum xen_domain_type xen_domain_type;
+
+#ifdef CONFIG_XEN
+#define xen_domain() (xen_domain_type != XEN_NATIVE)
+#else
+#define xen_domain() (0)
+#endif
+
+#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN)
+
+#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
+
+#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
new file mode 100644
index 000000000000..e8506c1f0c55
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface.h
@@ -0,0 +1,175 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_H
+#define _ASM_X86_XEN_INTERFACE_H
+
+#ifdef __XEN__
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef type * __guest_handle_ ## name
+#endif
+
+#define DEFINE_GUEST_HANDLE_STRUCT(name) \
+ __DEFINE_GUEST_HANDLE(name, struct name)
+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
+#define GUEST_HANDLE(name) __guest_handle_ ## name
+
+#ifdef __XEN__
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd).p = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#endif
+#else
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd) = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#endif
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handles for primitive C types. */
+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
+__DEFINE_GUEST_HANDLE(uint, unsigned int);
+__DEFINE_GUEST_HANDLE(ulong, unsigned long);
+DEFINE_GUEST_HANDLE(char);
+DEFINE_GUEST_HANDLE(int);
+DEFINE_GUEST_HANDLE(long);
+DEFINE_GUEST_HANDLE(void);
+#endif
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE 14
+#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: Noone may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+
+#ifndef __ASSEMBLY__
+struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+ uint16_t cs; /* code selector */
+ unsigned long address; /* code offset */
+};
+DEFINE_GUEST_HANDLE_STRUCT(trap_info);
+
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /* Frame containing list of mfns containing list of mfns containing p2m. */
+ unsigned long pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+};
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
+
+#ifndef __ASSEMBLY__
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+struct vcpu_guest_context {
+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_HVM_GUEST (1<<1)
+#define VGCF_IN_KERNEL (1<<2)
+ unsigned long flags; /* VGCF_* flags */
+ struct cpu_user_regs user_regs; /* User-level CPU registers */
+ struct trap_info trap_ctxt[256]; /* Virtual IDT */
+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
+ unsigned long event_callback_cs; /* CS:EIP of event callback */
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
+ unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+#endif
+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif /* _ASM_X86_XEN_INTERFACE_H */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
new file mode 100644
index 000000000000..42a7e004ae5c
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_32_H
+#define _ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip) \
+ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* _ASM_X86_XEN_INTERFACE_32_H */
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
new file mode 100644
index 000000000000..100d2662b97c
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -0,0 +1,159 @@
+#ifndef _ASM_X86_XEN_INTERFACE_64_H
+#define _ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to iteself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip) \
+ ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* _ASM_X86_XEN_INTERFACE_64_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
new file mode 100644
index 000000000000..7ef617ef1df3
--- /dev/null
+++ b/arch/x86/include/asm/xen/page.h
@@ -0,0 +1,170 @@
+#ifndef _ASM_X86_XEN_PAGE_H
+#define _ASM_X86_XEN_PAGE_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/pfn.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+
+/* Xen machine address */
+typedef struct xmaddr {
+ phys_addr_t maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+ phys_addr_t paddr;
+} xpaddr_t;
+
+#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL<<31)
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+
+/* Maximum amount of memory we can handle in a domain in pages */
+#define MAX_DOMAIN_PAGES \
+ ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+
+
+extern unsigned long get_phys_to_machine(unsigned long pfn);
+extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+
+ return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+
+ return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+#if 0
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+ return max_mapnr;
+#endif
+
+ pfn = 0;
+ /*
+ * The array access can fail (e.g., device space beyond end of RAM).
+ * In such cases it doesn't matter what we return (we return garbage),
+ * but we must handle the fault without crashing!
+ */
+ __get_user(pfn, &machine_to_phys_mapping[mfn]);
+
+ return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+ unsigned offset = phys.paddr & ~PAGE_MASK;
+ return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+ unsigned offset = machine.maddr & ~PAGE_MASK;
+ return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+ extern unsigned long max_mapnr;
+ unsigned long pfn = mfn_to_pfn(mfn);
+ if ((pfn < max_mapnr)
+ && !xen_feature(XENFEAT_auto_translated_physmap)
+ && (get_phys_to_machine(pfn) != mfn))
+ return max_mapnr; /* force !pfn_valid() */
+ /* XXX fixme; not true with sparsemem */
+ return pfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+static inline unsigned long pte_mfn(pte_t pte)
+{
+ return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ pte_t pte;
+
+ pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
+ (pgprot_val(pgprot) & __supported_pte_mask);
+
+ return pte;
+}
+
+static inline pteval_t pte_val_ma(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pte_t __pte_ma(pteval_t x)
+{
+ return (pte_t) { .pte = x };
+}
+
+#define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
+#define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+
+#define pgd_val_ma(x) ((x).pgd)
+
+
+xmaddr_t arbitrary_virt_to_machine(void *address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
new file mode 100644
index 000000000000..11b3bb86e17b
--- /dev/null
+++ b/arch/x86/include/asm/xor.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "xor_32.h"
+#else
+# include "xor_64.h"
+#endif
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
new file mode 100644
index 000000000000..133b40a0f495
--- /dev/null
+++ b/arch/x86/include/asm/xor_32.h
@@ -0,0 +1,888 @@
+#ifndef _ASM_X86_XOR_32_H
+#define _ASM_X86_XOR_32_H
+
+/*
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
+#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
+#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
+#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
+#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
+#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
+
+#include <asm/i387.h>
+
+static void
+xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ ST(i, 0) \
+ XO1(i+1, 1) \
+ ST(i+1, 1) \
+ XO1(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ ST(i, 0) \
+ XO2(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO2(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ ST(i, 0) \
+ XO3(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO3(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+
+static void
+xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ ST(i, 0) \
+ XO4(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO4(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " addl $128, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef BLOCK
+
+static void
+xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory" );
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor (%5), %%mm0 ;\n"
+ " pxor 8(%5), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%5), %%mm2 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%5), %%mm3 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 32(%5), %%mm4 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " pxor 40(%5), %%mm5 ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%5), %%mm6 ;\n"
+ " pxor 56(%5), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " addl $64, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_pII_mmx = {
+ .name = "pII_mmx",
+ .do_2 = xor_pII_mmx_2,
+ .do_3 = xor_pII_mmx_3,
+ .do_4 = xor_pII_mmx_4,
+ .do_5 = xor_pII_mmx_5,
+};
+
+static struct xor_block_template xor_block_p5_mmx = {
+ .name = "p5_mmx",
+ .do_2 = xor_p5_mmx_2,
+ .do_3 = xor_p5_mmx_3,
+ .do_4 = xor_p5_mmx_4,
+ .do_5 = xor_p5_mmx_5,
+};
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+#define XMMS_SAVE \
+do { \
+ preempt_disable(); \
+ cr0 = read_cr0(); \
+ clts(); \
+ asm volatile( \
+ "movups %%xmm0,(%0) ;\n\t" \
+ "movups %%xmm1,0x10(%0) ;\n\t" \
+ "movups %%xmm2,0x20(%0) ;\n\t" \
+ "movups %%xmm3,0x30(%0) ;\n\t" \
+ : \
+ : "r" (xmm_save) \
+ : "memory"); \
+} while (0)
+
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
+ "sfence ;\n\t" \
+ "movups (%0),%%xmm0 ;\n\t" \
+ "movups 0x10(%0),%%xmm1 ;\n\t" \
+ "movups 0x20(%0),%%xmm2 ;\n\t" \
+ "movups 0x30(%0),%%xmm3 ;\n\t" \
+ : \
+ : "r" (xmm_save) \
+ : "memory"); \
+ write_cr0(cr0); \
+ preempt_enable(); \
+} while (0)
+
+#define ALIGN16 __attribute__((aligned(16)))
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r"(p2), "+r"(p3)
+ :
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i,0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+ char xmm_save[16*4] ALIGN16;
+ int cr0;
+
+ XMMS_SAVE;
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i,0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i,0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i,0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i,0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i,0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i,0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $256, %1 ;\n"
+ " addl $256, %2 ;\n"
+ " addl $256, %3 ;\n"
+ " addl $256, %4 ;\n"
+ " addl $256, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_pIII_sse = {
+ .name = "pIII_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ if (cpu_has_xmm) \
+ xor_speed(&xor_block_pIII_sse); \
+ if (cpu_has_mmx) { \
+ xor_speed(&xor_block_pII_mmx); \
+ xor_speed(&xor_block_p5_mmx); \
+ } \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+
+#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
new file mode 100644
index 000000000000..1549b5e261f6
--- /dev/null
+++ b/arch/x86/include/asm/xor_64.h
@@ -0,0 +1,361 @@
+#ifndef _ASM_X86_XOR_64_H
+#define _ASM_X86_XOR_64_H
+
+/*
+ * Optimized RAID-5 checksumming functions for MMX and SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+typedef struct {
+ unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
+
+/* Doesn't use gcc to save the XMM registers, because there is no easy way to
+ tell it to do a clts before the register saving. */
+#define XMMS_SAVE \
+do { \
+ preempt_disable(); \
+ asm volatile( \
+ "movq %%cr0,%0 ;\n\t" \
+ "clts ;\n\t" \
+ "movups %%xmm0,(%1) ;\n\t" \
+ "movups %%xmm1,0x10(%1) ;\n\t" \
+ "movups %%xmm2,0x20(%1) ;\n\t" \
+ "movups %%xmm3,0x30(%1) ;\n\t" \
+ : "=&r" (cr0) \
+ : "r" (xmm_save) \
+ : "memory"); \
+} while (0)
+
+#define XMMS_RESTORE \
+do { \
+ asm volatile( \
+ "sfence ;\n\t" \
+ "movups (%1),%%xmm0 ;\n\t" \
+ "movups 0x10(%1),%%xmm1 ;\n\t" \
+ "movups 0x20(%1),%%xmm2 ;\n\t" \
+ "movups 0x30(%1),%%xmm3 ;\n\t" \
+ "movq %0,%%cr0 ;\n\t" \
+ : \
+ : "r" (cr0), "r" (xmm_save) \
+ : "memory"); \
+ preempt_enable(); \
+} while (0)
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned int lines = bytes >> 8;
+ unsigned long cr0;
+ xmm_store_t xmm_save[4];
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] "r" (256UL)
+ : "memory");
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] "r" (256UL)
+ : "memory" );
+
+ XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned int lines = bytes >> 8;
+ xmm_store_t xmm_save[4];
+ unsigned long cr0;
+
+ XMMS_SAVE;
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addq %[inc], %[p1] ;\n"
+ " addq %[inc], %[p2] ;\n"
+ " addq %[inc], %[p3] ;\n"
+ " addq %[inc], %[p4] ;\n"
+ " addq %[inc], %[p5] ;\n"
+ " decl %[cnt] ; jnz 1b"
+ : [cnt] "+c" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
+ [p5] "+r" (p5)
+ : [inc] "r" (256UL)
+ : "memory");
+
+ XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_sse = {
+ .name = "generic_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ xor_speed(&xor_block_sse); \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
new file mode 100644
index 000000000000..08e9a1ac07a9
--- /dev/null
+++ b/arch/x86/include/asm/xsave.h
@@ -0,0 +1,118 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+
+#define XSTATE_FP 0x1
+#define XSTATE_SSE 0x2
+
+#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
+
+#define FXSAVE_SIZE 512
+
+/*
+ * These are the features that the OS can handle currently.
+ */
+#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 pcntxt_mask;
+extern struct xsave_struct *init_xstate_buf;
+
+extern void xsave_cntxt_init(void);
+extern void xsave_init(void);
+extern int init_fpu(struct task_struct *child);
+extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *sw);
+
+static inline int xrstor_checking(struct xsave_struct *fx)
+{
+ int err;
+
+ asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+
+ return err;
+}
+
+static inline int xsave_user(struct xsave_struct __user *buf)
+{
+ int err;
+ __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ _ASM_ALIGN "\n"
+ _ASM_PTR "1b,3b\n"
+ ".previous"
+ : [err] "=r" (err)
+ : "D" (buf), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+ if (unlikely(err) && __clear_user(buf, xstate_size))
+ err = -EFAULT;
+ /* No need to clear here because the caller clears USED_MATH */
+ return err;
+}
+
+static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
+{
+ int err;
+ struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ _ASM_ALIGN "\n"
+ _ASM_PTR "1b,3b\n"
+ ".previous"
+ : [err] "=r" (err)
+ : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
+ : "memory"); /* memory required? */
+ return err;
+}
+
+static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static inline void xsave(struct task_struct *tsk)
+{
+ /* This, however, we can work around by forcing the compiler to select
+ an addressing mode that doesn't require extended registers. */
+ __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
+ : : "D" (&(tsk->thread.xstate->xsave)),
+ "a" (-1), "d"(-1) : "memory");
+}
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 50632e16d01c..eb074530c7d3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -6,11 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
endif
#
@@ -23,9 +25,9 @@ CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
-obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -41,7 +43,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
-obj-y += ds.o
+obj-$(CONFIG_X86_DS) += ds.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
@@ -61,10 +63,11 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
@@ -105,11 +108,15 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
- obj-y += bios_uv.o
+ obj-y += bios_uv.o uv_irq.o uv_sysfs.o
obj-y += genx2apic_cluster.o
obj-y += genx2apic_phys.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
@@ -118,7 +125,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
- obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index eb875cdc7367..65d0b72777ea 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -153,12 +153,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
}
#ifdef CONFIG_PCI_MMCONFIG
+
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
{
if (!strcmp(mcfg->header.oem_id, "SGI"))
@@ -1136,7 +1137,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
return gsi;
}
if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
+ pr_debug("Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
#ifdef CONFIG_X86_32
return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
@@ -1256,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -1276,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count =
acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- NR_IRQ_VECTORS);
+ nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1342,7 +1343,6 @@ static void __init acpi_process_madt(void)
error = acpi_parse_madt_ioapic_entries();
if (!error) {
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_irq_balance_set(NULL);
acpi_ioapic = 1;
smp_found_config = 1;
@@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void)
disable_acpi();
}
}
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic)
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
#endif
return;
}
@@ -1598,6 +1609,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ {}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
/*
* HP laptops which use a DSDT reporting as HP/SB400/10000,
* which includes some code which overrides all temperature
@@ -1726,6 +1742,9 @@ int __init early_acpi_boot_init(void)
int __init acpi_boot_init(void)
{
+ /* those are executed after early-quirks are executed */
+ dmi_check_system(acpi_dmi_table_late);
+
/*
* If acpi_disabled, bail out
* One exception: acpi=ht continues far enough to enumerate LAPICs
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d91b63a..806b4e9051b4 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <asm/segment.h>
+#include <asm/desc.h>
#include "realmode/wakeup.h"
#include "sleep.h"
@@ -21,7 +22,7 @@ unsigned long acpi_realmode_flags;
static unsigned long acpi_realmode;
#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
-static char temp_stack[10240];
+static char temp_stack[4096];
#endif
/**
@@ -97,7 +98,9 @@ int acpi_save_state_mem(void)
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
#ifdef CONFIG_SMP
- stack_start.sp = temp_stack + 4096;
+ stack_start.sp = temp_stack + sizeof(temp_stack);
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_table(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 34e4d112b1ef..2e2da717b350 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -24,6 +24,7 @@
#include <linux/iommu-helper.h>
#include <asm/proto.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
@@ -50,7 +51,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
/* returns !0 if the IOMMU is caching non-present entries in its TLB */
static int iommu_has_npcache(struct amd_iommu *iommu)
{
- return iommu->cap & IOMMU_CAP_NPCACHE;
+ return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
}
/****************************************************************************
@@ -187,6 +188,8 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
spin_lock_irqsave(&iommu->lock, flags);
ret = __iommu_queue_command(iommu, cmd);
+ if (!ret)
+ iommu->need_sync = 1;
spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
@@ -210,10 +213,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
- iommu->need_sync = 0;
-
spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->need_sync)
+ goto out;
+
+ iommu->need_sync = 0;
+
ret = __iommu_queue_command(iommu, &cmd);
if (ret)
@@ -230,8 +236,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
- if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
- printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+ if (unlikely(i == EXIT_LOOP_COUNT))
+ panic("AMD IOMMU: Completion wait loop failed\n");
+
out:
spin_unlock_irqrestore(&iommu->lock, flags);
@@ -254,8 +261,6 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
@@ -281,8 +286,6 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
@@ -295,7 +298,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
u64 address, size_t size)
{
int s = 0;
- unsigned pages = iommu_num_pages(address, size);
+ unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
address &= PAGE_MASK;
@@ -343,7 +346,7 @@ static int iommu_map(struct protection_domain *dom,
u64 __pte, *pte, *page;
bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
/* only support 512GB address spaces for now */
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
@@ -536,6 +539,9 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
{
address >>= PAGE_SHIFT;
iommu_area_free(dom->bitmap, address, pages);
+
+ if (address >= dom->next_bit)
+ dom->need_flush = true;
}
/****************************************************************************
@@ -596,7 +602,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
continue;
p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++i) {
+ for (j = 0; j < 512; ++j) {
if (!IOMMU_PTE_PRESENT(p2[j]))
continue;
p3 = IOMMU_PTE_PAGE(p2[j]);
@@ -680,7 +686,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
iommu->exclusion_start < dma_dom->aperture_size) {
unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length);
+ iommu->exclusion_length,
+ PAGE_SIZE);
dma_ops_reserve_addresses(dma_dom, startpage, pages);
}
@@ -758,8 +765,6 @@ static void set_device_domain(struct amd_iommu *iommu,
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
iommu_queue_inv_dev_entry(iommu, devid);
-
- iommu->need_sync = 1;
}
/*****************************************************************************
@@ -854,6 +859,9 @@ static int get_device_resources(struct device *dev,
print_devid(_bdf, 1);
}
+ if (domain_for_device(_bdf) == NULL)
+ set_device_domain(*iommu, *domain, _bdf);
+
return 1;
}
@@ -904,7 +912,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
if (address >= dom->aperture_size)
return;
- WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+ WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
@@ -916,8 +924,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
/*
* This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is uses by all
- * mapping functions provided by this IOMMU driver.
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided with this IOMMU driver.
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
@@ -935,7 +943,7 @@ static dma_addr_t __map_single(struct device *dev,
unsigned long align_mask = 0;
int i;
- pages = iommu_num_pages(paddr, size);
+ pages = iommu_num_pages(paddr, size, PAGE_SIZE);
paddr &= PAGE_MASK;
if (align)
@@ -977,10 +985,11 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ if ((dma_addr == bad_dma_address) ||
+ (dma_addr + size > dma_dom->aperture_size))
return;
- pages = iommu_num_pages(dma_addr, size);
+ pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
dma_addr &= PAGE_MASK;
start = dma_addr;
@@ -991,8 +1000,10 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
- if (amd_iommu_unmap_flush)
+ if (amd_iommu_unmap_flush || dma_dom->need_flush) {
iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
+ dma_dom->need_flush = false;
+ }
}
/*
@@ -1025,8 +1036,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
if (addr == bad_dma_address)
goto out;
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1054,8 +1064,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1121,8 +1130,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1167,8 +1175,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
s->dma_address = s->dma_length = 0;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1219,8 +1226,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
goto out;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1251,8 +1257,7 @@ static void free_coherent(struct device *dev, size_t size,
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 4cd8083c58be..c625800c55ca 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
#include <asm/amd_iommu_types.h>
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
/*
* definitions for the ACPI scanning code
@@ -121,7 +122,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate; /* if 1, device isolation is enabled */
+int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -212,7 +213,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
/* Programs the physical address of the device table into the IOMMU hardware */
static void __init iommu_set_device_table(struct amd_iommu *iommu)
{
- u32 entry;
+ u64 entry;
BUG_ON(iommu->mmio_base == NULL);
@@ -427,6 +428,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
&entry, sizeof(entry));
+ /* set head and tail to zero manually */
+ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
return cmd_buf;
@@ -1074,7 +1079,8 @@ int __init amd_iommu_init(void)
goto free;
/* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+ amd_iommu_rlookup_table = (void *)__get_free_pages(
+ GFP_KERNEL | __GFP_ZERO,
get_order(rlookup_table_size));
if (amd_iommu_rlookup_table == NULL)
goto free;
@@ -1213,7 +1219,9 @@ static int __init parse_amd_iommu_options(char *str)
for (; *str; ++str) {
if (strncmp(str, "isolate", 7) == 0)
amd_iommu_isolate = 1;
- if (strncmp(str, "fullflush", 11) == 0)
+ if (strncmp(str, "share", 5) == 0)
+ amd_iommu_isolate = 0;
+ if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..676debfc1702 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
/*
* Firmware replacement code.
*
- * Work around broken BIOSes that don't set an aperture or only set the
- * aperture in the AGP bridge.
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small aperture.
+ *
* If all fails map the aperture over some low memory. This is cheaper than
* doing bounce buffering. The memory is lost. This is done at early boot
* because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index 21c831d96af3..b5229affb953 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,14 @@
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
+#include <linux/ioport.h>
#include <linux/cpu.h>
#include <linux/clockchips.h>
#include <linux/acpi_pmtmr.h>
#include <linux/module.h>
#include <linux/dmi.h>
+#include <linux/dmar.h>
+#include <linux/ftrace.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -36,8 +39,14 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
+#include <asm/pgalloc.h>
#include <asm/i8253.h>
#include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
#include <mach_apic.h>
#include <mach_apicdef.h>
@@ -50,16 +59,58 @@
# error SPURIOUS_APIC_VECTOR definition error
#endif
-unsigned long mp_lapic_addr;
-
+#ifdef CONFIG_X86_32
/*
* Knob to control our willingness to enable the local APIC.
*
* +1=force-enable
*/
static int force_enable_local_apic;
-int disable_apic;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ force_enable_local_apic = 1;
+ return 0;
+}
+early_param("lapic", parse_lapic);
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+#endif
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1;
+ notsc_setup(NULL);
+ return 0;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+#ifdef CONFIG_X86_64
+#define HAVE_X2APIC
+#endif
+
+#ifdef HAVE_X2APIC
+int x2apic;
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
+int disable_x2apic;
+static __init int setup_nox2apic(char *str)
+{
+ disable_x2apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __cpuinitdata;
/* Local APIC timer works in C2 */
@@ -110,9 +161,6 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
-
static unsigned long apic_phys;
/*
@@ -202,6 +250,42 @@ static struct apic_ops xapic_ops = {
struct apic_ops __read_mostly *apic_ops = &xapic_ops;
EXPORT_SYMBOL_GPL(apic_ops);
+#ifdef HAVE_X2APIC
+static void x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return;
+}
+
+static u32 safe_x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return 0;
+}
+
+void x2apic_icr_write(u32 low, u32 id)
+{
+ wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+u64 x2apic_icr_read(void)
+{
+ unsigned long val;
+
+ rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+ return val;
+}
+
+static struct apic_ops x2apic_ops = {
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = x2apic_icr_read,
+ .icr_write = x2apic_icr_write,
+ .wait_icr_idle = x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
+};
+#endif
+
/**
* enable_NMI_through_LVT0 - enable NMI through local vector table 0
*/
@@ -219,6 +303,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
apic_write(APIC_LVT0, v);
}
+#ifdef CONFIG_X86_32
/**
* get_physical_broadcast - Get number of physical broadcast IDs
*/
@@ -226,6 +311,7 @@ int get_physical_broadcast(void)
{
return modern_apic() ? 0xff : 0xf;
}
+#endif
/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
@@ -247,11 +333,7 @@ int lapic_get_maxlvt(void)
*/
/* Clock divisor */
-#ifdef CONFG_X86_64
-#define APIC_DIVISOR 1
-#else
#define APIC_DIVISOR 16
-#endif
/*
* This function sets up the local APIC timer, with a timeout of
@@ -360,6 +442,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0xffffffff);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
@@ -383,7 +466,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
* Setup the local APIC timer for this CPU. Copy the initilized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __devinit setup_APIC_timer(void)
+static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
@@ -453,14 +536,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
}
+static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+{
+ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+ const long pm_thresh = pm_100ms / 100;
+ unsigned long mult;
+ u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+ return -1;
+#endif
+
+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+ /* Check, if the PM timer is available */
+ if (!deltapm)
+ return -1;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+ } else {
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ pr_warning("APIC calibration not consistent "
+ "with PM Timer: %ldms instead of 100ms\n",
+ (long)res);
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ pr_info("APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+ }
+
+ return 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
- const long pm_thresh = pm_100ms/100;
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
- long delta, deltapm;
+ long delta;
int pm_referenced = 0;
local_irq_disable();
@@ -470,10 +590,10 @@ static int __init calibrate_APIC_clock(void)
global_clock_event->event_handler = lapic_cal_handler;
/*
- * Setup the APIC counter to 1e9. There is no way the lapic
+ * Setup the APIC counter to maximum. There is no way the lapic
* can underflow in the 100ms detection time frame
*/
- __setup_APIC_LVTT(1000000000, 0, 0);
+ __setup_APIC_LVTT(0xffffffff, 0, 0);
/* Let the interrupts run */
local_irq_enable();
@@ -490,34 +610,9 @@ static int __init calibrate_APIC_clock(void)
delta = lapic_cal_t1 - lapic_cal_t2;
apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
- /* Check, if the PM timer is available */
- deltapm = lapic_cal_pm2 - lapic_cal_pm1;
- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
-
- if (deltapm) {
- unsigned long mult;
- u64 res;
-
- mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
-
- if (deltapm > (pm_100ms - pm_thresh) &&
- deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
- } else {
- res = (((u64) deltapm) * mult) >> 22;
- do_div(res, 1000000);
- printk(KERN_WARNING "APIC calibration not consistent "
- "with PM Timer: %ldms instead of 100ms\n",
- (long)res);
- /* Correct the lapic counter value */
- res = (((u64) delta) * pm_100ms);
- do_div(res, deltapm);
- printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
- "%lu (%ld)\n", (unsigned long) res, delta);
- delta = (long) res;
- }
- pm_referenced = 1;
- }
+ /* we trust the PM based calibration if possible */
+ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+ &delta);
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -552,14 +647,16 @@ static int __init calibrate_APIC_clock(void)
*/
if (calibration_result < (1000000 / HZ)) {
local_irq_enable();
- printk(KERN_WARNING
- "APIC frequency too slow, disabling apic timer\n");
+ pr_warning("APIC frequency too slow, disabling apic timer\n");
return -1;
}
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
- /* We trust the pm timer based calibration */
+ /*
+ * PM timer calibration failed or not turned on
+ * so lets try APIC timer based calibration
+ */
if (!pm_referenced) {
apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -576,13 +673,9 @@ static int __init calibrate_APIC_clock(void)
while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
cpu_relax();
- local_irq_disable();
-
/* Stop the lapic timer */
lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
- local_irq_enable();
-
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
@@ -596,8 +689,7 @@ static int __init calibrate_APIC_clock(void)
local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- printk(KERN_WARNING
- "APIC timer disabled due to verification failure.\n");
+ pr_warning("APIC timer disabled due to verification failure.\n");
return -1;
}
@@ -618,7 +710,7 @@ void __init setup_boot_APIC_clock(void)
* broadcast mechanism is used. On UP systems simply ignore it.
*/
if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
+ pr_info("Disabling APIC timer\n");
/* No broadcast on UP ! */
if (num_possible_cpus() > 1) {
lapic_clockevent.mult = 1;
@@ -645,14 +737,14 @@ void __init setup_boot_APIC_clock(void)
if (nmi_watchdog != NMI_IO_APIC)
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
else
- printk(KERN_WARNING "APIC timer registered as dummy,"
+ pr_warning("APIC timer registered as dummy,"
" due to nmi_watchdog=%d!\n", nmi_watchdog);
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
-void __devinit setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
}
@@ -677,8 +769,7 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- printk(KERN_WARNING
- "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
/* Switch it off */
lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
return;
@@ -687,11 +778,7 @@ static void local_apic_timer_interrupt(void)
/*
* the NMI deadlock-detector uses this.
*/
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
+ inc_irq_stat(apic_timer_irqs);
evt->event_handler(evt);
}
@@ -704,7 +791,7 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -718,6 +805,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
+ exit_idle();
irq_enter();
local_apic_timer_interrupt();
irq_exit();
@@ -991,40 +1079,43 @@ void __init init_bsp_APIC(void)
static void __cpuinit lapic_setup_esr(void)
{
- unsigned long oldvalue, value, maxlvt;
- if (lapic_is_integrated() && !esr_disable) {
- if (esr_disable) {
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- return;
- }
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
+ unsigned int oldvalue, value, maxlvt;
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write(APIC_LVTERR, value);
+ if (!lapic_is_integrated()) {
+ pr_info("No ESR for 82489DX.\n");
+ return;
+ }
+
+ if (esr_disable) {
/*
- * spec says clear errors after enabling vector.
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
*/
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- printk(KERN_INFO "No ESR for 82489DX.\n");
+ pr_info("Leaving ESR disabled.\n");
+ return;
}
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
}
@@ -1033,24 +1124,27 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned long value, integrated;
+ unsigned int value;
int i, j;
+#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (esr_disable) {
+ if (lapic_is_integrated() && esr_disable) {
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
apic_write(APIC_ESR, 0);
}
+#endif
- integrated = lapic_is_integrated();
+ preempt_disable();
/*
* Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
*/
if (!apic_id_registered())
- WARN_ON_ONCE(1);
+ BUG();
/*
* Intel recommends to set DFR, LDR and TPR before enabling
@@ -1096,6 +1190,7 @@ void __cpuinit setup_local_APIC(void)
*/
value |= APIC_SPIV_APIC_ENABLED;
+#ifdef CONFIG_X86_32
/*
* Some unknown Intel IO/APIC (or APIC) errata is biting us with
* certain networking cards. If high frequency interrupts are
@@ -1116,8 +1211,13 @@ void __cpuinit setup_local_APIC(void)
* See also the comment in end_level_ioapic_irq(). --macro
*/
- /* Enable focus processor (bit==0) */
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
/*
* Set spurious IRQ vector
@@ -1154,9 +1254,11 @@ void __cpuinit setup_local_APIC(void)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!integrated) /* 82489DX */
+ if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
+
+ preempt_enable();
}
void __cpuinit end_local_APIC_setup(void)
@@ -1177,6 +1279,146 @@ void __cpuinit end_local_APIC_setup(void)
apic_pm_activate();
}
+#ifdef HAVE_X2APIC
+void check_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+
+ if (msr & X2APIC_ENABLE) {
+ pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
+ x2apic_preenabled = x2apic = 1;
+ apic_ops = &x2apic_ops;
+ }
+}
+
+void enable_x2apic(void)
+{
+ int msr, msr2;
+
+ rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ if (!(msr & X2APIC_ENABLE)) {
+ pr_info("Enabling x2apic\n");
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ }
+}
+
+void __init enable_IR_x2apic(void)
+{
+#ifdef CONFIG_INTR_REMAP
+ int ret;
+ unsigned long flags;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ if (!x2apic_preenabled && disable_x2apic) {
+ pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ "because of nox2apic\n");
+ return;
+ }
+
+ if (x2apic_preenabled && disable_x2apic)
+ panic("Bios already enabled x2apic, can't enforce nox2apic");
+
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ pr_info("Skipped enabling x2apic and Interrupt-remapping "
+ "because of skipping io-apic setup\n");
+ return;
+ }
+
+ ret = dmar_table_init();
+ if (ret) {
+ pr_info("dmar_table_init() failed with %d:\n", ret);
+
+ if (x2apic_preenabled)
+ panic("x2apic enabled by bios. But IR enabling failed");
+ else
+ pr_info("Not enabling x2apic,Intr-remapping\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ mask_8259A();
+
+ ret = save_mask_IO_APIC_setup();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
+ goto end;
+ }
+
+ ret = enable_intr_remapping(1);
+
+ if (ret && x2apic_preenabled) {
+ local_irq_restore(flags);
+ panic("x2apic enabled by bios. But IR enabling failed");
+ }
+
+ if (ret)
+ goto end_restore;
+
+ if (!x2apic) {
+ x2apic = 1;
+ apic_ops = &x2apic_ops;
+ enable_x2apic();
+ }
+
+end_restore:
+ if (ret)
+ /*
+ * IR enabling failed
+ */
+ restore_IO_APIC_setup();
+ else
+ reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+
+end:
+ unmask_8259A();
+ local_irq_restore(flags);
+
+ if (!ret) {
+ if (!x2apic_preenabled)
+ pr_info("Enabled x2apic and interrupt-remapping\n");
+ else
+ pr_info("Enabled Interrupt-remapping\n");
+ } else
+ pr_err("Failed to enable Interrupt-remapping and x2apic\n");
+#else
+ if (!cpu_has_x2apic)
+ return;
+
+ if (x2apic_preenabled)
+ panic("x2apic enabled prior OS handover,"
+ " enable CONFIG_INTR_REMAP");
+
+ pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+ " and x2apic\n");
+#endif
+
+ return;
+}
+#endif /* HAVE_X2APIC */
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+ if (!cpu_has_apic) {
+ pr_info("No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ boot_cpu_physical_apicid = 0;
+ return 0;
+}
+#else
/*
* Detect and initialize APIC
*/
@@ -1209,8 +1451,8 @@ static int __init detect_init_APIC(void)
* "lapic" specified.
*/
if (!force_enable_local_apic) {
- printk(KERN_INFO "Local APIC disabled by BIOS -- "
- "you can enable it with \"lapic\"\n");
+ pr_info("Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
return -1;
}
/*
@@ -1220,8 +1462,7 @@ static int __init detect_init_APIC(void)
*/
rdmsr(MSR_IA32_APICBASE, l, h);
if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- printk(KERN_INFO
- "Local APIC disabled by BIOS -- reenabling.\n");
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
l &= ~MSR_IA32_APICBASE_BASE;
l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
wrmsr(MSR_IA32_APICBASE, l, h);
@@ -1234,7 +1475,7 @@ static int __init detect_init_APIC(void)
*/
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
- printk(KERN_WARNING "Could not enable APIC!\n");
+ pr_warning("Could not enable APIC!\n");
return -1;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -1245,22 +1486,56 @@ static int __init detect_init_APIC(void)
if (l & MSR_IA32_APICBASE_ENABLE)
mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
- printk(KERN_INFO "Found and enabled local APIC!\n");
+ pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
return 0;
no_apic:
- printk(KERN_INFO "No local APIC present or hardware disabled\n");
+ pr_info("No local APIC present or hardware disabled\n");
return -1;
}
+#endif
+
+#ifdef CONFIG_X86_64
+void __init early_init_lapic_mapping(void)
+{
+ unsigned long phys_addr;
+
+ /*
+ * If no local APIC can be found then go out
+ * : it means there is no mpatable and MADT
+ */
+ if (!smp_found_config)
+ return;
+
+ phys_addr = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, phys_addr);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ boot_cpu_physical_apicid = read_apic_id();
+}
+#endif
/**
* init_apic_mappings - initialize APIC mappings
*/
void __init init_apic_mappings(void)
{
+#ifdef HAVE_X2APIC
+ if (x2apic) {
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+#endif
+
/*
* If no local APIC can be found then set up a fake all
* zeroes page to simulate the local APIC and another
@@ -1273,8 +1548,8 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
- apic_phys);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
/*
* Fetch the APIC ID of the BSP in case we have a
@@ -1282,18 +1557,27 @@ void __init init_apic_mappings(void)
*/
if (boot_cpu_physical_apicid == -1U)
boot_cpu_physical_apicid = read_apic_id();
-
}
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
-
int apic_version[MAX_APICS];
int __init APIC_init_uniprocessor(void)
{
+#ifdef CONFIG_X86_64
+ if (disable_apic) {
+ pr_info("Apic disabled\n");
+ return -1;
+ }
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ pr_info("Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
if (!smp_found_config && !cpu_has_apic)
return -1;
@@ -1302,39 +1586,68 @@ int __init APIC_init_uniprocessor(void)
*/
if (!cpu_has_apic &&
APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
+ pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ boot_cpu_physical_apicid);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
return -1;
}
+#endif
- verify_local_APIC();
+#ifdef HAVE_X2APIC
+ enable_IR_x2apic();
+#endif
+#ifdef CONFIG_X86_64
+ setup_apic_routing();
+#endif
+ verify_local_APIC();
connect_bsp_APIC();
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
/*
* Hack: In case of kdump, after a crash, kernel might be booting
* on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
* might be zero if read from MP tables. Get it from LAPIC.
*/
-#ifdef CONFIG_CRASH_DUMP
+# ifdef CONFIG_CRASH_DUMP
boot_cpu_physical_apicid = read_apic_id();
+# endif
#endif
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
-
setup_local_APIC();
+#ifdef CONFIG_X86_64
+ /*
+ * Now enable IO-APICs, actually call clear_IO_APIC
+ * We need clear_IO_APIC before enabling vector on BP
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ enable_IO_APIC();
+#endif
+
#ifdef CONFIG_X86_IO_APIC
if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
#endif
localise_nmi_watchdog();
end_local_APIC_setup();
+
#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config)
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+# ifdef CONFIG_X86_64
+ else
+ nr_ioapics = 0;
+# endif
#endif
+
+#ifdef CONFIG_X86_64
+ setup_boot_APIC_clock();
+ check_nmi_watchdog();
+#else
setup_boot_clock();
+#endif
return 0;
}
@@ -1348,8 +1661,9 @@ int __init APIC_init_uniprocessor(void)
*/
void smp_spurious_interrupt(struct pt_regs *regs)
{
- unsigned long v;
+ u32 v;
+ exit_idle();
irq_enter();
/*
* Check if this really is a spurious interrupt and ACK it
@@ -1360,10 +1674,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
+ inc_irq_stat(irq_spurious_count);
+
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
- "should never happen.\n", smp_processor_id());
- __get_cpu_var(irq_stat).irq_spurious_count++;
+ pr_info("spurious APIC interrupt on CPU#%d, "
+ "should never happen.\n", smp_processor_id());
irq_exit();
}
@@ -1372,8 +1687,9 @@ void smp_spurious_interrupt(struct pt_regs *regs)
*/
void smp_error_interrupt(struct pt_regs *regs)
{
- unsigned long v, v1;
+ u32 v, v1;
+ exit_idle();
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1382,17 +1698,18 @@ void smp_error_interrupt(struct pt_regs *regs)
ack_APIC_irq();
atomic_inc(&irq_err_count);
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+ /*
+ * Here is what the APIC error bits mean:
+ * 0: Send CS error
+ * 1: Receive CS error
+ * 2: Send accept error
+ * 3: Receive accept error
+ * 4: Reserved
+ * 5: Send illegal vector
+ * 6: Received illegal vector
+ * 7: Illegal register address
+ */
+ pr_debug("APIC error on CPU%d: %02x(%02x)\n",
smp_processor_id(), v , v1);
irq_exit();
}
@@ -1496,15 +1813,15 @@ void __cpuinit generic_processor_info(int apicid, int version)
* Validate version
*/
if (version == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
+ pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ version);
version = 0x10;
}
apic_version[apicid] = version;
if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+ pr_warning("WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
return;
}
@@ -1565,6 +1882,13 @@ void __cpuinit generic_processor_info(int apicid, int version)
cpu_set(cpu, cpu_present_map);
}
+#ifdef CONFIG_X86_64
+int hard_smp_processor_id(void)
+{
+ return read_apic_id();
+}
+#endif
+
/*
* Power management
*/
@@ -1640,7 +1964,7 @@ static int lapic_resume(struct sys_device *dev)
local_irq_save(flags);
-#ifdef CONFIG_X86_64
+#ifdef HAVE_X2APIC
if (x2apic)
enable_x2apic();
else
@@ -1702,7 +2026,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __devinit apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -1728,16 +2052,87 @@ static void apic_pm_activate(void) { }
#endif /* CONFIG_PM */
+#ifdef CONFIG_X86_64
/*
- * APIC command line parameters
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
*/
-static int __init parse_lapic(char *arg)
+__cpuinit int apic_is_clustered_box(void)
{
- force_enable_local_apic = 1;
- return 0;
+ int i, clusters, zeros;
+ unsigned id;
+ u16 *bios_cpu_apicid;
+ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+ /*
+ * there is not this kind of box with AMD CPU yet.
+ * Some AMD box with quadcore cpu and 8 sockets apicid
+ * will be [4, 0x23] or [8, 0x27] could be thought to
+ * vsmp box still need checking...
+ */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
+ return 0;
+
+ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
+ bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ /* are we being called early in kernel startup? */
+ if (bios_cpu_apicid) {
+ id = bios_cpu_apicid[i];
+ }
+ else if (i < nr_cpu_ids) {
+ if (cpu_present(i))
+ id = per_cpu(x86_bios_cpu_apicid, i);
+ else
+ continue;
+ }
+ else
+ break;
+
+ if (id != BAD_APICID)
+ __set_bit(APIC_CLUSTERID(id), clustermap);
+ }
+
+ /* Problem: Partially populated chassis may not have CPUs in some of
+ * the APIC clusters they have been allocated. Only present CPUs have
+ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+ * Since clusters are allocated sequentially, count zeros only if
+ * they are bounded by ones.
+ */
+ clusters = 0;
+ zeros = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (test_bit(i, clustermap)) {
+ clusters += 1 + zeros;
+ zeros = 0;
+ } else
+ ++zeros;
+ }
+
+ /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
+ */
+ if (is_vsmp_box() && clusters > 1)
+ return 1;
+
+ /*
+ * If clusters > 2, then should be multi-chassis.
+ * May have to revisit this when multi-core + hyperthreaded CPUs come
+ * out, but AFAIK this will work even for them.
+ */
+ return (clusters > 2);
}
-early_param("lapic", parse_lapic);
+#endif
+/*
+ * APIC command line parameters
+ */
static int __init setup_disableapic(char *arg)
{
disable_apic = 1;
@@ -1779,7 +2174,6 @@ static int __init apic_set_verbosity(char *arg)
if (!arg) {
#ifdef CONFIG_X86_64
skip_ioapic_setup = 0;
- ioapic_force = 1;
return 0;
#endif
return -EINVAL;
@@ -1790,7 +2184,7 @@ static int __init apic_set_verbosity(char *arg)
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 94ddb69ae15e..000000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1848 +0,0 @@
-/*
- * Local APIC handling, local APIC timers
- *
- * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively.
- * Maciej W. Rozycki : Various updates and fixes.
- * Mikael Pettersson : Power Management for UP-APIC.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/clockchips.h>
-#include <linux/acpi_pmtmr.h>
-#include <linux/module.h>
-#include <linux/dmar.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
-#include <asm/proto.h>
-#include <asm/timex.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-#include <mach_ipi.h>
-#include <mach_apic.h>
-
-/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
-static int apic_calibrate_pmtmr __initdata;
-int disable_apic;
-int disable_x2apic;
-int x2apic;
-
-/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-
-/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-/*
- * Debug level, exported for io_apic.c
- */
-unsigned int apic_verbosity;
-
-/* Have we found an MP table */
-int smp_found_config;
-
-static struct resource lapic_resource = {
- .name = "Local APIC",
- .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-static unsigned int calibration_result;
-
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
-static void apic_pm_activate(void);
-
-/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-unsigned long mp_lapic_addr;
-
-/*
- * Get the LAPIC version
- */
-static inline int lapic_get_version(void)
-{
- return GET_APIC_VERSION(apic_read(APIC_LVR));
-}
-
-/*
- * Check, if the APIC is integrated or a separate chip
- */
-static inline int lapic_is_integrated(void)
-{
-#ifdef CONFIG_X86_64
- return 1;
-#else
- return APIC_INTEGRATED(lapic_get_version());
-#endif
-}
-
-/*
- * Check, whether this is a modern or a first generation APIC
- */
-static int modern_apic(void)
-{
- /* AMD systems use old APIC versions, so check the CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 0xf)
- return 1;
- return lapic_get_version() >= 0x14;
-}
-
-/*
- * Paravirt kernels also might be using these below ops. So we still
- * use generic apic_read()/apic_write(), which might be pointing to different
- * ops in PARAVIRT case.
- */
-void xapic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 safe_xapic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
-}
-
-void xapic_icr_write(u32 low, u32 id)
-{
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
- apic_write(APIC_ICR, low);
-}
-
-u64 xapic_icr_read(void)
-{
- u32 icr1, icr2;
-
- icr2 = apic_read(APIC_ICR2);
- icr1 = apic_read(APIC_ICR);
-
- return icr1 | ((u64)icr2 << 32);
-}
-
-static struct apic_ops xapic_ops = {
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = xapic_icr_read,
- .icr_write = xapic_icr_write,
- .wait_icr_idle = xapic_wait_icr_idle,
- .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
-};
-
-struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-EXPORT_SYMBOL_GPL(apic_ops);
-
-static void x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return;
-}
-
-static u32 safe_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return 0;
-}
-
-void x2apic_icr_write(u32 low, u32 id)
-{
- wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-u64 x2apic_icr_read(void)
-{
- unsigned long val;
-
- rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
- return val;
-}
-
-static struct apic_ops x2apic_ops = {
- .read = native_apic_msr_read,
- .write = native_apic_msr_write,
- .icr_read = x2apic_icr_read,
- .icr_write = x2apic_icr_write,
- .wait_icr_idle = x2apic_wait_icr_idle,
- .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
-};
-
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
- unsigned int v;
-
- /* unmask and set to NMI */
- v = APIC_DM_NMI;
-
- /* Level triggered for 82489DX (32bit mode) */
- if (!lapic_is_integrated())
- v |= APIC_LVT_LEVEL_TRIGGER;
-
- apic_write(APIC_LVT0, v);
-}
-
-/**
- * lapic_get_maxlvt - get the maximum number of local vector table entries
- */
-int lapic_get_maxlvt(void)
-{
- unsigned int v;
-
- v = apic_read(APIC_LVR);
- /*
- * - we always have APIC integrated on 64bit mode
- * - 82489DXs do not report # of LVT entries
- */
- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
-}
-
-/*
- * Local APIC timer
- */
-
-/* Clock divisor */
-#ifdef CONFG_X86_64
-#define APIC_DIVISOR 1
-#else
-#define APIC_DIVISOR 16
-#endif
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-{
- unsigned int lvtt_value, tmp_value;
-
- lvtt_value = LOCAL_TIMER_VECTOR;
- if (!oneshot)
- lvtt_value |= APIC_LVT_TIMER_PERIODIC;
- if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-
- if (!irqen)
- lvtt_value |= APIC_LVT_MASKED;
-
- apic_write(APIC_LVTT, lvtt_value);
-
- /*
- * Divide PICLK by 16
- */
- tmp_value = apic_read(APIC_TDCR);
- apic_write(APIC_TDCR,
- (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
- APIC_TDR_DIV_16);
-
- if (!oneshot)
- apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
-}
-
-/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- *
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
-}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
-
-/*
- * Program the next event, relative to now
- */
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- apic_write(APIC_TMICT, delta);
- return 0;
-}
-
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long flags;
- unsigned int v;
-
- /* Lapic used as dummy for broadcast ? */
- if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
-
- local_irq_save(flags);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
-
- local_irq_restore(flags);
-}
-
-/*
- * Local APIC timer broadcast function
- */
-static void lapic_timer_broadcast(cpumask_t mask)
-{
-#ifdef CONFIG_SMP
- send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
-}
-
-/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
- * of the boot CPU and register the clock event in the framework.
- */
-static void setup_APIC_timer(void)
-{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-
- memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
-
- clockevents_register_device(levt);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-#define TICK_COUNT 100000000
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned apic, apic_start;
- unsigned long tsc, tsc_start;
- int result;
-
- local_irq_disable();
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- *
- * No interrupt enable !
- */
- __setup_APIC_LVTT(250000000, 0, 0);
-
- apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
- if (apic_calibrate_pmtmr && pmtmr_ioport) {
- pmtimer_wait(5000); /* 5ms wait */
- apic = apic_read(APIC_TMCCT);
- result = (apic_start - apic) * 1000L / 5;
- } else
-#endif
- {
- rdtscll(tsc_start);
-
- do {
- apic = apic_read(APIC_TMCCT);
- rdtscll(tsc);
- } while ((tsc - tsc_start) < TICK_COUNT &&
- (apic_start - apic) < TICK_COUNT);
-
- result = (apic_start - apic) * 1000L * tsc_khz /
- (tsc - tsc_start);
- }
-
- local_irq_enable();
-
- printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
-
- printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
- result / 1000 / 1000, result / 1000 % 1000);
-
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
-
- calibration_result = (result * APIC_DIVISOR) / HZ;
-
- /*
- * Do a sanity check on the APIC calibration result
- */
- if (calibration_result < (1000000 / HZ)) {
- printk(KERN_WARNING
- "APIC frequency too slow, disabling apic timer\n");
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
-{
- /*
- * The local apic timer can be disabled via the kernel
- * commandline or from the CPU detection code. Register the lapic
- * timer as a dummy clock event source on SMP systems, so the
- * broadcast mechanism is used. On UP systems simply ignore it.
- */
- if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1) {
- lapic_clockevent.mult = 1;
- setup_APIC_timer();
- }
- return;
- }
-
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
-
- if (calibrate_APIC_clock()) {
- /* No broadcast on UP ! */
- if (num_possible_cpus() > 1)
- setup_APIC_timer();
- return;
- }
-
- /*
- * If nmi_watchdog is set to IO_APIC, we need the
- * PIT/HPET going. Otherwise register lapic as a dummy
- * device.
- */
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- printk(KERN_WARNING "APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
-
- /* Setup the lapic or request the broadcast */
- setup_APIC_timer();
-}
-
-void __cpuinit setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer();
-}
-
-/*
- * The guts of the apic timer interrupt
- */
-static void local_apic_timer_interrupt(void)
-{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
-
- /*
- * Normally we should not be here till LAPIC has been initialized but
- * in some cases like kdump, its possible that there is a pending LAPIC
- * timer interrupt from previous kernel's context and is delivered in
- * new kernel the moment interrupts are enabled.
- *
- * Interrupts are enabled early and LAPIC is setup much later, hence
- * its possible that when we get here evt->event_handler is NULL.
- * Check for event_handler being NULL and discard the interrupt as
- * spurious.
- */
- if (!evt->event_handler) {
- printk(KERN_WARNING
- "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
- /* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
- return;
- }
-
- /*
- * the NMI deadlock-detector uses this.
- */
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
-
- evt->event_handler(evt);
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- exit_idle();
- irq_enter();
- local_apic_timer_interrupt();
- irq_exit();
-
- set_irq_regs(old_regs);
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
-
-/*
- * Local APIC start and shutdown
- */
-
-/**
- * clear_local_APIC - shutdown the local APIC
- *
- * This is called, when a CPU is disabled and before rebooting, so the state of
- * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
- * leftovers during boot.
- */
-void clear_local_APIC(void)
-{
- int maxlvt;
- u32 v;
-
- /* APIC hasn't been mapped yet */
- if (!apic_phys)
- return;
-
- maxlvt = lapic_get_maxlvt();
- /*
- * Masking an LVT entry can trigger a local APIC error
- * if the vector is zero. Mask LVTERR first to prevent this.
- */
- if (maxlvt >= 3) {
- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- }
- /*
- * Careful: we have to set masks only first to deassert
- * any level-triggered sources.
- */
- v = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT1);
- apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
- if (maxlvt >= 4) {
- v = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
- }
-
- /* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
- if (maxlvt >= 5) {
- v = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
- }
-#endif
- /*
- * Clean APIC state for other OSs:
- */
- apic_write(APIC_LVTT, APIC_LVT_MASKED);
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- apic_write(APIC_LVT1, APIC_LVT_MASKED);
- if (maxlvt >= 3)
- apic_write(APIC_LVTERR, APIC_LVT_MASKED);
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-
- /* Integrated APIC (!82489DX) ? */
- if (lapic_is_integrated()) {
- if (maxlvt > 3)
- /* Clear ESR due to Pentium errata 3AP and 11AP */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-}
-
-/**
- * disable_local_APIC - clear and disable the local APIC
- */
-void disable_local_APIC(void)
-{
- unsigned int value;
-
- clear_local_APIC();
-
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_SPIV_APIC_ENABLED;
- apic_write(APIC_SPIV, value);
-
-#ifdef CONFIG_X86_32
- /*
- * When LAPIC was disabled by the BIOS and enabled by the kernel,
- * restore the disabled state.
- */
- if (enabled_via_apicbase) {
- unsigned int l, h;
-
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_ENABLE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-#endif
-}
-
-/*
- * If Linux enabled the LAPIC against the BIOS default disable it down before
- * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
- * not power-off. Additionally clear all LVT entries before disable_local_APIC
- * for the case where Linux didn't enable the LAPIC.
- */
-void lapic_shutdown(void)
-{
- unsigned long flags;
-
- if (!cpu_has_apic)
- return;
-
- local_irq_save(flags);
-
-#ifdef CONFIG_X86_32
- if (!enabled_via_apicbase)
- clear_local_APIC();
- else
-#endif
- disable_local_APIC();
-
-
- local_irq_restore(flags);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ APIC_ID_MASK))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
-/**
- * sync_Arb_IDs - synchronize APIC bus arbitration IDs
- */
-void __init sync_Arb_IDs(void)
-{
- /*
- * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
- * needed on AMD.
- */
- if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- return;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
- unsigned int value;
-
- /*
- * Don't do the setup now if we have a SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !cpu_has_apic)
- return;
-
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
-
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
-
-#ifdef CONFIG_X86_32
- /* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 15))
- value &= ~APIC_SPIV_FOCUS_DISABLED;
- else
-#endif
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up the virtual wire mode.
- */
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- if (!lapic_is_integrated()) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write(APIC_LVT1, value);
-}
-
-static void __cpuinit lapic_setup_esr(void)
-{
- unsigned long oldvalue, value, maxlvt;
- if (lapic_is_integrated() && !esr_disable) {
- if (esr_disable) {
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- return;
- }
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
-
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- printk(KERN_INFO "No ESR for 82489DX.\n");
- }
-}
-
-
-/**
- * setup_local_APIC - setup the local APIC
- */
-void __cpuinit setup_local_APIC(void)
-{
- unsigned int value;
- int i, j;
-
- preempt_disable();
- value = apic_read(APIC_LVR);
-
- BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- if (!apic_id_registered())
- BUG();
-
- /*
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
- init_apic_ldr();
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write(APIC_TASKPRI, value);
-
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
- }
- }
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* We always use processor focus */
-
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessary in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && !value) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
- preempt_enable();
-}
-
-void __cpuinit end_local_APIC_setup(void)
-{
- lapic_setup_esr();
-
-#ifdef CONFIG_X86_32
- {
- unsigned int value;
- /* Disable the local apic timer */
- value = apic_read(APIC_LVTT);
- value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, value);
- }
-#endif
-
- setup_apic_nmi_watchdog(NULL);
- apic_pm_activate();
-}
-
-void check_x2apic(void)
-{
- int msr, msr2;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
-
- if (msr & X2APIC_ENABLE) {
- printk("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic = 1;
- apic_ops = &x2apic_ops;
- }
-}
-
-void enable_x2apic(void)
-{
- int msr, msr2;
-
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
- if (!(msr & X2APIC_ENABLE)) {
- printk("Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
- }
-}
-
-void enable_IR_x2apic(void)
-{
-#ifdef CONFIG_INTR_REMAP
- int ret;
- unsigned long flags;
-
- if (!cpu_has_x2apic)
- return;
-
- if (!x2apic_preenabled && disable_x2apic) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
- return;
- }
-
- if (x2apic_preenabled && disable_x2apic)
- panic("Bios already enabled x2apic, can't enforce nox2apic");
-
- if (!x2apic_preenabled && skip_ioapic_setup) {
- printk(KERN_INFO
- "Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
- return;
- }
-
- ret = dmar_table_init();
- if (ret) {
- printk(KERN_INFO
- "dmar_table_init() failed with %d:\n", ret);
-
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else
- printk(KERN_INFO
- "Not enabling x2apic,Intr-remapping\n");
- return;
- }
-
- local_irq_save(flags);
- mask_8259A();
- save_mask_IO_APIC_setup();
-
- ret = enable_intr_remapping(1);
-
- if (ret && x2apic_preenabled) {
- local_irq_restore(flags);
- panic("x2apic enabled by bios. But IR enabling failed");
- }
-
- if (ret)
- goto end;
-
- if (!x2apic) {
- x2apic = 1;
- apic_ops = &x2apic_ops;
- enable_x2apic();
- }
-end:
- if (ret)
- /*
- * IR enabling failed
- */
- restore_IO_APIC_setup();
- else
- reinit_intr_remapped_IO_APIC(x2apic_preenabled);
-
- unmask_8259A();
- local_irq_restore(flags);
-
- if (!ret) {
- if (!x2apic_preenabled)
- printk(KERN_INFO
- "Enabled x2apic and interrupt-remapping\n");
- else
- printk(KERN_INFO
- "Enabled Interrupt-remapping\n");
- } else
- printk(KERN_ERR
- "Failed to enable Interrupt-remapping and x2apic\n");
-#else
- if (!cpu_has_x2apic)
- return;
-
- if (x2apic_preenabled)
- panic("x2apic enabled prior OS handover,"
- " enable CONFIG_INTR_REMAP");
-
- printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
-#endif
-
- return;
-}
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-static int __init detect_init_APIC(void)
-{
- if (!cpu_has_apic) {
- printk(KERN_INFO "No local APIC present\n");
- return -1;
- }
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
- return 0;
-}
-
-void __init early_init_lapic_mapping(void)
-{
- unsigned long phys_addr;
-
- /*
- * If no local APIC can be found then go out
- * : it means there is no mpatable and MADT
- */
- if (!smp_found_config)
- return;
-
- phys_addr = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, phys_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
-}
-
-/**
- * init_apic_mappings - initialize APIC mappings
- */
-void __init init_apic_mappings(void)
-{
- if (x2apic) {
- boot_cpu_physical_apicid = read_apic_id();
- return;
- }
-
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
- if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
- apic_phys = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, apic_phys);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int apic_version[MAX_APICS];
-
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- printk(KERN_INFO "Apic disabled\n");
- return -1;
- }
- if (!cpu_has_apic) {
- disable_apic = 1;
- printk(KERN_INFO "Apic disabled by BIOS\n");
- return -1;
- }
-
- enable_IR_x2apic();
- setup_apic_routing();
-
- verify_local_APIC();
-
- connect_bsp_APIC();
-
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-
- setup_local_APIC();
-
- /*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling vector on BP
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
- localise_nmi_watchdog();
- end_local_APIC_setup();
-
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else
- nr_ioapics = 0;
- setup_boot_APIC_clock();
- check_nmi_watchdog();
- return 0;
-}
-
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
- unsigned int v;
- exit_idle();
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- add_pda(irq_spurious_count, 1);
- irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-asmlinkage void smp_error_interrupt(void)
-{
- unsigned int v, v1;
-
- exit_idle();
- irq_enter();
- /* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
- atomic_inc(&irq_err_count);
-
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
- irq_exit();
-}
-
-/**
- * connect_bsp_APIC - attach the APIC to the interrupt system
- */
-void __init connect_bsp_APIC(void)
-{
-#ifdef CONFIG_X86_32
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
- * local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
-#endif
- enable_apic_mode();
-}
-
-/**
- * disconnect_bsp_APIC - detach the APIC from the interrupt system
- * @virt_wire_setup: indicates, whether virtual wire mode is selected
- *
- * Virtual wire mode is necessary to deliver legacy interrupts even when the
- * APIC is disabled.
- */
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
- unsigned int value;
-
-#ifdef CONFIG_X86_32
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect only on
- * certain older boards). Note that APIC interrupts, including
- * IPIs, won't work beyond this point! The only exception are
- * INIT IPIs.
- */
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- return;
- }
-#endif
-
- /* Go back to Virtual Wire compatibility mode */
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /*
- * For LVT0 make it edge triggered, active high,
- * external and enabled
- */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- } else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
-
- /*
- * For LVT1 make it edge triggered, active high,
- * nmi and enabled
- */
- value = apic_read(APIC_LVT1);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
-}
-
-void __cpuinit generic_processor_info(int apicid, int version)
-{
- int cpu;
- cpumask_t tmp_map;
-
- /*
- * Validate version
- */
- if (version == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
- version = 0x10;
- }
- apic_version[apicid] = version;
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
-
- physid_set(apicid, phys_cpu_present_map);
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
-#ifdef CONFIG_X86_32
- /*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
- */
- if (max_physical_apicid >= 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-#endif
-
-#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
- /* are we being called early in kernel startup? */
- if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
- cpu_to_apicid[cpu] = apicid;
- bios_cpu_apicid[cpu] = apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
- }
-#endif
-
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
-}
-
-int hard_smp_processor_id(void)
-{
- return read_apic_id();
-}
-
-/*
- * Power management
- */
-#ifdef CONFIG_PM
-
-static struct {
- /*
- * 'active' is true if the local APIC was enabled by us and
- * not the BIOS; this signifies that we are also responsible
- * for disabling it before entering apm/acpi suspend
- */
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- apic_pm_state.apic_id = apic_read(APIC_ID);
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- if (maxlvt >= 4)
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
- if (maxlvt >= 5)
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
- int maxlvt;
-
- if (!apic_pm_state.active)
- return 0;
-
- maxlvt = lapic_get_maxlvt();
-
- local_irq_save(flags);
-
-#ifdef CONFIG_X86_64
- if (x2apic)
- enable_x2apic();
- else
-#endif
- {
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
- if (maxlvt >= 5)
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
- if (maxlvt >= 4)
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
- .name = "lapic",
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- /*
- * there is not this kind of box with AMD CPU yet.
- * Some AMD box with quadcore cpu and 8 sockets apicid
- * will be [4, 0x23] or [8, 0x27] could be thought to
- * vsmp box still need checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < NR_CPUS; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- }
- else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- }
- else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
-
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
- return 1;
-
- /*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
- */
- return (clusters > 2);
-}
-
-static __init int setup_nox2apic(char *str)
-{
- disable_x2apic = 1;
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-
-
-/*
- * APIC command line parameters
- */
-static int __init setup_disableapic(char *arg)
-{
- disable_apic = 1;
- setup_clear_cpu_cap(X86_FEATURE_APIC);
- return 0;
-}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
-early_param("nolapic", setup_nolapic);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
- local_apic_timer_c2_ok = 1;
- return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static int __init parse_disable_apic_timer(char *arg)
-{
- disable_apic_timer = 1;
- return 0;
-}
-early_param("noapictimer", parse_disable_apic_timer);
-
-static int __init parse_nolapic_timer(char *arg)
-{
- disable_apic_timer = 1;
- return 0;
-}
-early_param("nolapic_timer", parse_nolapic_timer);
-
-static __init int setup_apicpmtimer(char *s)
-{
- apic_calibrate_pmtmr = 1;
- notsc_setup(NULL);
- return 0;
-}
-__setup("apicpmtimer", setup_apicpmtimer);
-
-static int __init apic_set_verbosity(char *arg)
-{
- if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- return 0;
-#endif
- return -EINVAL;
- }
-
- if (strcmp("debug", arg) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", arg) == 0)
- apic_verbosity = APIC_VERBOSE;
- else {
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug\n", arg);
- return -EINVAL;
- }
-
- return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static int __init lapic_insert_resource(void)
-{
- if (!apic_phys)
- return -1;
-
- /* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
- lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
- insert_resource(&iomem_resource, &lapic_resource);
-
- return 0;
-}
-
-/*
- * need call insert after e820_reserve_resources()
- * that is using request_resource
- */
-late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..3a26525a3f31 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@ static int power_off;
#else
static int power_off = 1;
#endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
static int realmode_power_off;
-#endif
#ifdef CONFIG_APM_ALLOW_INTS
static int allow_ints = 1;
#else
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88f..ee4df08feee6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
#include <linux/suspend.h>
#include <linux/kbuild.h>
#include <asm/ucontext.h>
-#include "sigframe.h"
+#include <asm/sigframe.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 505543a75a56..1d41d3f1edbc 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,9 +20,11 @@
#include <xen/interface/xen.h>
+#include <asm/sigframe.h>
+
#define __NO_STUBS 1
#undef __SYSCALL
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_UNISTD_64_H
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
@@ -87,7 +89,7 @@ int main(void)
BLANK();
#undef ENTRY
DEFINE(IA32_RT_SIGFRAME_sigcontext,
- offsetof (struct rt_sigframe32, uc.uc_mcontext));
+ offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
BLANK();
#endif
DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index fdd585f9c53d..2a0a2a3cac26 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -1,8 +1,6 @@
/*
* BIOS run time interface routines.
*
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -16,33 +14,178 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
*/
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <linux/io.h>
#include <asm/uv/bios.h>
+#include <asm/uv/uv_hub.h>
-const char *
-x86_bios_strerror(long status)
+struct uv_systab uv_systab;
+
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
- const char *str;
- switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
- }
- return str;
+ struct uv_systab *tab = &uv_systab;
+
+ if (!tab->function)
+ /*
+ * BIOS does not support UV systab
+ */
+ return BIOS_STATUS_UNIMPLEMENTED;
+
+ return efi_call6((void *)__va(tab->function),
+ (u64)which, a1, a2, a3, a4, a5);
+}
+
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ unsigned long bios_flags;
+ s64 ret;
+
+ local_irq_save(bios_flags);
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ local_irq_restore(bios_flags);
+
+ return ret;
+}
+
+s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ s64 ret;
+
+ preempt_disable();
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ preempt_enable();
+
+ return ret;
+}
+
+
+long sn_partition_id;
+EXPORT_SYMBOL_GPL(sn_partition_id);
+long sn_coherency_id;
+EXPORT_SYMBOL_GPL(sn_coherency_id);
+long sn_region_size;
+EXPORT_SYMBOL_GPL(sn_region_size);
+int uv_type;
+
+
+s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
+ long *region)
+{
+ s64 ret;
+ u64 v0, v1;
+ union partition_info_u part;
+
+ ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
+ (u64)(&v0), (u64)(&v1), 0, 0);
+ if (ret != BIOS_STATUS_SUCCESS)
+ return ret;
+
+ part.val = v0;
+ if (uvtype)
+ *uvtype = part.hub_version;
+ if (partid)
+ *partid = part.partition_id;
+ if (coher)
+ *coher = part.coherence_id;
+ if (region)
+ *region = part.region_size;
+ return ret;
}
-long
-x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
- unsigned long *drift_info)
+int
+uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
+ unsigned long *intr_mmr_offset)
{
- struct uv_bios_retval isrv;
+ union uv_watchlist_u size_blade;
+ u64 watchlist;
+ s64 ret;
- BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
- *ticks_per_second = isrv.v0;
- *drift_info = isrv.v1;
- return isrv.status;
+ size_blade.size = mq_size;
+ size_blade.blade = blade;
+
+ /*
+ * bios returns watchlist number or negative error number.
+ */
+ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
+ size_blade.val, (u64)intr_mmr_offset,
+ (u64)&watchlist, 0);
+ if (ret < BIOS_STATUS_SUCCESS)
+ return ret;
+
+ return watchlist;
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
+
+int
+uv_bios_mq_watchlist_free(int blade, int watchlist_num)
+{
+ return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
+ blade, watchlist_num, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
+
+s64
+uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
+{
+ return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
+ perms, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
+
+s64
+uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
+{
+ s64 ret;
+
+ ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+ (u64)addr, buf, (u64)len, 0);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
+
+s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
+{
+ return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
+ (u64)ticks_per_second, 0, 0, 0);
}
-EXPORT_SYMBOL_GPL(x86_bios_freq_base);
+EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+
+
+#ifdef CONFIG_EFI
+void uv_bios_init(void)
+{
+ struct uv_systab *tab;
+
+ if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+ (efi.uv_systab == (unsigned long)NULL)) {
+ printk(KERN_CRIT "No EFI UV System Table.\n");
+ uv_systab.function = (unsigned long)NULL;
+ return;
+ }
+
+ tab = (struct uv_systab *)ioremap(efi.uv_systab,
+ sizeof(struct uv_systab));
+ if (strncmp(tab->signature, "UVST", 4) != 0)
+ printk(KERN_ERR "bad signature in UV system table!");
+
+ /*
+ * Copy table to permanent spot for later use.
+ */
+ memcpy(&uv_systab, tab, sizeof(struct uv_systab));
+ iounmap(tab);
+
+ printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+}
+#else /* !CONFIG_EFI */
+
+void uv_bios_init(void) { }
+#endif
+
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000000..2ac0ab71412a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+
+/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable. Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#define MAX_SCAN_AREAS 8
+
+static int __read_mostly memory_corruption_check = -1;
+
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+
+static __init int set_corruption_check(char *arg)
+{
+ char *end;
+
+ memory_corruption_check = simple_strtol(arg, &end, 10);
+
+ return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+static __init int set_corruption_check_period(char *arg)
+{
+ char *end;
+
+ corruption_check_period = simple_strtoul(arg, &end, 10);
+
+ return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+static __init int set_corruption_check_size(char *arg)
+{
+ char *end;
+ unsigned size;
+
+ size = memparse(arg, &end);
+
+ if (*end == '\0')
+ corruption_check_size = size;
+
+ return (size == corruption_check_size) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+
+void __init setup_bios_corruption_check(void)
+{
+ u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+
+ if (memory_corruption_check == -1) {
+ memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ 1
+#else
+ 0
+#endif
+ ;
+ }
+
+ if (corruption_check_size == 0)
+ memory_corruption_check = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+ while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
+ u64 size;
+ addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+
+ if (addr == 0)
+ break;
+
+ if ((addr + size) > corruption_check_size)
+ size = corruption_check_size - addr;
+
+ if (size == 0)
+ break;
+
+ e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+ scan_areas[num_scan_areas].addr = addr;
+ scan_areas[num_scan_areas].size = size;
+ num_scan_areas++;
+
+ /* Assume we've already mapped this early memory */
+ memset(__va(addr), 0, size);
+
+ addr += size;
+ }
+
+ printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
+ num_scan_areas);
+ update_e820();
+}
+
+
+void check_for_bios_corruption(void)
+{
+ int i;
+ int corruption = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ for (i = 0; i < num_scan_areas; i++) {
+ unsigned long *addr = __va(scan_areas[i].addr);
+ unsigned long size = scan_areas[i].size;
+
+ for (; size; addr++, size -= sizeof(unsigned long)) {
+ if (!*addr)
+ continue;
+ printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+ addr, __pa(addr), *addr);
+ corruption = 1;
+ *addr = 0;
+ }
+ }
+
+ WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+static void check_corruption(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
+
+static void check_corruption(struct work_struct *dummy)
+{
+ check_for_bios_corruption();
+ schedule_delayed_work(&bios_check_work,
+ round_jiffies_relative(corruption_check_period*HZ));
+}
+
+static int start_periodic_check_for_corruption(void)
+{
+ if (!memory_corruption_check || corruption_check_period == 0)
+ return 0;
+
+ printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
+ corruption_check_period);
+
+ /* First time we run the checks right away */
+ schedule_delayed_work(&bios_check_work, 0);
+ return 0;
+}
+
+module_init(start_periodic_check_for_corruption);
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000000..667df55a4399
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7f0b45a5d788..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
# Makefile for x86-compatible CPU details and quirks
#
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+endif
+
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
+obj-y += vmware.o hypervisor.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
@@ -25,7 +31,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
-cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 0d9c993aa93e..2cf23634b6d9 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
*/
void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
@@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
& core_select_mask;
c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+ /*
+ * Reinit the apicid, now that we have extended initial_apicid.
+ */
+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
#else
c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+ /*
+ * Reinit the apicid, now that we have extended initial_apicid.
+ */
+ c->apicid = phys_pkg_id(0);
#endif
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 32e73520adf7..7c878f6aa919 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
}
numa_set_node(cpu, node);
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
{
early_init_amd_mc(c);
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 25581dcb280e..42e0853030cb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -20,6 +20,7 @@
#include <asm/pat.h>
#include <asm/asm.h>
#include <asm/numa.h>
+#include <asm/smp.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/mpspec.h>
#include <asm/apic.h>
@@ -35,6 +36,7 @@
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
#include "cpu.h"
@@ -549,6 +551,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_early_init(c);
validate_pat_support(c);
+
+#ifdef CONFIG_SMP
+ c->cpu_index = boot_cpu_id;
+#endif
}
void __init early_cpu_init(void)
@@ -698,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
detect_ht(c);
#endif
+ init_hypervisor(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -857,7 +864,7 @@ EXPORT_SYMBOL(_cpu_pda);
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
void __cpuinit pda_init(int cpu)
{
@@ -898,8 +905,8 @@ void __cpuinit pda_init(int cpu)
}
}
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ] __page_aligned_bss;
+static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+ DEBUG_STKSZ] __page_aligned_bss;
extern asmlinkage void ignore_sysret(void);
@@ -1134,7 +1141,7 @@ void __cpuinit cpu_init(void)
/*
* Boot processor to setup the FP and extended state context info.
*/
- if (!smp_processor_id())
+ if (smp_processor_id() == boot_cpu_id)
init_thread_xstate();
xsave_init();
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index c24c4a487b7c..88ea02dcb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
#include <linux/acpi.h>
#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
int result = 0;
+ struct power_trace it;
dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
+ trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -780,6 +784,9 @@ static int __init acpi_cpufreq_init(void)
{
int ret;
+ if (acpi_disabled)
+ return 0;
+
dprintk("acpi_cpufreq_init\n");
ret = acpi_cpufreq_early_init();
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d51..b0461856acfb 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
/*
- * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) 2001-2004 Dave Jones. <davej@redhat.com>
* (C) 2002 Padraig Brady. <padraig@antefacto.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
module_param(revid_errata, int, 0644);
MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index b5ced806a316..c1ac5790c63e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void)
}
-MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b71..7c7d56b43136 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
/*
* AMD K7 Powernow driver.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
+ * (C) 2003 Dave Jones on behalf of SuSE Labs.
* (C) 2003-2004 Dave Jones <davej@redhat.com>
*
* Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
module_param(acpi_force, int, 0444);
MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 84bb395038d8..7f05f44b97e9 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
* Support : mark.langsdorf@amd.com
*
* Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
+ * (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
* (C) 2004 Pavel Machek <pavel@suse.cz>
* Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
#endif
#define PFX "powernow-k8: "
-#define BFX PFX "BIOS error: "
#define VERSION "version 2.20.00"
#include "powernow-k8.h"
@@ -116,9 +115,20 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
u32 i = 0;
if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
+ if (data->currpstate == HW_PSTATE_INVALID) {
+ /* read (initial) hw pstate if not yet set */
+ rdmsr(MSR_PSTATE_STATUS, lo, hi);
+ i = lo & HW_PSTATE_MASK;
+
+ /*
+ * a workaround for family 11h erratum 311 might cause
+ * an "out-of-range Pstate if the core is in Pstate-0
+ */
+ if (i >= data->numps)
+ data->currpstate = HW_PSTATE_0;
+ else
+ data->currpstate = i;
+ }
return 0;
}
do {
@@ -536,35 +546,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
for (j = 0; j < data->numps; j++) {
if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
+ printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
+ j, pst[j].vid);
return -EINVAL;
}
if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
- printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
- printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j);
+ printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
+ " %d\n", j);
return -ENODEV;
}
if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
/* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
+ printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
+ "0x%x\n", j, pst[j].fid);
return -EINVAL;
}
if (pst[j].fid < lastfid)
lastfid = pst[j].fid;
}
if (lastfid & 1) {
- printk(KERN_ERR BFX "lastfid invalid\n");
+ printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
return -EINVAL;
}
if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO BFX "first fid not from lo freq table\n");
+ printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n");
return 0;
}
@@ -672,13 +687,13 @@ static int find_psb_table(struct powernow_k8_data *data)
dprintk("table vers: 0x%x\n", psb->tableversion);
if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR BFX "PSB table is not v1.4\n");
+ printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
return -ENODEV;
}
dprintk("flags: 0x%x\n", psb->flags1);
if (psb->flags1) {
- printk(KERN_ERR BFX "unknown flags\n");
+ printk(KERN_ERR FW_BUG PFX "unknown flags\n");
return -ENODEV;
}
@@ -705,7 +720,7 @@ static int find_psb_table(struct powernow_k8_data *data)
}
}
if (cpst != 1) {
- printk(KERN_ERR BFX "numpst must be 1\n");
+ printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
return -ENODEV;
}
@@ -1117,6 +1132,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
}
data->cpu = pol->cpu;
+ data->currpstate = HW_PSTATE_INVALID;
if (powernow_k8_cpu_init_acpi(data)) {
/*
@@ -1130,17 +1146,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
"ACPI Processor module before starting this "
"driver.\n");
#else
- printk(KERN_ERR PFX "Your BIOS does not provide ACPI "
- "_PSS objects in a way that Linux understands. "
- "Please report this to the Linux ACPI maintainers"
- " and complain to your BIOS vendor.\n");
+ printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
+ " ACPI _PSS objects in a way that Linux "
+ "understands. Please report this to the Linux "
+ "ACPI maintainers and complain to your BIOS "
+ "vendor.\n");
#endif
kfree(data);
return -ENODEV;
}
if (pol->cpu != 0) {
- printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than "
- "CPU0. Complain to your BIOS vendor.\n");
+ printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
+ "CPU other than CPU0. Complain to your BIOS "
+ "vendor.\n");
kfree(data);
return -ENODEV;
}
@@ -1193,7 +1211,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
/* min/max the cpu is capable of */
if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR PFX "invalid powernow_table\n");
+ printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
powernow_k8_cpu_exit_acpi(data);
kfree(data->powernow_table);
kfree(data);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d96..65cfb5d7f77f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,6 +5,19 @@
* http://www.gnu.org/licenses/gpl.html
*/
+
+enum pstate {
+ HW_PSTATE_INVALID = 0xff,
+ HW_PSTATE_0 = 0,
+ HW_PSTATE_1 = 1,
+ HW_PSTATE_2 = 2,
+ HW_PSTATE_3 = 3,
+ HW_PSTATE_4 = 4,
+ HW_PSTATE_5 = 5,
+ HW_PSTATE_6 = 6,
+ HW_PSTATE_7 = 7,
+};
+
struct powernow_k8_data {
unsigned int cpu;
@@ -23,7 +36,9 @@ struct powernow_k8_data {
u32 exttype; /* extended interface = 1 */
/* keep track of the current fid / vid or pstate */
- u32 currvid, currfid, currpstate;
+ u32 currvid;
+ u32 currfid;
+ enum pstate currpstate;
/* the powernow_table includes all frequency and vid/fid pairings:
* fid are the lower 8 bits of the index, vid are the upper 8 bits.
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 191f7263c61d..04d0376b64b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
}
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
MODULE_LICENSE ("GPL");
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000000..fb5b86af0b01
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
+/*
+ * Common hypervisor code
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/vmware.h>
+#include <asm/hypervisor.h>
+
+static inline void __cpuinit
+detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+{
+ if (vmware_platform()) {
+ c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+ } else {
+ c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
+ }
+}
+
+unsigned long get_hypervisor_tsc_freq(void)
+{
+ if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
+ return vmware_get_tsc_khz();
+ return 0;
+}
+
+static inline void __cpuinit
+hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+ vmware_set_feature_bits(c);
+ return;
+ }
+}
+
+void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+{
+ detect_hypervisor_vendor(c);
+ hypervisor_set_feature_bits(c);
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 99468dbd08da..8ea6929e974c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
-#include <asm/ptrace.h>
#include <asm/ds.h>
#include <asm/bugs.h>
@@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 15 && c->x86_cache_alignment == 64)
c->x86_cache_alignment = 128;
#endif
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
}
#ifdef CONFIG_X86_32
@@ -174,7 +183,7 @@ static void __cpuinit srat_detect_node(void)
node = first_node(node_online_map);
numa_set_node(cpu, node);
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+ printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
#endif
}
@@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
intel_workarounds(c);
+ /*
+ * Detect the extended topology information if available. This
+ * will reinitialise the initial_apicid which will be used
+ * in init_intel_cacheinfo()
+ */
+ detect_extended_topology(c);
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -307,13 +323,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
-
- if (cpu_has_bts)
- ptrace_bts_init_intel(c);
-
#endif
- detect_extended_topology(c);
if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
/*
* let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf1..68b5d8681cbb 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -644,20 +644,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
return show_shared_cpu_map_func(leaf, 1, buf);
}
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
- switch(this_leaf->eax.split.type) {
- case CACHE_TYPE_DATA:
+static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+{
+ switch (this_leaf->eax.split.type) {
+ case CACHE_TYPE_DATA:
return sprintf(buf, "Data\n");
- break;
- case CACHE_TYPE_INST:
+ case CACHE_TYPE_INST:
return sprintf(buf, "Instruction\n");
- break;
- case CACHE_TYPE_UNIFIED:
+ case CACHE_TYPE_UNIFIED:
return sprintf(buf, "Unified\n");
- break;
- default:
+ default:
return sprintf(buf, "Unknown\n");
- break;
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f66351..dd3af6e7b39a 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
/*
- * Athlon/Hammer specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
+ * Athlon specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8cd..0ebf3fc6a610 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
/*
* mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
+ * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
*/
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac856..1c838032fd37 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
- static cpumask_t mce_cpus = CPU_MASK_NONE;
-
mce_cpu_quirks(c);
if (mce_dont_init ||
- cpu_test_and_set(smp_processor_id(), mce_cpus) ||
!mce_available(c))
return;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a4b2e9..748c8f9e7a05 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void)
}
}
out:
- add_pda(irq_threshold_count, 1);
+ inc_irq_stat(irq_threshold_count);
irq_exit();
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6dd..4b48f251fd39 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void)
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
- add_pda(irq_thermal_count, 1);
+ inc_irq_stat(irq_thermal_count);
irq_exit();
}
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e0..a74af128efc9 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
/*
* Non Fatal Machine Check Exception Reporting
*
- * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
*
* This file contains routines to check for non-fatal MCEs every 15s
*
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..1159e269e596 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
}
static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
#ifdef CONFIG_MTRR_SANITIZER
@@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result {
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static struct res_range __initdata range_new[RANGE_NUM];
static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-static int __init mtrr_cleanup(unsigned address_bits)
+static void __init print_out_mtrr_range_state(void)
{
- unsigned long extra_remove_base, extra_remove_size;
- unsigned long base, size, def, dummy;
- mtrr_type type;
- int nr_range, nr_range_new;
- u64 chunk_size, gran_size;
- unsigned long range_sums, range_sums_new;
- int index_good;
- int num_reg_good;
int i;
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+ mtrr_type type;
- /* extra one for all 0 */
- int num[MTRR_NUM_TYPES + 1];
+ for (i = 0; i < num_var_ranges; i++) {
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
- return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
- def &= 0xff;
- if (def != MTRR_TYPE_UNCACHABLE)
- return 0;
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
- /* get it and store it aside */
- memset(range_state, 0, sizeof(range_state));
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
- range_state[i].base_pfn = base;
- range_state[i].size_pfn = size;
- range_state[i].type = type;
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
}
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+ int i;
+ mtrr_type type;
+ unsigned long size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
/* check entries number */
memset(num, 0, sizeof(num));
@@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
- /* print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
- for (i = 0; i < num_var_ranges; i++) {
- char start_factor = 'K', size_factor = 'K';
- unsigned long start_base, size_base;
+ return 1;
+}
- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
- if (!size_base)
- continue;
+static unsigned long __initdata range_sums;
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size,
+ int i)
+{
+ int num_reg;
+ static struct res_range range_new[RANGE_NUM];
+ static int nr_range_new;
+ unsigned long range_sums_new;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
- size_base = to_size_factor(size_base, &size_factor),
- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
- type = range_state[i].type;
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
- i, start_base, start_factor,
- size_base, size_factor,
- (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
- ((type == MTRR_TYPE_WRPROT) ? "WP" :
- ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
- );
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
}
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+ int i;
+ int num_reg_good;
+ int index_good;
+
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i])
+ num_reg_good = i;
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ return index_good;
+}
+
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long base, size, def, dummy;
+ mtrr_type type;
+ u64 chunk_size, gran_size;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check if we need handle it and can handle it */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
extra_remove_size = 0;
@@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
- int num_reg;
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- debug_print++;
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
- mtrr_gran_size);
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ extra_remove_base, extra_remove_size, i);
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base,
- extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
+ mtrr_print_out_one_result(i);
- i = 0;
- result[i].chunk_sizek = mtrr_chunk_size >> 10;
- result[i].gran_sizek = mtrr_gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print--;
- memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
- char gran_factor;
- unsigned long gran_base;
-
- if (debug_print)
- gran_base = to_size_factor(gran_size >> 10, &gran_factor);
for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
- int num_reg;
- if (debug_print) {
- char chunk_factor;
- unsigned long chunk_base;
-
- chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
- printk(KERN_INFO "\n");
- printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
- gran_base, gran_factor, chunk_base, chunk_factor);
- }
if (i >= NUM_RESULT)
continue;
- /* convert ranges to var ranges state */
- num_reg = x86_setup_var_mtrrs(range, nr_range,
- chunk_size, gran_size);
-
- /* we got new setting in range_state, check it */
- memset(range_new, 0, sizeof(range_new));
- nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
- extra_remove_base, extra_remove_size);
- range_sums_new = sum_ranges(range_new, nr_range_new);
-
- result[i].chunk_sizek = chunk_size >> 10;
- result[i].gran_sizek = gran_size >> 10;
- result[i].num_reg = num_reg;
- if (range_sums < range_sums_new) {
- result[i].lose_cover_sizek =
- (range_sums_new - range_sums) << PSHIFT;
- result[i].bad = 1;
- } else
- result[i].lose_cover_sizek =
- (range_sums - range_sums_new) << PSHIFT;
-
- /* double check it */
- if (!result[i].bad && !result[i].lose_cover_sizek) {
- if (nr_range_new != nr_range ||
- memcmp(range, range_new, sizeof(range)))
- result[i].bad = 1;
+ mtrr_calc_range_state(chunk_size, gran_size,
+ extra_remove_base, extra_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ printk(KERN_INFO "\n");
}
- if (!result[i].bad && (range_sums - range_sums_new <
- min_loss_pfn[num_reg])) {
- min_loss_pfn[num_reg] =
- range_sums - range_sums_new;
- }
i++;
}
}
- /* print out all */
- for (i = 0; i < NUM_RESULT; i++) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
- result[i].bad?"*BAD*":" ",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
- result[i].num_reg, result[i].bad?"-":"",
- lose_base, lose_factor);
- }
-
/* try to find the optimal index */
- if (nr_mtrr_spare_reg >= num_var_ranges)
- nr_mtrr_spare_reg = num_var_ranges - 1;
- num_reg_good = -1;
- for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i])
- num_reg_good = i;
- }
-
- index_good = -1;
- if (num_reg_good != -1) {
- for (i = 0; i < NUM_RESULT; i++) {
- if (!result[i].bad &&
- result[i].num_reg == num_reg_good &&
- !result[i].lose_cover_sizek) {
- index_good = i;
- break;
- }
- }
- }
+ index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- char gran_factor, chunk_factor, lose_factor;
- unsigned long gran_base, chunk_base, lose_base;
-
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
- printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
- gran_base, gran_factor, chunk_base, chunk_factor);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
- result[i].num_reg, lose_base, lose_factor);
+ mtrr_print_out_one_result(i);
+
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- debug_print--;
set_var_mtrr_all(address_bits);
+ printk(KERN_DEBUG "New variable MTRRs\n");
+ print_out_mtrr_range_state();
return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
}
printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
- int nr_range;
u64 total_trim_size;
/* extra one for all 0 */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6bff382094f5..9abd48b22674 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <linux/kprobes.h>
+
#include <asm/apic.h>
#include <asm/intel_arch_perfmon.h>
@@ -336,7 +338,8 @@ static void single_msr_unreserve(void)
release_perfctr_nmi(wd_ops->perfctr);
}
-static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes
+single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/* start the cycle over again */
write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz)
return 1;
}
-static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
/*
* P6 based Pentium M need to re-unmask
@@ -605,7 +608,7 @@ static void p4_unreserve(void)
release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
}
-static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
{
unsigned dummy;
/*
@@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
return hz;
}
-int lapic_wd_event(unsigned nmi_hz)
+int __kprobes lapic_wd_event(unsigned nmi_hz)
{
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
u64 ctr;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index a26c480b9491..01b1244ef1c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
{
if (*pos == 0) /* just in case, cpu 0 is not the first */
*pos = first_cpu(cpu_online_map);
- if ((*pos) < nr_cpu_ids && cpu_online(*pos))
+ else
+ *pos = next_cpu_nr(*pos - 1, cpu_online_map);
+ if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
- *pos = next_cpu(*pos, cpu_online_map);
+ (*pos)++;
return c_start(m, pos);
}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000000..284c399e3234
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
+/*
+ * VMware Detection code.
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dmi.h>
+#include <asm/div64.h>
+#include <asm/vmware.h>
+
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
+#define VMWARE_HYPERVISOR_PORT 0x5658
+
+#define VMWARE_PORT_CMD_GETVERSION 10
+#define VMWARE_PORT_CMD_GETHZ 45
+
+#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
+ __asm__("inl (%%dx)" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "0"(VMWARE_HYPERVISOR_MAGIC), \
+ "1"(VMWARE_PORT_CMD_##cmd), \
+ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
+ "memory");
+
+static inline int __vmware_platform(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
+ return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+}
+
+static unsigned long __vmware_get_tsc_khz(void)
+{
+ uint64_t tsc_hz;
+ uint32_t eax, ebx, ecx, edx;
+
+ VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+
+ if (ebx == UINT_MAX)
+ return 0;
+ tsc_hz = eax | (((uint64_t)ebx) << 32);
+ do_div(tsc_hz, 1000);
+ BUG_ON(tsc_hz >> 32);
+ return tsc_hz;
+}
+
+/*
+ * While checking the dmi string infomation, just checking the product
+ * serial key should be enough, as this will always have a VMware
+ * specific string when running under VMware hypervisor.
+ */
+int vmware_platform(void)
+{
+ if (cpu_has_hypervisor) {
+ unsigned int eax, ebx, ecx, edx;
+ char hyper_vendor_id[13];
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
+ memcpy(hyper_vendor_id + 0, &ebx, 4);
+ memcpy(hyper_vendor_id + 4, &ecx, 4);
+ memcpy(hyper_vendor_id + 8, &edx, 4);
+ hyper_vendor_id[12] = '\0';
+ if (!strcmp(hyper_vendor_id, "VMwareVMware"))
+ return 1;
+ } else if (dmi_available && dmi_name_in_serial("VMware") &&
+ __vmware_platform())
+ return 1;
+
+ return 0;
+}
+
+unsigned long vmware_get_tsc_khz(void)
+{
+ BUG_ON(!vmware_platform());
+ return __vmware_get_tsc_khz();
+}
+
+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+{
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a44d6465991..72cefd1e649b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu)
{
struct device *dev;
- dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
- NULL, "cpu%d", cpu);
+ dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ "cpu%d", cpu);
return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 268553817909..d84a852e4cd7 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -29,34 +29,17 @@
#include <mach_ipi.h>
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct notifier_block *self,
- unsigned long val, void *data)
+static void kdump_nmi_callback(int cpu, struct die_args *args)
{
struct pt_regs *regs;
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
#endif
- int cpu;
- if (val != DIE_NMI_IPI)
- return NOTIFY_OK;
-
- regs = ((struct die_args *)data)->regs;
- cpu = raw_smp_processor_id();
-
- /* Don't do anything if this handler is invoked on crashing cpu.
- * Otherwise, system will completely hang. Crashing cpu can get
- * an NMI if system was initially booted with nmi_watchdog parameter.
- */
- if (cpu == crashing_cpu)
- return NOTIFY_STOP;
- local_irq_disable();
+ regs = args->regs;
#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
@@ -65,54 +48,19 @@ static int crash_nmi_callback(struct notifier_block *self,
}
#endif
crash_save_cpu(regs, cpu);
- disable_local_APIC();
- atomic_dec(&waiting_for_crash_ipi);
- /* Assume hlt works */
- halt();
- for (;;)
- cpu_relax();
-
- return 1;
-}
-static void smp_send_nmi_allbutself(void)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(safe_smp_processor_id(), mask);
- if (!cpus_empty(mask))
- send_IPI_mask(mask, NMI_VECTOR);
+ disable_local_APIC();
}
-static struct notifier_block crash_nmi_nb = {
- .notifier_call = crash_nmi_callback,
-};
-
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
{
- unsigned long msecs;
-
- atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_die_notifier(&crash_nmi_nb))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
- * out the NMI
- */
- wmb();
+ nmi_shootdown_cpus(kdump_nmi_callback);
- smp_send_nmi_allbutself();
-
- msecs = 1000; /* Wait at most a second for the other cpus to stop */
- while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
- mdelay(1);
- msecs--;
- }
-
- /* Leave the nmi callback set */
disable_local_APIC();
}
+
#else
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
{
/* There are no cpus to shootdown */
}
@@ -131,9 +79,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = safe_smp_processor_id();
- nmi_shootdown_cpus();
+ kdump_nmi_shootdown_cpus();
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
static void *kdump_buf_page;
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e90a60ef10c2..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,9 @@
#include <linux/uaccess.h>
#include <linux/io.h>
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2b69994fd3a8..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,14 +6,13 @@
* precise-event based sampling (PEBS).
*
* It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - DS and BTS hardware configuration
+ * - buffer overflow handling (to be done)
* - buffer access
*
- * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
*
*
* Copyright (C) 2007-2008 Intel Corporation.
@@ -21,8 +20,6 @@
*/
-#ifdef CONFIG_X86_DS
-
#include <asm/ds.h>
#include <linux/errno.h>
@@ -30,22 +27,69 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kernel.h>
/*
* The configuration for a particular DS hardware implementation.
*/
struct ds_configuration {
- /* the size of the DS structure in bytes */
- unsigned char sizeof_ds;
- /* the size of one pointer-typed field in the DS structure in bytes;
- this covers the first 8 fields related to buffer management. */
+ /* the name of the configuration */
+ const char *name;
+ /* the size of one pointer-typed field in the DS structure and
+ in the BTS and PEBS buffers in bytes;
+ this covers the first 8 DS fields related to buffer management. */
unsigned char sizeof_field;
/* the size of a BTS/PEBS record in bytes */
unsigned char sizeof_rec[2];
+ /* a series of bit-masks to control various features indexed
+ * by enum ds_feature */
+ unsigned long ctl[dsf_ctl_max];
+};
+static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+
+#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+
+#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
+#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
+#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
+
+#define BTS_CONTROL \
+ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
+ ds_cfg.ctl[dsf_bts_overflow])
+
+
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+ /* the DS context (partially) owned by this tracer */
+ struct ds_context *context;
+ /* the buffer provided on ds_request() and its size in bytes */
+ void *buffer;
+ size_t size;
+};
+
+struct bts_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* the trace including the DS configuration */
+ struct bts_trace trace;
+ /* buffer overflow notification function */
+ bts_ovfl_callback_t ovfl;
};
-static struct ds_configuration ds_cfg;
+struct pebs_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* the trace including the DS configuration */
+ struct pebs_trace trace;
+ /* buffer overflow notification function */
+ pebs_ovfl_callback_t ovfl;
+};
/*
* Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -111,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
+ * Locking is done only for allocating BTS or PEBS resources.
*/
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
-
-/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
- */
-static inline int ds_validate_access(struct ds_context *context,
- enum ds_qualifier qual)
-{
- if (!context)
- return -EPERM;
-
- if (context->owner[qual] == current)
- return 0;
-
- return -EPERM;
-}
+static DEFINE_SPINLOCK(ds_lock);
/*
@@ -152,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
* >0 number of per-thread tracers
* <0 number of per-cpu tracers
*
- * The below functions to get and put tracers and to check the
- * allocation type require the ds_lock to be held by the caller.
- *
* Tracers essentially gives the number of ds contexts for a certain
* type of allocation.
*/
-static long tracers;
+static atomic_t tracers = ATOMIC_INIT(0);
static inline void get_tracer(struct task_struct *task)
{
- tracers += (task ? 1 : -1);
+ if (task)
+ atomic_inc(&tracers);
+ else
+ atomic_dec(&tracers);
}
static inline void put_tracer(struct task_struct *task)
{
- tracers -= (task ? 1 : -1);
+ if (task)
+ atomic_dec(&tracers);
+ else
+ atomic_inc(&tracers);
}
static inline int check_tracer(struct task_struct *task)
{
- return (task ? (tracers >= 0) : (tracers <= 0));
+ return task ?
+ (atomic_read(&tracers) >= 0) :
+ (atomic_read(&tracers) <= 0);
}
@@ -185,100 +211,83 @@ static inline int check_tracer(struct task_struct *task)
*
* Contexts are use-counted. They are allocated on first access and
* deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- * requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- * non-existing context indicates an error. It acquires and releases
- * the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
- */
-static DEFINE_PER_CPU(struct ds_context *, system_context);
-
-#define this_system_context per_cpu(system_context, smp_processor_id())
-
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
*/
-static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
- struct ds_context *context;
-
- spin_lock(&ds_lock);
+struct ds_context {
+ /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+ unsigned char ds[MAX_SIZEOF_DS];
+ /* the owner of the BTS and PEBS configuration, respectively */
+ struct bts_tracer *bts_master;
+ struct pebs_tracer *pebs_master;
+ /* use count */
+ unsigned long count;
+ /* a pointer to the context location inside the thread_struct
+ * or the per_cpu context array */
+ struct ds_context **this;
+ /* a pointer to the task owning this context, or NULL, if the
+ * context is owned by a cpu */
+ struct task_struct *task;
+};
- context = (task ? task->thread.ds_ctx : this_system_context);
- if (context)
- context->count++;
+static DEFINE_PER_CPU(struct ds_context *, system_context_array);
- spin_unlock(&ds_lock);
+#define system_context per_cpu(system_context_array, smp_processor_id())
- return context;
-}
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- *
- * pre: requires ds_lock to be held
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
+static inline struct ds_context *ds_get_context(struct task_struct *task)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &this_system_context);
- struct ds_context *context = *p_context;
-
- if (!context) {
- context = kzalloc(sizeof(*context), GFP_KERNEL);
-
- if (!context)
- return NULL;
+ (task ? &task->thread.ds_ctx : &system_context);
+ struct ds_context *context = NULL;
+ struct ds_context *new_context = NULL;
+ unsigned long irq;
+
+ /* Chances are small that we already have a context. */
+ new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
+ if (!new_context)
+ return NULL;
- context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
- if (!context->ds) {
- kfree(context);
- return NULL;
- }
+ spin_lock_irqsave(&ds_lock, irq);
- *p_context = context;
+ context = *p_context;
+ if (!context) {
+ context = new_context;
context->this = p_context;
context->task = task;
+ context->count = 0;
if (task)
set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
if (!task || (task == current))
- wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
- get_tracer(task);
+ *p_context = context;
}
context->count++;
+ spin_unlock_irqrestore(&ds_lock, irq);
+
+ if (context != new_context)
+ kfree(new_context);
+
return context;
}
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
static inline void ds_put_context(struct ds_context *context)
{
+ unsigned long irq;
+
if (!context)
return;
- spin_lock(&ds_lock);
+ spin_lock_irqsave(&ds_lock, irq);
- if (--context->count)
- goto out;
+ if (--context->count) {
+ spin_unlock_irqrestore(&ds_lock, irq);
+ return;
+ }
*(context->this) = NULL;
@@ -288,132 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
if (!context->task || (context->task == current))
wrmsrl(MSR_IA32_DS_AREA, 0);
- put_tracer(context->task);
+ spin_unlock_irqrestore(&ds_lock, irq);
- /* free any leftover buffers from tracers that did not
- * deallocate them properly. */
- kfree(context->buffer[ds_bts]);
- kfree(context->buffer[ds_pebs]);
- kfree(context->ds);
kfree(context);
- out:
- spin_unlock(&ds_lock);
}
/*
- * Handle a buffer overflow
+ * Call the tracer's callback on a buffer overflow.
*
- * task: the task whose buffers are overflowing;
- * NULL for a buffer overflow on the current cpu
* context: the ds context
* qual: the buffer type
*/
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
- enum ds_qualifier qual)
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
{
- if (!context)
- return;
-
- if (context->callback[qual])
- (*context->callback[qual])(task);
-
- /* todo: do some more overflow handling */
+ switch (qual) {
+ case ds_bts:
+ if (context->bts_master &&
+ context->bts_master->ovfl)
+ context->bts_master->ovfl(context->bts_master);
+ break;
+ case ds_pebs:
+ if (context->pebs_master &&
+ context->pebs_master->ovfl)
+ context->pebs_master->ovfl(context->pebs_master);
+ break;
+ }
}
/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
+ * Write raw data into the BTS or PEBS buffer.
*
- * Returns the buffer, if successful;
- * NULL, if out of memory or rlimit exceeded.
+ * The remainder of any partially written record is zeroed out.
*
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
+ * context: the DS context
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
*/
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+ const void *record, size_t size)
{
- unsigned long rlim, vm, pgsz;
- void *buffer;
+ int bytes_written = 0;
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ if (!record)
+ return -EINVAL;
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ while (size) {
+ unsigned long base, index, end, write_end, int_th;
+ unsigned long write_size, adj_write_size;
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ /*
+ * write as much as possible without producing an
+ * overflow interrupt.
+ *
+ * interrupt_threshold must either be
+ * - bigger than absolute_maximum or
+ * - point to a record between buffer_base and absolute_maximum
+ *
+ * index points to a valid record.
+ */
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+ int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- return NULL;
+ write_end = min(end, int_th);
+
+ /* if we are already beyond the interrupt threshold,
+ * we fill the entire buffer */
+ if (write_end <= index)
+ write_end = end;
+
+ if (write_end <= index)
+ break;
+
+ write_size = min((unsigned long) size, write_end - index);
+ memcpy((void *)index, record, write_size);
+
+ record = (const char *)record + write_size;
+ size -= write_size;
+ bytes_written += write_size;
+
+ adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+ adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+ /* zero out trailing bytes */
+ memset((char *)index + write_size, 0,
+ adj_write_size - write_size);
+ index += adj_write_size;
+
+ if (index >= end)
+ index = base;
+ ds_set(context->ds, qual, ds_index, index);
+
+ if (index >= int_th)
+ ds_overflow(context, qual);
+ }
+
+ return bytes_written;
+}
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ *
+ * We use two levels of abstraction:
+ * - the raw data level defined here
+ * - an arch-independent level defined in ds.h
+ */
+
+enum bts_field {
+ bts_from,
+ bts_to,
+ bts_flags,
- current->mm->total_vm += pgsz;
- current->mm->locked_vm += pgsz;
+ bts_qual = bts_from,
+ bts_jiffies = bts_to,
+ bts_pid = bts_flags,
- if (pages)
- *pages = pgsz;
+ bts_qual_mask = (bts_qual_max - 1),
+ bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+};
- return buffer;
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+ base += (ds_cfg.sizeof_field * field);
+ return *(unsigned long *)base;
}
-static int ds_request(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- struct ds_context *context;
- unsigned long buffer, adj;
- const unsigned long alignment = (1 << 3);
- int error = 0;
+ base += (ds_cfg.sizeof_field * field);;
+ (*(unsigned long *)base) = val;
+}
- if (!ds_cfg.sizeof_ds)
- return -EOPNOTSUPP;
- /* we require some space to do alignment adjustments below */
- if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+/*
+ * The raw BTS data is architecture dependent.
+ *
+ * For higher-level users, we give an arch-independent view.
+ * - ds.h defines struct bts_struct
+ * - bts_read translates one raw bts record into a bts_struct
+ * - bts_write translates one bts_struct into the raw format and
+ * writes it into the top of the parameter tracer's buffer.
+ *
+ * return: bytes read/written on success; -Eerrno, otherwise
+ */
+static int bts_read(struct bts_tracer *tracer, const void *at,
+ struct bts_struct *out)
+{
+ if (!tracer)
return -EINVAL;
- /* buffer overflow notification is not yet implemented */
- if (ovfl)
- return -EOPNOTSUPP;
+ if (at < tracer->trace.ds.begin)
+ return -EINVAL;
+ if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
+ return -EINVAL;
- spin_lock(&ds_lock);
+ memset(out, 0, sizeof(*out));
+ if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
+ out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
+ out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
+ out->variant.timestamp.pid = bts_get(at, bts_pid);
+ } else {
+ out->qualifier = bts_branch;
+ out->variant.lbr.from = bts_get(at, bts_from);
+ out->variant.lbr.to = bts_get(at, bts_to);
+
+ if (!out->variant.lbr.from && !out->variant.lbr.to)
+ out->qualifier = bts_invalid;
+ }
- if (!check_tracer(task))
- return -EPERM;
+ return ds_cfg.sizeof_rec[ds_bts];
+}
- error = -ENOMEM;
- context = ds_alloc_context(task);
- if (!context)
- goto out_unlock;
+static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
+{
+ unsigned char raw[MAX_SIZEOF_BTS];
- error = -EALREADY;
- if (context->owner[qual] == current)
- goto out_unlock;
- error = -EPERM;
- if (context->owner[qual] != NULL)
- goto out_unlock;
- context->owner[qual] = current;
+ if (!tracer)
+ return -EINVAL;
+
+ if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
+ return -EOVERFLOW;
+
+ switch (in->qualifier) {
+ case bts_invalid:
+ bts_set(raw, bts_from, 0);
+ bts_set(raw, bts_to, 0);
+ bts_set(raw, bts_flags, 0);
+ break;
+ case bts_branch:
+ bts_set(raw, bts_from, in->variant.lbr.from);
+ bts_set(raw, bts_to, in->variant.lbr.to);
+ bts_set(raw, bts_flags, 0);
+ break;
+ case bts_task_arrives:
+ case bts_task_departs:
+ bts_set(raw, bts_qual, (bts_escape | in->qualifier));
+ bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
+ bts_set(raw, bts_pid, in->variant.timestamp.pid);
+ break;
+ default:
+ return -EINVAL;
+ }
- spin_unlock(&ds_lock);
+ return ds_write(tracer->ds.context, ds_bts, raw,
+ ds_cfg.sizeof_rec[ds_bts]);
+}
- error = -ENOMEM;
- if (!base) {
- base = ds_allocate_buffer(size, &context->pages[qual]);
- if (!base)
- goto out_release;
+static void ds_write_config(struct ds_context *context,
+ struct ds_trace *cfg, enum ds_qualifier qual)
+{
+ unsigned char *ds = context->ds;
- context->buffer[qual] = base;
- }
- error = 0;
+ ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
+ ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
+ ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
+ ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
+}
- context->callback[qual] = ovfl;
+static void ds_read_config(struct ds_context *context,
+ struct ds_trace *cfg, enum ds_qualifier qual)
+{
+ unsigned char *ds = context->ds;
+
+ cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
+ cfg->top = (void *)ds_get(ds, qual, ds_index);
+ cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
+ cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
+}
+
+static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
+ void *base, size_t size, size_t ith,
+ unsigned int flags) {
+ unsigned long buffer, adj;
/* adjust the buffer address and size to meet alignment
* constraints:
@@ -425,395 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
*/
buffer = (unsigned long)base;
- adj = ALIGN(buffer, alignment) - buffer;
+ adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
buffer += adj;
size -= adj;
- size /= ds_cfg.sizeof_rec[qual];
- size *= ds_cfg.sizeof_rec[qual];
-
- ds_set(context->ds, qual, ds_buffer_base, buffer);
- ds_set(context->ds, qual, ds_index, buffer);
- ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+ trace->n = size / ds_cfg.sizeof_rec[qual];
+ trace->size = ds_cfg.sizeof_rec[qual];
- if (ovfl) {
- /* todo: select a suitable interrupt threshold */
- } else
- ds_set(context->ds, qual,
- ds_interrupt_threshold, buffer + size + 1);
+ size = (trace->n * trace->size);
- /* we keep the context until ds_release */
- return error;
-
- out_release:
- context->owner[qual] = NULL;
- ds_put_context(context);
- return error;
+ trace->begin = (void *)buffer;
+ trace->top = trace->begin;
+ trace->end = (void *)(buffer + size);
+ /* The value for 'no threshold' is -1, which will set the
+ * threshold outside of the buffer, just like we want it.
+ */
+ trace->ith = (void *)(buffer + size - ith);
- out_unlock:
- spin_unlock(&ds_lock);
- ds_put_context(context);
- return error;
+ trace->flags = flags;
}
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
-{
- return ds_request(task, base, size, ovfl, ds_bts);
-}
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
-{
- return ds_request(task, base, size, ovfl, ds_pebs);
-}
-
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
+ enum ds_qualifier qual, struct task_struct *task,
+ void *base, size_t size, size_t th, unsigned int flags)
{
struct ds_context *context;
int error;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ error = -EINVAL;
+ if (!base)
goto out;
- kfree(context->buffer[qual]);
- context->buffer[qual] = NULL;
-
- current->mm->total_vm -= context->pages[qual];
- current->mm->locked_vm -= context->pages[qual];
- context->pages[qual] = 0;
- context->owner[qual] = NULL;
-
- /*
- * we put the context twice:
- * once for the ds_get_context
- * once for the corresponding ds_request
- */
- ds_put_context(context);
- out:
- ds_put_context(context);
- return error;
-}
+ /* we require some space to do alignment adjustments below */
+ error = -EINVAL;
+ if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ goto out;
-int ds_release_bts(struct task_struct *task)
-{
- return ds_release(task, ds_bts);
-}
+ if (th != (size_t)-1) {
+ th *= ds_cfg.sizeof_rec[qual];
-int ds_release_pebs(struct task_struct *task)
-{
- return ds_release(task, ds_pebs);
-}
+ error = -EINVAL;
+ if (size <= th)
+ goto out;
+ }
-static int ds_get_index(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, index;
- int error;
+ tracer->buffer = base;
+ tracer->size = size;
+ error = -ENOMEM;
context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ if (!context)
goto out;
+ tracer->context = context;
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
+ ds_init_ds_trace(trace, qual, base, size, th, flags);
- error = ((index - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
+ error = 0;
out:
- ds_put_context(context);
return error;
}
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
- return ds_get_index(task, pos, ds_bts);
-}
-
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
-{
- return ds_get_index(task, pos, ds_pebs);
-}
-
-static int ds_get_end(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, end;
+ struct bts_tracer *tracer;
+ unsigned long irq;
int error;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ error = -EOPNOTSUPP;
+ if (!ds_cfg.ctl[dsf_bts])
goto out;
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
- error = ((end - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
-}
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
+ goto out;
+ tracer->ovfl = ovfl;
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
-{
- return ds_get_end(task, pos, ds_bts);
-}
+ error = ds_request(&tracer->ds, &tracer->trace.ds,
+ ds_bts, task, base, size, th, flags);
+ if (error < 0)
+ goto out_tracer;
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
-{
- return ds_get_end(task, pos, ds_pebs);
-}
-static int ds_access(struct task_struct *task, size_t index,
- const void **record, enum ds_qualifier qual)
-{
- struct ds_context *context;
- unsigned long base, idx;
- int error;
+ spin_lock_irqsave(&ds_lock, irq);
- if (!record)
- return -EINVAL;
+ error = -EPERM;
+ if (!check_tracer(task))
+ goto out_unlock;
+ get_tracer(task);
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ error = -EPERM;
+ if (tracer->ds.context->bts_master)
+ goto out_put_tracer;
+ tracer->ds.context->bts_master = tracer;
- base = ds_get(context->ds, qual, ds_buffer_base);
- idx = base + (index * ds_cfg.sizeof_rec[qual]);
+ spin_unlock_irqrestore(&ds_lock, irq);
- error = -EINVAL;
- if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
- goto out;
- *record = (const void *)idx;
- error = ds_cfg.sizeof_rec[qual];
- out:
- ds_put_context(context);
- return error;
-}
+ tracer->trace.read = bts_read;
+ tracer->trace.write = bts_write;
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
-{
- return ds_access(task, index, record, ds_bts);
-}
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_resume_bts(tracer);
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
-{
- return ds_access(task, index, record, ds_pebs);
+ return tracer;
+
+ out_put_tracer:
+ put_tracer(task);
+ out_unlock:
+ spin_unlock_irqrestore(&ds_lock, irq);
+ ds_put_context(tracer->ds.context);
+ out_tracer:
+ kfree(tracer);
+ out:
+ return ERR_PTR(error);
}
-static int ds_write(struct task_struct *task, const void *record, size_t size,
- enum ds_qualifier qual, int force)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
- struct ds_context *context;
+ struct pebs_tracer *tracer;
+ unsigned long irq;
int error;
- if (!record)
- return -EINVAL;
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
- error = -EPERM;
- context = ds_get_context(task);
- if (!context)
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
goto out;
+ tracer->ovfl = ovfl;
- if (!force) {
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
- }
+ error = ds_request(&tracer->ds, &tracer->trace.ds,
+ ds_pebs, task, base, size, th, flags);
+ if (error < 0)
+ goto out_tracer;
- error = 0;
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
+ spin_lock_irqsave(&ds_lock, irq);
- /*
- * write as much as possible without producing an
- * overflow interrupt.
- *
- * interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+ error = -EPERM;
+ if (!check_tracer(task))
+ goto out_unlock;
+ get_tracer(task);
- write_end = min(end, int_th);
+ error = -EPERM;
+ if (tracer->ds.context->pebs_master)
+ goto out_put_tracer;
+ tracer->ds.context->pebs_master = tracer;
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
- if (write_end <= index)
- write_end = end;
+ spin_unlock_irqrestore(&ds_lock, irq);
- if (write_end <= index)
- goto out;
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_resume_pebs(tracer);
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
+ return tracer;
- record = (const char *)record + write_size;
- size -= write_size;
- error += write_size;
+ out_put_tracer:
+ put_tracer(task);
+ out_unlock:
+ spin_unlock_irqrestore(&ds_lock, irq);
+ ds_put_context(tracer->ds.context);
+ out_tracer:
+ kfree(tracer);
+ out:
+ return ERR_PTR(error);
+}
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
+void ds_release_bts(struct bts_tracer *tracer)
+{
+ if (!tracer)
+ return;
- /* zero out trailing bytes */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
+ ds_suspend_bts(tracer);
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
+ WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
+ tracer->ds.context->bts_master = NULL;
- if (index >= int_th)
- ds_overflow(task, context, qual);
- }
+ put_tracer(tracer->ds.context->task);
+ ds_put_context(tracer->ds.context);
- out:
- ds_put_context(context);
- return error;
+ kfree(tracer);
}
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+void ds_suspend_bts(struct bts_tracer *tracer)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+ struct task_struct *task;
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 0);
-}
+ if (!tracer)
+ return;
-int ds_unchecked_write_bts(struct task_struct *task,
- const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+ task = tracer->ds.context->task;
-int ds_unchecked_write_pebs(struct task_struct *task,
- const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+ if (!task || (task == current))
+ update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+
+ if (task) {
+ task->thread.debugctlmsr &= ~BTS_CONTROL;
+
+ if (!task->thread.debugctlmsr)
+ clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+ }
}
-static int ds_reset_or_clear(struct task_struct *task,
- enum ds_qualifier qual, int clear)
+void ds_resume_bts(struct bts_tracer *tracer)
{
- struct ds_context *context;
- unsigned long base, end;
- int error;
+ struct task_struct *task;
+ unsigned long control;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ if (!tracer)
+ return;
- base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ task = tracer->ds.context->task;
- if (clear)
- memset((void *)base, 0, end - base);
+ control = ds_cfg.ctl[dsf_bts];
+ if (!(tracer->trace.ds.flags & BTS_KERNEL))
+ control |= ds_cfg.ctl[dsf_bts_kernel];
+ if (!(tracer->trace.ds.flags & BTS_USER))
+ control |= ds_cfg.ctl[dsf_bts_user];
- ds_set(context->ds, qual, ds_index, base);
+ if (task) {
+ task->thread.debugctlmsr |= control;
+ set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+ }
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ if (!task || (task == current))
+ update_debugctlmsr(get_debugctlmsr() | control);
}
-int ds_reset_bts(struct task_struct *task)
+void ds_release_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+ if (!tracer)
+ return;
+
+ ds_suspend_pebs(tracer);
+
+ WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
+ tracer->ds.context->pebs_master = NULL;
+
+ put_tracer(tracer->ds.context->task);
+ ds_put_context(tracer->ds.context);
+
+ kfree(tracer);
}
-int ds_reset_pebs(struct task_struct *task)
+void ds_suspend_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+
}
-int ds_clear_bts(struct task_struct *task)
+void ds_resume_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+
}
-int ds_clear_pebs(struct task_struct *task)
+const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+ if (!tracer)
+ return NULL;
+
+ ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ return &tracer->trace;
}
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
{
- struct ds_context *context;
- int error;
+ if (!tracer)
+ return NULL;
+
+ ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ tracer->trace.reset_value =
+ *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+ return &tracer->trace;
+}
- if (!value)
+int ds_reset_bts(struct bts_tracer *tracer)
+{
+ if (!tracer)
return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ tracer->trace.ds.top = tracer->trace.ds.begin;
- *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+ ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ (unsigned long)tracer->trace.ds.top);
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
}
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_reset_pebs(struct pebs_tracer *tracer)
{
- struct ds_context *context;
- int error;
+ if (!tracer)
+ return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ tracer->trace.ds.top = tracer->trace.ds.begin;
- *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ (unsigned long)tracer->trace.ds.top);
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
+}
+
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+{
+ if (!tracer)
+ return -EINVAL;
+
+ *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+ return 0;
}
-static const struct ds_configuration ds_cfg_var = {
- .sizeof_ds = sizeof(long) * 12,
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
- .sizeof_rec[ds_pebs] = sizeof(long) * 10
+static const struct ds_configuration ds_cfg_netburst = {
+ .name = "netburst",
+ .ctl[dsf_bts] = (1 << 2) | (1 << 3),
+ .ctl[dsf_bts_kernel] = (1 << 5),
+ .ctl[dsf_bts_user] = (1 << 6),
+
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10,
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18,
+#endif
};
-static const struct ds_configuration ds_cfg_64 = {
- .sizeof_ds = 8 * 12,
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 10
+static const struct ds_configuration ds_cfg_pentium_m = {
+ .name = "pentium m",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10,
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18,
+#endif
+};
+static const struct ds_configuration ds_cfg_core2 = {
+ .name = "core 2",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+ .ctl[dsf_bts_kernel] = (1 << 9),
+ .ctl[dsf_bts_user] = (1 << 10),
+
+ .sizeof_field = 8,
+ .sizeof_rec[ds_bts] = 8 * 3,
+ .sizeof_rec[ds_pebs] = 8 * 18,
};
-static inline void
+static void
ds_configure(const struct ds_configuration *cfg)
{
+ memset(&ds_cfg, 0, sizeof(ds_cfg));
ds_cfg = *cfg;
+
+ printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+
+ if (!cpu_has_bts) {
+ ds_cfg.ctl[dsf_bts] = 0;
+ printk(KERN_INFO "[ds] bts not available\n");
+ }
+ if (!cpu_has_pebs)
+ printk(KERN_INFO "[ds] pebs not available\n");
+
+ WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -821,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
+ case 0 ... 0xC:
+ /* sorry, don't know about them */
+ break;
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_var);
+ ds_configure(&ds_cfg_pentium_m);
break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- ds_configure(&ds_cfg_64);
- break;
- default:
- /* sorry, don't know about them */
+ default: /* Core2, Atom, ... */
+ ds_configure(&ds_cfg_core2);
break;
}
break;
@@ -839,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_var);
+ ds_configure(&ds_cfg_netburst);
break;
default:
/* sorry, don't know about them */
@@ -852,13 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
}
}
-void ds_free(struct ds_context *context)
+/*
+ * Change the DS configuration from tracing prev to tracing next.
+ */
+void ds_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ struct ds_context *prev_ctx = prev->thread.ds_ctx;
+ struct ds_context *next_ctx = next->thread.ds_ctx;
+
+ if (prev_ctx) {
+ update_debugctlmsr(0);
+
+ if (prev_ctx->bts_master &&
+ (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+ struct bts_struct ts = {
+ .qualifier = bts_task_departs,
+ .variant.timestamp.jiffies = jiffies_64,
+ .variant.timestamp.pid = prev->pid
+ };
+ bts_write(prev_ctx->bts_master, &ts);
+ }
+ }
+
+ if (next_ctx) {
+ if (next_ctx->bts_master &&
+ (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+ struct bts_struct ts = {
+ .qualifier = bts_task_arrives,
+ .variant.timestamp.jiffies = jiffies_64,
+ .variant.timestamp.pid = next->pid
+ };
+ bts_write(next_ctx->bts_master, &ts);
+ }
+
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
+ }
+
+ update_debugctlmsr(next->thread.debugctlmsr);
+}
+
+void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+{
+ clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
+ tsk->thread.ds_ctx = NULL;
+}
+
+void ds_exit_thread(struct task_struct *tsk)
{
- /* This is called when the task owning the parameter context
- * is dying. There should not be any user of that context left
- * to disturb us, anymore. */
- unsigned long leftovers = context->count;
- while (leftovers--)
- ds_put_context(context);
+ WARN_ON(tsk->thread.ds_ctx);
}
-#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..6b1f6f6f8661
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+ printk(" [<%p>] %s%pS\n", (void *) address,
+ reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{
+ struct task_struct *task = tinfo->task;
+ unsigned long ret_addr;
+ int index = task->curr_ret_stack;
+
+ if (addr != (unsigned long)return_to_handler)
+ return;
+
+ if (!task->ret_stack || index < *graph)
+ return;
+
+ index -= *graph;
+ ret_addr = task->ret_stack[index].ret;
+
+ ops->address(data, ret_addr, 1);
+
+ (*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *t = tinfo;
+ if (end) {
+ if (p < end && p >= (end-THREAD_SIZE))
+ return 1;
+ else
+ return 0;
+ }
+ return p > t && p < t + THREAD_SIZE - size;
+}
+
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + sizeof(long)) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ }
+ stack++;
+ }
+ return bp;
+}
+
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk("%s <%s> ", (char *)data, name);
+ return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk(data);
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ printk("%sCall Trace:\n", log_lvl);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ get_bp(bp);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!__raw_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ __raw_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ if (regs && kexec_should_crash(current))
+ crash_kexec(regs);
+
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ oops_exit();
+
+ if (!signr)
+ return;
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+ do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+ unsigned short ss;
+ unsigned long sp;
+#endif
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ show_registers(regs);
+#ifdef CONFIG_X86_32
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->ip, 1);
+ printk(" RSP <%016lx>\n", regs->sp);
+#endif
+ return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ if (!user_mode_vm(regs))
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ flags = oops_begin();
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ oops_end(flags, regs, 0);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+ nmi_exit();
+ local_irq_enable();
+ do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 201ee359a1a9..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -13,72 +13,18 @@
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
+#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
+#include "dumpstack.h"
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
+ int graph = 0;
+
if (!task)
task = current;
@@ -106,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops, data, NULL);
+ bp = print_context_stack(context, stack, bp, ops,
+ data, NULL, &graph);
stack = (unsigned long *)context->previous_esp;
if (!stack)
@@ -118,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
@@ -195,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
{
@@ -282,166 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- unsigned long flags;
-
- oops_enter();
-
- if (die_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die_lock);
- die_owner = smp_processor_id();
- die_nest_count = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
- die_nest_count++;
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- bust_spinlocks(0);
- die_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned short ss;
- unsigned long sp;
-
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
- return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (die_nest_count < 3) {
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- } else {
- printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
- }
-
- oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out:
- */
- bust_spinlocks(1);
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (do_panic)
- panic("Non maskable interrupt");
- console_silent();
- spin_unlock(&nmi_print_lock);
- bust_spinlocks(0);
-
- /*
- * If we are in kernel we are probably nested up pretty bad
- * and might aswell get out now while we still can:
- */
- if (!user_mode_vm(regs)) {
- current->thread.trap_no = 2;
- crash_kexec(regs);
- }
-
- do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 086cc8118e39..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -13,22 +13,11 @@
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
+#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
unsigned *usedp, char **idp)
@@ -112,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -165,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
+ int graph = 0;
if (!task)
task = current;
@@ -205,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
break;
bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end);
+ data, estack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
@@ -224,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (ops->stack(data, "IRQ") < 0)
break;
bp = print_context_stack(tinfo, stack, bp,
- ops, data, irqstack_end);
+ ops, data, irqstack_end, &graph);
/*
* We link to the next stack (which would be
* the process stack normally) the last
@@ -242,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
/*
* This handles the process stack:
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
put_cpu();
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
@@ -341,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
void show_registers(struct pt_regs *regs)
{
int i;
@@ -428,146 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- int cpu;
- unsigned long flags;
-
- oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- __raw_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
- console_verbose();
- bust_spinlocks(1);
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- die_owner = -1;
- bust_spinlocks(0);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
- if (!regs) {
- oops_exit();
- return;
- }
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- add_taint(TAINT_DIE);
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- flags = oops_begin();
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- oops_end(flags, NULL, SIGBUS);
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 78e642feac30..65a13943e098 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -677,22 +677,6 @@ struct early_res {
};
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
-#endif
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
-#endif
{}
};
@@ -1282,25 +1266,25 @@ void __init e820_reserve_resources(void)
e820_res = res;
for (i = 0; i < e820.nr_map; i++) {
end = e820.map[i].addr + e820.map[i].size - 1;
-#ifndef CONFIG_RESOURCES_64BIT
- if (end > 0x100000000ULL) {
+ if (end != (resource_size_t)end) {
res++;
continue;
}
-#endif
res->name = e820_type_to_string(e820.map[i].type);
res->start = e820.map[i].addr;
res->end = end;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->flags = IORESOURCE_MEM;
/*
* don't register the region that could be conflicted with
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
+ if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+ res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
+ }
res++;
}
@@ -1320,7 +1304,7 @@ void __init e820_reserve_resources_late(void)
res = e820_res;
for (i = 0; i < e820.nr_map; i++) {
if (!res->parent && res->end)
- reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
+ insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 733c4f8d42ea..744aa7fc49d5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -95,7 +96,8 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
-static u32 ati_ixp4x0_rev(int num, int slot, int func)
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
u8 b;
@@ -115,7 +117,6 @@ static u32 ati_ixp4x0_rev(int num, int slot, int func)
static void __init ati_bugs(int num, int slot, int func)
{
-#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
u32 d;
u8 b;
@@ -138,21 +139,54 @@ static void __init ati_bugs(int num, int slot, int func)
printk(KERN_INFO "If you got timer trouble "
"try acpi_use_timer_override\n");
}
-#endif
}
-#ifdef CONFIG_DMAR
-static void __init intel_g33_dmar(int num, int slot, int func)
+static u32 __init ati_sbx00_rev(int num, int slot, int func)
{
- struct acpi_table_header *dmar_tbl;
- acpi_status status;
+ u32 old, d;
+
+ d = read_pci_config(num, slot, func, 0x70);
+ old = d;
+ d &= ~(1<<8);
+ write_pci_config(num, slot, func, 0x70, d);
+ d = read_pci_config(num, slot, func, 0x8);
+ d &= 0xff;
+ write_pci_config(num, slot, func, 0x70, old);
- status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
- if (ACPI_SUCCESS(status)) {
- printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
- dmar_disabled = 1;
+ return d;
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+ u32 d, rev;
+
+ if (acpi_use_timer_override)
+ return;
+
+ rev = ati_sbx00_rev(num, slot, func);
+ if (rev > 0x13)
+ return;
+
+ /* check for IRQ0 interrupt swap */
+ d = read_pci_config(num, slot, func, 0x64);
+ if (!(d & (1<<14)))
+ acpi_skip_timer_override = 1;
+
+ if (acpi_skip_timer_override) {
+ printk(KERN_INFO "SB600 revision 0x%x\n", rev);
+ printk(KERN_INFO "Ignoring ACPI timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
}
}
+#else
+static void __init ati_bugs(int num, int slot, int func)
+{
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+}
#endif
#define QFLAG_APPLY_ONCE 0x1
@@ -176,10 +210,8 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
-#ifdef CONFIG_DMAR
- { PCI_VENDOR_ID_INTEL, 0x29c0,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
-#endif
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d3834..23b138e31e9c 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
};
#endif
-/* Console interface to a host file on AMD's SimNow! */
-
-static int simnow_fd;
-
-enum {
- MAGIC1 = 0xBACCD00A,
- MAGIC2 = 0xCA110000,
- XOPEN = 5,
- XWRITE = 4,
-};
-
-static noinline long simnow(long cmd, long a, long b, long c)
-{
- long ret;
-
- asm volatile("cpuid" :
- "=a" (ret) :
- "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
- return ret;
-}
-
-static void __init simnow_init(char *str)
-{
- char *fn = "klog";
-
- if (*str == '=')
- fn = ++str;
- /* error ignored */
- simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
-}
-
-static void simnow_write(struct console *con, const char *s, unsigned n)
-{
- simnow(XWRITE, simnow_fd, (unsigned long)s, n);
-}
-
-static struct console simnow_console = {
- .name = "simnow",
- .write = simnow_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
static int __initdata early_console_initialized;
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
max_ypos = boot_params.screen_info.orig_video_lines;
current_ypos = boot_params.screen_info.orig_y;
early_console = &early_vga_console;
- } else if (!strncmp(buf, "simnow", 6)) {
- simnow_init(buf + 6);
- early_console = &simnow_console;
- keep_early = 1;
#ifdef CONFIG_EARLY_PRINTK_DBGP
} else if (!strncmp(buf, "dbgp", 4)) {
if (early_dbgp_init(buf+4) < 0)
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 945a31cdd81f..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
efi.smbios = config_tables[i].table;
printk(" SMBIOS=0x%lx ", config_tables[i].table);
} else if (!efi_guidcmp(config_tables[i].guid,
+ UV_SYSTEM_TABLE_GUID)) {
+ efi.uv_systab = config_tables[i].table;
+ printk(" UVsystab=0x%lx ", config_tables[i].table);
+ } else if (!efi_guidcmp(config_tables[i].guid,
HCDP_TABLE_GUID)) {
efi.hcdp = config_tables[i].table;
printk(" HCDP=0x%lx ", config_tables[i].table);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index b21fbfaffe39..d6f0490a7391 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -619,28 +619,37 @@ END(syscall_badsys)
27:;
/*
- * Build the entry stubs and pointer table with
- * some assembler magic.
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
*/
-.section .rodata,"a"
+.section .init.rodata,"a"
ENTRY(interrupt)
.text
-
+ .p2align 5
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
RING0_INT_FRAME
-vector=0
-.rept NR_IRQS
- ALIGN
- .if vector
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ .balign 32
+ .rept 7
+ .if vector < NR_VECTORS
+ .if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl $~(vector)
+ .endif
+1: pushl $(~vector+0x80) /* Note: always in signed byte range */
CFI_ADJUST_CFA_OFFSET 4
- jmp common_interrupt
- .previous
+ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ jmp 2f
+ .endif
+ .previous
.long 1b
- .text
+ .text
vector=vector+1
+ .endif
+ .endr
+2: jmp common_interrupt
.endr
END(irq_entries_start)
@@ -652,8 +661,9 @@ END(interrupt)
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
*/
- ALIGN
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
SAVE_ALL
TRACE_IRQS_OFF
movl %esp,%eax
@@ -678,65 +688,6 @@ ENDPROC(name)
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
-KPROBE_ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %fs's slot on the stack */
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(page_fault)
-
ENTRY(coprocessor_error)
RING0_INT_FRAME
pushl $0
@@ -767,140 +718,6 @@ ENTRY(device_not_available)
CFI_ENDPROC
END(device_not_available)
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
- CFI_REL_OFFSET eip, 0
-
-KPROBE_ENTRY(debug)
- RING0_INT_FRAME
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_nocheck_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-KPROBE_END(nmi)
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
@@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
END(native_irq_enable_sysexit)
#endif
-KPROBE_ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(int3)
-
ENTRY(overflow)
RING0_INT_FRAME
pushl $0
@@ -993,14 +797,6 @@ ENTRY(stack_segment)
CFI_ENDPROC
END(stack_segment)
-KPROBE_ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-KPROBE_END(general_protection)
-
ENTRY(alignment_check)
RING0_EC_FRAME
pushl $do_alignment_check
@@ -1024,7 +820,7 @@ ENTRY(machine_check)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
- pushl $do_machine_check
+ pushl machine_check_vector
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
@@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper)
push %eax
CFI_ADJUST_CFA_OFFSET 4
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
@@ -1149,28 +946,17 @@ ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- popl %edx
- popl %ecx
- popl %eax
-
ret
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
pushl %eax
pushl %ecx
pushl %edx
@@ -1185,6 +971,11 @@ ftrace_call:
popl %edx
popl %ecx
popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -1194,8 +985,18 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
.globl ftrace_stub
ftrace_stub:
ret
@@ -1214,13 +1015,268 @@ trace:
popl %edx
popl %ecx
popl %eax
-
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %edx
+ lea 0x4(%ebp), %eax
+ subl $MCOUNT_INSN_SIZE, %edx
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl $0
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ call ftrace_return_to_handler
+ movl %eax, 0xc(%esp)
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+#endif
.section .rodata,"a"
#include "syscall_table_32.S"
syscall_table_size=(.-sys_call_table)
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %fs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
+ movl PT_FS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label) \
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+label: \
+ movl TSS_sysenter_sp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $__KERNEL_CS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
+
+ENTRY(debug)
+ RING0_INT_FRAME
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK(12,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK(24,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+ CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1db6ce4314e1..e28c7a987793 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
*
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame, this is
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
* only done for syscall tracing, signals or fork/exec et.al.
- *
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
- * - full stack frame: Like partial stack frame, but all register saved.
+ * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
@@ -60,50 +60,17 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
-
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
-
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
-
- movq 0x38(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-.globl mcount_call
-mcount_call:
- call ftrace_stub
-
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
-
retq
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
- /* taken from glibc */
- subq $0x38, %rsp
- movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
+ MCOUNT_SAVE_FRAME
movq 0x38(%rsp), %rdi
movq 8(%rbp), %rsi
@@ -113,14 +80,13 @@ ENTRY(ftrace_caller)
ftrace_call:
call ftrace_stub
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
- movq (%rsp), %rax
- addq $0x38, %rsp
+ MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -129,15 +95,63 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
.globl ftrace_stub
ftrace_stub:
retq
trace:
- /* taken from glibc */
- subq $0x38, %rsp
+ MCOUNT_SAVE_FRAME
+
+ movq 0x38(%rsp), %rdi
+ movq 8(%rbp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ MCOUNT_SAVE_FRAME
+
+ leaq 8(%rbp), %rdi
+ movq 0x38(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+ subq $80, %rsp
+
movq %rax, (%rsp)
movq %rcx, 8(%rsp)
movq %rdx, 16(%rsp)
@@ -145,13 +159,14 @@ trace:
movq %rdi, 32(%rsp)
movq %r8, 40(%rsp)
movq %r9, 48(%rsp)
+ movq %r10, 56(%rsp)
+ movq %r11, 64(%rsp)
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
+ call ftrace_return_to_handler
+ movq %rax, 72(%rsp)
+ movq 64(%rsp), %r11
+ movq 56(%rsp), %r10
movq 48(%rsp), %r9
movq 40(%rsp), %r8
movq 32(%rsp), %rdi
@@ -159,16 +174,14 @@ trace:
movq 16(%rsp), %rdx
movq 8(%rsp), %rcx
movq (%rsp), %rax
- addq $0x38, %rsp
+ addq $72, %rsp
+ retq
+#endif
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FTRACE */
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
-#endif
+#endif
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
@@ -187,29 +200,29 @@ ENTRY(native_usergs_sysret64)
.endm
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
* RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
* manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp
- movq %gs:pda_oldrsp,\tmp
- movq \tmp,RSP(%rsp)
- movq $__USER_DS,SS(%rsp)
- movq $__USER_CS,CS(%rsp)
- movq $-1,RCX(%rsp)
- movq R11(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS(%rsp)
+ */
+
+ /* %rsp:at FRAMEEND */
+ .macro FIXUP_TOP_OF_STACK tmp offset=0
+ movq %gs:pda_oldrsp,\tmp
+ movq \tmp,RSP+\offset(%rsp)
+ movq $__USER_DS,SS+\offset(%rsp)
+ movq $__USER_CS,CS+\offset(%rsp)
+ movq $-1,RCX+\offset(%rsp)
+ movq R11+\offset(%rsp),\tmp /* get eflags */
+ movq \tmp,EFLAGS+\offset(%rsp)
.endm
- .macro RESTORE_TOP_OF_STACK tmp,offset=0
- movq RSP-\offset(%rsp),\tmp
- movq \tmp,%gs:pda_oldrsp
- movq EFLAGS-\offset(%rsp),\tmp
- movq \tmp,R11-\offset(%rsp)
+ .macro RESTORE_TOP_OF_STACK tmp offset=0
+ movq RSP+\offset(%rsp),\tmp
+ movq \tmp,%gs:pda_oldrsp
+ movq EFLAGS+\offset(%rsp),\tmp
+ movq \tmp,R11+\offset(%rsp)
.endm
.macro FAKE_STACK_FRAME child_rip
@@ -221,7 +234,7 @@ ENTRY(native_usergs_sysret64)
pushq %rax /* rsp */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rsp,0
- pushq $(1<<9) /* eflags - interrupts on */
+ pushq $X86_EFLAGS_IF /* eflags - interrupts on */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET rflags,0*/
pushq $__KERNEL_CS /* cs */
@@ -239,62 +252,184 @@ ENTRY(native_usergs_sysret64)
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
- .macro CFI_DEFAULT_STACK start=1
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro EMPTY_FRAME start=1 offset=0
.if \start
- CFI_STARTPROC simple
+ CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8
+ CFI_DEF_CFA rsp,8+\offset
.else
- CFI_DEF_CFA_OFFSET SS+8
+ CFI_DEF_CFA_OFFSET 8+\offset
.endif
- CFI_REL_OFFSET r15,R15
- CFI_REL_OFFSET r14,R14
- CFI_REL_OFFSET r13,R13
- CFI_REL_OFFSET r12,R12
- CFI_REL_OFFSET rbp,RBP
- CFI_REL_OFFSET rbx,RBX
- CFI_REL_OFFSET r11,R11
- CFI_REL_OFFSET r10,R10
- CFI_REL_OFFSET r9,R9
- CFI_REL_OFFSET r8,R8
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
- /*CFI_REL_OFFSET cs,CS*/
- /*CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
- /*CFI_REL_OFFSET ss,SS*/
.endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro INTR_FRAME start=1 offset=0
+ EMPTY_FRAME \start, SS+8+\offset-RIP
+ /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
+ CFI_REL_OFFSET rsp, RSP+\offset-RIP
+ /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
+ /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
+ CFI_REL_OFFSET rip, RIP+\offset-RIP
+ .endm
+
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+ .macro XCPT_FRAME start=1 offset=0
+ INTR_FRAME \start, RIP+\offset-ORIG_RAX
+ /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ .endm
+
/*
- * A newly forked process directly context switches into this.
- */
-/* rdi: prev */
+ * frame that enables calling into C.
+ */
+ .macro PARTIAL_FRAME start=1 offset=0
+ XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
+ CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
+ CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
+ CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
+ CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
+ CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
+ CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ .endm
+
+/*
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+ .macro DEFAULT_FRAME start=1 offset=0
+ PARTIAL_FRAME \start, R11+\offset-R15
+ CFI_REL_OFFSET rbx, RBX+\offset
+ CFI_REL_OFFSET rbp, RBP+\offset
+ CFI_REL_OFFSET r12, R12+\offset
+ CFI_REL_OFFSET r13, R13+\offset
+ CFI_REL_OFFSET r14, R14+\offset
+ CFI_REL_OFFSET r15, R15+\offset
+ .endm
+
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ movq_cfi rdi, RDI+16-ARGOFFSET
+ movq_cfi rsi, RSI+16-ARGOFFSET
+ movq_cfi rdx, RDX+16-ARGOFFSET
+ movq_cfi rcx, RCX+16-ARGOFFSET
+ movq_cfi rax, RAX+16-ARGOFFSET
+ movq_cfi r8, R8+16-ARGOFFSET
+ movq_cfi r9, R9+16-ARGOFFSET
+ movq_cfi r10, R10+16-ARGOFFSET
+ movq_cfi r11, R11+16-ARGOFFSET
+
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq_cfi rbp, 8 /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irqcount is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl %gs:pda_irqcount
+ jne 2f
+ popq_cfi %rax /* move return address... */
+ mov %gs:pda_irqstackptr,%rsp
+ EMPTY_FRAME 0
+ pushq_cfi %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+
+ENTRY(save_rest)
+ PARTIAL_FRAME 1 REST_SKIP+8
+ movq 5*8+16(%rsp), %r11 /* save return address */
+ movq_cfi rbx, RBX+16
+ movq_cfi rbp, RBP+16
+ movq_cfi r12, R12+16
+ movq_cfi r13, R13+16
+ movq_cfi r14, R14+16
+ movq_cfi r15, R15+16
+ movq %r11, 8(%rsp) /* return address */
+ FIXUP_TOP_OF_STACK %r11, 16
+ ret
+ CFI_ENDPROC
+END(save_rest)
+
+/* save complete stack frame */
+ENTRY(save_paranoid)
+ XCPT_FRAME 1 RDI+8
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
ENTRY(ret_from_fork)
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
+
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
+ popf # reset kernel eflags
CFI_ADJUST_CFA_OFFSET -8
- call schedule_tail
+
+ call schedule_tail # rdi: 'prev' task parameter
+
GET_THREAD_INFO(%rcx)
- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- jnz rff_trace
-rff_action:
+
+ CFI_REMEMBER_STATE
RESTORE_REST
- testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
+
+ testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
- testl $_TIF_IA32,TI_flags(%rcx)
+
+ testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
- jmp ret_from_sys_call
-rff_trace:
- movq %rsp,%rdi
- call syscall_trace_leave
- GET_THREAD_INFO(%rcx)
- jmp rff_action
+
+ RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+ jmp ret_from_sys_call # go to the SYSRET fastpath
+
+ CFI_RESTORE_STATE
CFI_ENDPROC
END(ret_from_fork)
@@ -304,20 +439,20 @@ END(ret_from_fork)
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.
*/
-
+
/*
- * Register setup:
+ * Register setup:
* rax system call number
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
+ * rcx return address for syscall/sysret, C arg3
* rsi arg1
- * rdx arg2
+ * rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
* Interrupts are off on entry.
* Only called from user space.
*
@@ -327,7 +462,7 @@ END(ret_from_fork)
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
- */
+ */
ENTRY(system_call)
CFI_STARTPROC simple
@@ -343,7 +478,7 @@ ENTRY(system_call)
*/
ENTRY(system_call_after_swapgs)
- movq %rsp,%gs:pda_oldrsp
+ movq %rsp,%gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
/*
* No need to follow this irqs off/on section - it's straight
@@ -351,7 +486,7 @@ ENTRY(system_call_after_swapgs)
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,1
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
@@ -365,19 +500,19 @@ system_call_fastpath:
movq %rax,RAX-ARGOFFSET(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
+ * Has incomplete stack frame and undefined top of stack.
+ */
ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: flagmask */
-sysret_check:
+sysret_check:
LOCKDEP_SYS_EXIT
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl TI_flags(%rcx),%edx
andl %edi,%edx
- jnz sysret_careful
+ jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -392,7 +527,7 @@ sysret_check:
CFI_RESTORE_STATE
/* Handle reschedules */
- /* edx: work, edi: workmask */
+ /* edx: work, edi: workmask */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
@@ -405,7 +540,7 @@ sysret_careful:
CFI_ADJUST_CFA_OFFSET -8
jmp sysret_check
- /* Handle a signal */
+ /* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -414,17 +549,20 @@ sysret_signal:
jc sysret_audit
#endif
/* edx: work flags (arg3) */
- leaq do_notify_resume(%rip),%rax
leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
xorl %esi,%esi # oldset -> arg2
- call ptregscall_common
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call do_notify_resume
+ RESTORE_TOP_OF_STACK %r11
+ RESTORE_REST
movl $_TIF_WORK_MASK,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
-
+
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
@@ -463,7 +601,7 @@ sysret_audit:
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */
-tracesys:
+tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jz auditsys
@@ -486,8 +624,8 @@ tracesys:
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
-
-/*
+
+/*
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
@@ -531,18 +669,18 @@ int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- /* Check for syscall exit trace */
+ /* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
-
+
int_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
@@ -557,22 +695,24 @@ int_restore_rest:
jmp int_with_check
CFI_ENDPROC
END(system_call)
-
-/*
+
+/*
* Certain special system calls that need to save a complete full stack frame.
- */
-
+ */
.macro PTREGSCALL label,func,arg
- .globl \label
-\label:
- leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ptregscall_common
+ENTRY(\label)
+ PARTIAL_FRAME 1 8 /* offset 8: return address */
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ call save_rest
+ DEFAULT_FRAME 0 8 /* offset 8: return address */
+ leaq 8(%rsp), \arg /* pt_regs pointer */
+ call \func
+ jmp ptregscall_common
+ CFI_ENDPROC
END(\label)
.endm
- CFI_STARTPROC
-
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -580,25 +720,18 @@ END(\label)
PTREGSCALL stub_iopl, sys_iopl, %rsi
ENTRY(ptregscall_common)
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
- FIXUP_TOP_OF_STACK %r11
- call *%rax
- RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
+ DEFAULT_FRAME 1 8 /* offset 8: return address */
+ RESTORE_TOP_OF_STACK %r11, 8
+ movq_cfi_restore R15+8, r15
+ movq_cfi_restore R14+8, r14
+ movq_cfi_restore R13+8, r13
+ movq_cfi_restore R12+8, r12
+ movq_cfi_restore RBP+8, rbp
+ movq_cfi_restore RBX+8, rbx
+ ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(ptregscall_common)
-
+
ENTRY(stub_execve)
CFI_STARTPROC
popq %r11
@@ -614,11 +747,11 @@ ENTRY(stub_execve)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
- */
+ */
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
@@ -634,70 +767,70 @@ ENTRY(stub_rt_sigreturn)
END(stub_rt_sigreturn)
/*
- * initial frame state for interrupts and exceptions
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
*/
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
+ .section .init.rodata,"a"
+ENTRY(interrupt)
+ .text
+ .p2align 5
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ENTRY(irq_entries_start)
+ INTR_FRAME
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+ .balign 32
+ .rept 7
+ .if vector < NR_VECTORS
+ .if vector <> FIRST_EXTERNAL_VECTOR
+ CFI_ADJUST_CFA_OFFSET -8
+ .endif
+1: pushq $(~vector+0x80) /* Note: always in signed byte range */
+ CFI_ADJUST_CFA_OFFSET 8
+ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+ jmp 2f
+ .endif
+ .previous
+ .quad 1b
+ .text
+vector=vector+1
+ .endif
+ .endr
+2: jmp common_interrupt
+.endr
+ CFI_ENDPROC
+END(irq_entries_start)
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
+.previous
+END(interrupt)
+.previous
-/*
+/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
+ *
+ * Entry runs with interrupts off.
+ */
-/* 0(%rsp): interrupt number */
+/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- cld
- SAVE_ARGS
- leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
- pushq %rbp
- /*
- * Save rbp twice: One is for marking the stack frame, as usual, and the
- * other, to fill pt_regs properly. This is because bx comes right
- * before the last saved register in that structure, and not bp. If the
- * base pointer were in the place bx is today, this would not be needed.
- */
- movq %rbp, -8(%rsp)
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp, 0
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- testl $3,CS(%rdi)
- je 1f
- SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
- /*
- * We entered an interrupt context - irqs are off:
- */
- TRACE_IRQS_OFF
+ subq $10*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 10*8
+ call save_args
+ PARTIAL_FRAME 0
call \func
.endm
-ENTRY(common_interrupt)
+ /*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_interrupt.
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
XCPT_FRAME
+ addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
/* 0(%rsp): oldrsp-ARGOFFSET */
ret_from_intr:
@@ -711,12 +844,12 @@ exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
je retint_kernel
-
+
/* Interrupt came from user space */
/*
* Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
- */
+ */
retint_with_reschedule:
movl $_TIF_WORK_MASK,%edi
retint_check:
@@ -789,20 +922,20 @@ retint_careful:
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
- popq %rdi
+ popq %rdi
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check
-
+
retint_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
+ movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
@@ -824,324 +957,211 @@ ENTRY(retint_kernel)
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
-#endif
+#endif
CFI_ENDPROC
END(common_interrupt)
-
+
/*
* APIC interrupts.
- */
- .macro apicinterrupt num,func
+ */
+.macro apicinterrupt num sym do_sym
+ENTRY(\sym)
INTR_FRAME
pushq $~(\num)
CFI_ADJUST_CFA_OFFSET 8
- interrupt \func
+ interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
- .endm
-
-ENTRY(thermal_interrupt)
- apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
-END(thermal_interrupt)
-
-ENTRY(threshold_interrupt)
- apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
-END(threshold_interrupt)
-
-#ifdef CONFIG_SMP
-ENTRY(reschedule_interrupt)
- apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
-END(reschedule_interrupt)
-
- .macro INVALIDATE_ENTRY num
-ENTRY(invalidate_interrupt\num)
- apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
-END(invalidate_interrupt\num)
- .endm
+END(\sym)
+.endm
- INVALIDATE_ENTRY 0
- INVALIDATE_ENTRY 1
- INVALIDATE_ENTRY 2
- INVALIDATE_ENTRY 3
- INVALIDATE_ENTRY 4
- INVALIDATE_ENTRY 5
- INVALIDATE_ENTRY 6
- INVALIDATE_ENTRY 7
-
-ENTRY(call_function_interrupt)
- apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
-END(call_function_interrupt)
-ENTRY(call_function_single_interrupt)
- apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
-END(call_function_single_interrupt)
-ENTRY(irq_move_cleanup_interrupt)
- apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
-END(irq_move_cleanup_interrupt)
+#ifdef CONFIG_SMP
+apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif
-ENTRY(apic_timer_interrupt)
- apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
-END(apic_timer_interrupt)
+apicinterrupt UV_BAU_MESSAGE \
+ uv_bau_message_intr1 uv_bau_message_interrupt
+apicinterrupt LOCAL_TIMER_VECTOR \
+ apic_timer_interrupt smp_apic_timer_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
+ invalidate_interrupt0 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
+ invalidate_interrupt1 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
+ invalidate_interrupt2 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
+ invalidate_interrupt3 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
+ invalidate_interrupt4 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
+ invalidate_interrupt5 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
+ invalidate_interrupt6 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
+ invalidate_interrupt7 smp_invalidate_interrupt
+#endif
-ENTRY(uv_bau_message_intr1)
- apicinterrupt 220,uv_bau_message_interrupt
-END(uv_bau_message_intr1)
+apicinterrupt THRESHOLD_APIC_VECTOR \
+ threshold_interrupt mce_threshold_interrupt
+apicinterrupt THERMAL_APIC_VECTOR \
+ thermal_interrupt smp_thermal_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
+#endif
-ENTRY(error_interrupt)
- apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
-END(error_interrupt)
+apicinterrupt ERROR_APIC_VECTOR \
+ error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+ spurious_interrupt smp_spurious_interrupt
-ENTRY(spurious_interrupt)
- apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-END(spurious_interrupt)
-
/*
* Exception entry points.
- */
- .macro zeroentry sym
+ */
+.macro zeroentry sym do_sym
+ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0 /* push error code/oldrax */
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rax /* push real oldrax to the rdi slot */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
- .macro errorentry sym
- XCPT_FRAME
+.macro paranoidzeroentry sym do_sym
+ENTRY(\sym)
+ INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq %rax
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ subq $15*8, %rsp
+ call save_paranoid
+ TRACE_IRQS_OFF
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \do_sym
+ jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
- /* error code is on the stack already */
- /* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0, irqtrace=1
- SAVE_ALL
- cld
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f
- SWAPGS
- xorl %ebx,%ebx
-1:
- .if \ist
- movq %gs:pda_data_offset, %rbp
- .endif
- .if \irqtrace
- TRACE_IRQS_OFF
- .endif
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi
- movq $-1,ORIG_RAX(%rsp)
- .if \ist
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- call \sym
- .if \ist
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \irqtrace
+.macro paranoidzeroentry_ist sym do_sym ist
+ENTRY(\sym)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ CFI_ADJUST_CFA_OFFSET 8
+ subq $15*8, %rsp
+ call save_paranoid
TRACE_IRQS_OFF
- .endif
- .endm
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ movq %gs:pda_data_offset, %rbp
+ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ call \do_sym
+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+END(\sym)
+.endm
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
- .macro paranoidexit trace=1
- /* ebx: no swapgs flag */
-paranoid_exit\trace:
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore\trace
- testl $3,CS(%rsp)
- jnz paranoid_userspace\trace
-paranoid_swapgs\trace:
- .if \trace
- TRACE_IRQS_IRETQ 0
- .endif
- SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace\trace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs\trace
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule\trace
- movl %ebx,%edx /* arg3: thread flags */
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
-paranoid_schedule\trace:
- .if \trace
- TRACE_IRQS_ON
- .endif
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- .if \trace
- TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
+.macro errorentry sym do_sym
+ENTRY(\sym)
+ XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ DEFAULT_FRAME 0
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
- */
-KPROBE_ENTRY(error_entry)
- _frame RDI
- CFI_REL_OFFSET rax,0
- /* rdi slot contains rax, oldrax contains error code */
- cld
- subq $14*8,%rsp
- CFI_ADJUST_CFA_OFFSET (14*8)
- movq %rsi,13*8(%rsp)
- CFI_REL_OFFSET rsi,RSI
- movq 14*8(%rsp),%rsi /* load rax from rdi slot */
- CFI_REGISTER rax,rsi
- movq %rdx,12*8(%rsp)
- CFI_REL_OFFSET rdx,RDX
- movq %rcx,11*8(%rsp)
- CFI_REL_OFFSET rcx,RCX
- movq %rsi,10*8(%rsp) /* store rax */
- CFI_REL_OFFSET rax,RAX
- movq %r8, 9*8(%rsp)
- CFI_REL_OFFSET r8,R8
- movq %r9, 8*8(%rsp)
- CFI_REL_OFFSET r9,R9
- movq %r10,7*8(%rsp)
- CFI_REL_OFFSET r10,R10
- movq %r11,6*8(%rsp)
- CFI_REL_OFFSET r11,R11
- movq %rbx,5*8(%rsp)
- CFI_REL_OFFSET rbx,RBX
- movq %rbp,4*8(%rsp)
- CFI_REL_OFFSET rbp,RBP
- movq %r12,3*8(%rsp)
- CFI_REL_OFFSET r12,R12
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13,R13
- movq %r14,1*8(%rsp)
- CFI_REL_OFFSET r14,R14
- movq %r15,(%rsp)
- CFI_REL_OFFSET r15,R15
- xorl %ebx,%ebx
- testl $3,CS(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- movq %rdi,RDI(%rsp)
- CFI_REL_OFFSET rdi,RDI
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp)
- call *%rax
- /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-error_exit:
- movl %ebx,%eax
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
+ /* error code is on the stack already */
+.macro paranoiderrorentry sym do_sym
+ENTRY(\sym)
+ XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \do_sym
+ jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
+END(\sym)
+.endm
-error_kernelspace:
- incl %ebx
- /* There are two places in the kernel that can potentially fault with
- usergs. Handle them here. The exception handlers after
- iret run with kernel gs again, so don't set the user space flag.
- B stepping K8s sometimes report an truncated RIP for IRET
- exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP(%rsp)
- je error_swapgs
- jmp error_sti
-KPROBE_END(error_entry)
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
+zeroentry divide_error do_divide_error
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+paranoiderrorentry double_fault do_double_fault
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
+
+ /* Reload gs selector with exception handling */
+ /* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
- SWAPGS
-gs_change:
- movl %edi,%gs
+ SWAPGS
+gs_change:
+ movl %edi,%gs
2: mfence /* workaround */
SWAPGS
- popf
+ popf
CFI_ADJUST_CFA_OFFSET -8
- ret
+ ret
CFI_ENDPROC
-ENDPROC(native_load_gs_index)
-
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
- .section .fixup,"ax"
+END(native_load_gs_index)
+
+ .section __ex_table,"a"
+ .align 8
+ .quad gs_change,bad_gs
+ .previous
+ .section .fixup,"ax"
/* running with kernelgs */
-bad_gs:
+bad_gs:
SWAPGS /* switch back to user gs */
xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-
+ movl %eax,%gs
+ jmp 2b
+ .previous
+
/*
* Create a kernel thread.
*
@@ -1164,7 +1184,7 @@ ENTRY(kernel_thread)
xorl %r8d,%r8d
xorl %r9d,%r9d
-
+
# clone now
call do_fork
movq %rax,RAX(%rsp)
@@ -1175,15 +1195,15 @@ ENTRY(kernel_thread)
* so internally to the x86_64 port you can rely on kernel_thread()
* not to reschedule the child before returning, this avoids the need
* of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
+ * [Hopefully no generic code relies on the reschedule -AK]
*/
RESTORE_ALL
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_thread)
-
-child_rip:
+END(kernel_thread)
+
+ENTRY(child_rip)
pushq $0 # fake return address
CFI_STARTPROC
/*
@@ -1196,8 +1216,9 @@ child_rip:
# exit
mov %eax, %edi
call do_exit
+ ud2 # padding for call trace
CFI_ENDPROC
-ENDPROC(child_rip)
+END(child_rip)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1217,10 +1238,10 @@ ENDPROC(child_rip)
ENTRY(kernel_execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
- SAVE_ALL
+ SAVE_ALL
movq %rsp,%rcx
call sys_execve
- movq %rax, RAX(%rsp)
+ movq %rax, RAX(%rsp)
RESTORE_REST
testq %rax,%rax
je int_ret_from_sys_call
@@ -1228,129 +1249,7 @@ ENTRY(kernel_execve)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_execve)
-
-KPROBE_ENTRY(page_fault)
- errorentry do_page_fault
-KPROBE_END(page_fault)
-
-ENTRY(coprocessor_error)
- zeroentry do_coprocessor_error
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- zeroentry do_device_not_available
-END(device_not_available)
-
- /* runs on exception stack */
-KPROBE_ENTRY(debug)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_debug, DEBUG_STACK
- paranoidexit
-KPROBE_END(debug)
-
- /* runs on exception stack */
-KPROBE_ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_nmi, 0, 0
-#ifdef CONFIG_TRACE_IRQFLAGS
- paranoidexit 0
-#else
- jmp paranoid_exit1
- CFI_ENDPROC
-#endif
-KPROBE_END(nmi)
-
-KPROBE_ENTRY(int3)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_int3, DEBUG_STACK
- jmp paranoid_exit1
- CFI_ENDPROC
-KPROBE_END(int3)
-
-ENTRY(overflow)
- zeroentry do_overflow
-END(overflow)
-
-ENTRY(bounds)
- zeroentry do_bounds
-END(bounds)
-
-ENTRY(invalid_op)
- zeroentry do_invalid_op
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- zeroentry do_coprocessor_segment_overrun
-END(coprocessor_segment_overrun)
-
- /* runs on exception stack */
-ENTRY(double_fault)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_double_fault
- jmp paranoid_exit1
- CFI_ENDPROC
-END(double_fault)
-
-ENTRY(invalid_TSS)
- errorentry do_invalid_TSS
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- errorentry do_segment_not_present
-END(segment_not_present)
-
- /* runs on exception stack */
-ENTRY(stack_segment)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_stack_segment
- jmp paranoid_exit1
- CFI_ENDPROC
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
- errorentry do_general_protection
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
- errorentry do_alignment_check
-END(alignment_check)
-
-ENTRY(divide_error)
- zeroentry do_divide_error
-END(divide_error)
-
-ENTRY(spurious_interrupt_bug)
- zeroentry do_spurious_interrupt_bug
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_X86_MCE
- /* runs on exception stack */
-ENTRY(machine_check)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_machine_check
- jmp paranoid_exit1
- CFI_ENDPROC
-END(machine_check)
-#endif
+END(kernel_execve)
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
@@ -1370,40 +1269,33 @@ ENTRY(call_softirq)
decl %gs:pda_irqcount
ret
CFI_ENDPROC
-ENDPROC(call_softirq)
-
-KPROBE_ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-ENDPROC(ignore_sysret)
+END(call_softirq)
#ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
- zeroentry xen_do_hypervisor_callback
-END(xen_hypervisor_callback)
+zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
-# A note on the "critical region" in our callback handler.
-# We want to avoid stacking callback handlers due to events occurring
-# during handling of the last event. To do this, we keep events disabled
-# until we've done all processing. HOWEVER, we must enable events before
-# popping the stack frame (can't be done atomically) and so it would still
-# be possible to get enough handler activations to overflow the stack.
-# Although unlikely, bugs of that kind are hard to track down, so we'd
-# like to avoid the possibility.
-# So, on entry to the handler we detect whether we interrupted an
-# existing activation in its critical region -- if so, we pop the current
-# activation and restart the handler using the previous one.
-*/
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ */
ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
CFI_STARTPROC
-/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
- see the correct pointer to the pt_regs */
+/*
+ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ * see the correct pointer to the pt_regs
+ */
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1418,23 +1310,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
END(do_hypervisor_callback)
/*
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we do not need to fix up as Xen has already reloaded all segment
-# registers that could be reloaded and zeroed the others.
-# Category 2 we fix up by killing the current process. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by comparing each saved segment register
-# with its current contents: any discrepancy means we in category 1.
-*/
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we in category 1.
+ */
ENTRY(xen_failsafe_callback)
- framesz = (RIP-0x30) /* workaround buggy gas */
- _frame framesz
- CFI_REL_OFFSET rcx, 0
- CFI_REL_OFFSET r11, 8
+ INTR_FRAME 1 (6*8)
+ /*CFI_REL_OFFSET gs,GS*/
+ /*CFI_REL_OFFSET fs,FS*/
+ /*CFI_REL_OFFSET es,ES*/
+ /*CFI_REL_OFFSET ds,DS*/
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
@@ -1455,12 +1350,9 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rcx
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0 /* RIP */
+ pushq_cfi %r11
+ pushq_cfi %rcx
jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1470,11 +1362,223 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check do_machine_check
+#endif
+
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+
+ /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+ INTR_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+paranoid_swapgs:
+ TRACE_IRQS_IRETQ 0
+ SWAPGS_UNSAFE_STACK
+paranoid_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+paranoid_schedule:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+ CFI_ENDPROC
+END(paranoid_exit)
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * returns in "no swapgs flag" in %ebx.
+ */
+ENTRY(error_entry)
+ XCPT_FRAME
+ CFI_ADJUST_CFA_OFFSET 15*8
+ /* oldrax contains error code */
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+ TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report an truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+END(error_entry)
+
+
+/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+ENTRY(error_exit)
+ DEFAULT_FRAME
+ movl %ebx,%eax
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testl %eax,%eax
+ jne retint_kernel
+ LOCKDEP_SYS_EXIT_IRQ
+ movl TI_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
+ jmp retint_swapgs
+ CFI_ENDPROC
+END(error_exit)
+
+
+ /* runs on exception stack */
+ENTRY(nmi)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq_cfi $-1
+ subq $15*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /* paranoidexit; without TRACE_IRQS_OFF */
+ /* ebx: no swapgs flag */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+ testl $3,CS(%rsp)
+ jnz nmi_userspace
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+nmi_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz nmi_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz nmi_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ jmp nmi_userspace
+nmi_schedule:
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ jmp nmi_userspace
+ CFI_ENDPROC
+#else
+ jmp paranoid_exit
+ CFI_ENDPROC
+#endif
+END(nmi)
+
+ENTRY(ignore_sysret)
+ CFI_STARTPROC
+ mov $-ENOSYS,%eax
+ sysret
+ CFI_ENDPROC
+END(ignore_sysret)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index f454c78fcef6..53699c931ad4 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
#include <asm/io.h>
#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/atomic.h>
#include <asm/apicdef.h>
#include <mach_mpparse.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
/*
* ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+ unsigned long vect = 0, psaival = 0;
+
+ if (psai == NULL)
+ return -1;
+
+ vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+ psaival = (0x1000000 | vect | cpu);
+
+ while (*psai & 0x1000000)
+ ;
+
+ *psai = psaival;
+
+ return 0;
+}
+
+static void noop_wait_for_deassert(atomic_t *deassert_not_used)
+{
+}
+
+static int __init es7000_update_genapic(void)
+{
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+
+ /* MPENTIUMIII */
+ if (boot_cpu_data.x86 == 6 &&
+ (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
+ es7000_update_genapic_to_cluster();
+ genapic->wait_for_init_deassert = noop_wait_for_deassert;
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+ }
+
+ return 0;
+}
+
void __init
setup_unisys(void)
{
@@ -176,6 +216,8 @@ setup_unisys(void)
else
es7000_plat = ES7000_CLASSIC;
ioapic_renumber_irq = es7000_rename_gsi;
+
+ x86_quirks->update_genapic = es7000_update_genapic;
}
/*
@@ -250,31 +292,24 @@ int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
{
struct acpi_table_header *header = NULL;
int i = 0;
- acpi_size tbl_size;
- while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
+ while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
struct oem_table *t = (struct oem_table *)header;
oem_addrX = t->OEMTableAddr;
oem_size = t->OEMTableSize;
- early_acpi_os_unmap_memory(header, tbl_size);
*oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
oem_size);
return 0;
}
- early_acpi_os_unmap_memory(header, tbl_size);
}
return -1;
}
void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
}
#endif
@@ -324,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
return status;
}
-int
-es7000_start_cpu(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-
-}
-
void __init
es7000_sw_apic(void)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fdf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,20 @@
#include <linux/spinlock.h>
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
-#include <asm/alternative.h>
#include <asm/ftrace.h>
+#include <linux/ftrace.h>
+#include <asm/nops.h>
+#include <asm/nmi.h>
-/* Long is fine, even if it is only 4 bytes ;-) */
-static long *ftrace_nop;
+#ifdef CONFIG_DYNAMIC_FTRACE
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
} __attribute__((packed));
};
-
-static int notrace ftrace_calc_offset(long ip, long addr)
+static int ftrace_calc_offset(long ip, long addr)
{
return (int)(addr - ip);
}
-notrace unsigned char *ftrace_nop_replace(void)
-{
- return (char *)ftrace_nop;
-}
-
-notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
@@ -56,48 +53,198 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-notrace int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUS from exectuing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ * and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status; /* holds return value of text write */
+static int mod_code_write; /* set when NMI should do the write */
+static void *mod_code_ip; /* holds the IP to write to */
+static void *mod_code_newcode; /* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+ int r;
+
+ r = snprintf(buf, size, "%u %u",
+ nmi_wait_count,
+ atomic_read(&nmi_update_count));
+ return r;
+}
+
+static void ftrace_mod_code(void)
+{
+ /*
+ * Yes, more than one CPU process can be writing to mod_code_status.
+ * (and the code itself)
+ * But if one were to fail, then they all should, and if one were
+ * to succeed, then they all should.
+ */
+ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+ MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+ /* Must have in_nmi seen before reading write flag */
+ smp_mb();
+ if (mod_code_write) {
+ ftrace_mod_code();
+ atomic_inc(&nmi_update_count);
+ }
+}
+
+void ftrace_nmi_exit(void)
+{
+ /* Finish all executions before clearing in_nmi */
+ smp_wmb();
+ atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+ int waited = 0;
+
+ while (atomic_read(&in_nmi)) {
+ waited = 1;
+ cpu_relax();
+ }
+
+ if (waited)
+ nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+ mod_code_ip = (void *)ip;
+ mod_code_newcode = new_code;
+
+ /* The buffers need to be visible before we let NMIs write them */
+ smp_wmb();
+
+ mod_code_write = 1;
+
+ /* Make sure write bit is visible before we wait on NMIs */
+ smp_mb();
+
+ wait_for_nmi();
+
+ /* Make sure all running NMIs have finished before we write the code */
+ smp_mb();
+
+ ftrace_mod_code();
+
+ /* Make sure the write happens before clearing the bit */
+ smp_wmb();
+
+ mod_code_write = 0;
+
+ /* make sure NMIs see the cleared bit */
+ smp_mb();
+
+ wait_for_nmi();
+
+ return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+ return ftrace_nop;
+}
+
+static int
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
unsigned char *new_code)
{
- unsigned replaced;
- unsigned old = *(unsigned *)old_code; /* 4 bytes */
- unsigned new = *(unsigned *)new_code; /* 4 bytes */
- unsigned char newch = new_code[4];
- int faulted = 0;
+ unsigned char replaced[MCOUNT_INSN_SIZE];
/*
* Note: Due to modules and __init, code can
* disappear and change, we need to protect against faulting
- * as well as code changing.
+ * as well as code changing. We do this by using the
+ * probe_kernel_* functions.
*
* No real locking needed, this code is run through
- * kstop_machine.
+ * kstop_machine, or before SMP starts.
*/
- asm volatile (
- "1: lock\n"
- " cmpxchg %3, (%2)\n"
- " jnz 2f\n"
- " movb %b4, 4(%2)\n"
- "2:\n"
- ".section .fixup, \"ax\"\n"
- "3: movl $1, %0\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : "=r"(faulted), "=a"(replaced)
- : "r"(ip), "r"(new), "c"(newch),
- "0"(faulted), "a"(old)
- : "memory");
+
+ /* read the text we want to modify */
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ /* replace the text with the new text */
+ if (do_ftrace_mod_code(ip, new_code))
+ return -EPERM;
+
sync_core();
- if (replaced != old && replaced != new)
- faulted = 2;
+ return 0;
+}
+
+int ftrace_make_nop(struct module *mod,
+ struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned char *new, *old;
+ unsigned long ip = rec->ip;
- return faulted;
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
+
+ return ftrace_modify_code(rec->ip, old, new);
}
-notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
+
+ return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
unsigned char old[MCOUNT_INSN_SIZE], *new;
@@ -110,32 +257,281 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
-notrace int ftrace_mcount_set(unsigned long *data)
+int __init ftrace_dyn_arch_init(void *data)
{
- unsigned long ip = (long)(&mcount_call);
- unsigned long *addr = data;
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ extern const unsigned char ftrace_test_p6nop[];
+ extern const unsigned char ftrace_test_nop5[];
+ extern const unsigned char ftrace_test_jmp[];
+ int faulted = 0;
/*
- * Replace the mcount stub with a pointer to the
- * ip recorder function.
+ * There is no good nop for all x86 archs.
+ * We will default to using the P6_NOP5, but first we
+ * will test to make sure that the nop will actually
+ * work on this CPU. If it faults, we will then
+ * go to a lesser efficient 5 byte nop. If that fails
+ * we then just use a jmp as our nop. This isn't the most
+ * efficient nop, but we can not use a multi part nop
+ * since we would then risk being preempted in the middle
+ * of that nop, and if we enabled tracing then, it might
+ * cause a system crash.
+ *
+ * TODO: check the cpuid to determine the best nop.
*/
- memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, *addr);
- *addr = ftrace_modify_code(ip, old, new);
+ asm volatile (
+ "ftrace_test_jmp:"
+ "jmp ftrace_test_p6nop\n"
+ "nop\n"
+ "nop\n"
+ "nop\n" /* 2 byte jmp + 3 bytes */
+ "ftrace_test_p6nop:"
+ P6_NOP5
+ "jmp 1f\n"
+ "ftrace_test_nop5:"
+ ".byte 0x66,0x66,0x66,0x66,0x90\n"
+ "1:"
+ ".section .fixup, \"ax\"\n"
+ "2: movl $1, %0\n"
+ " jmp ftrace_test_nop5\n"
+ "3: movl $2, %0\n"
+ " jmp 1b\n"
+ ".previous\n"
+ _ASM_EXTABLE(ftrace_test_p6nop, 2b)
+ _ASM_EXTABLE(ftrace_test_nop5, 3b)
+ : "=r"(faulted) : "0" (faulted));
+
+ switch (faulted) {
+ case 0:
+ pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+ memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
+ break;
+ case 1:
+ pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+ memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
+ break;
+ case 2:
+ pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+ memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
+ break;
+ }
+
+ /* The return code is retured via data */
+ *(unsigned long *)data = 0;
return 0;
}
+#endif
-int __init ftrace_dyn_arch_init(void *data)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+ int old_offset, int new_offset)
+{
+ unsigned char code[MCOUNT_INSN_SIZE];
+
+ if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+ return -EINVAL;
+
+ *(int *)(&code[1]) = new_offset;
+
+ if (do_ftrace_mod_code(ip, &code))
+ return -EPERM;
+
+ return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+ int old_offset, new_offset;
+
+ old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+ new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+ return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+ int old_offset, new_offset;
+
+ old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+ new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+ return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+ atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+ unsigned long func, int *depth)
{
- const unsigned char *const *noptable = find_nop_table();
+ int index;
- /* This is running in kstop_machine */
+ if (!current->ret_stack)
+ return -EBUSY;
- ftrace_mcount_set(data);
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
- ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+ index = ++current->curr_ret_stack;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = time;
+ *depth = index;
return 0;
}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+ int index;
+
+ index = current->curr_ret_stack;
+
+ if (unlikely(index < 0)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = index;
+ barrier();
+ current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ pop_return_trace(&trace, &ret);
+ trace.rettime = cpu_clock(raw_smp_processor_id());
+ ftrace_graph_return(&trace);
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+ unsigned long old;
+ unsigned long long calltime;
+ int faulted;
+ struct ftrace_graph_ent trace;
+ unsigned long return_hooker = (unsigned long)
+ &return_to_handler;
+
+ /* Nmi's are currently unsupported */
+ if (unlikely(atomic_read(&in_nmi)))
+ return;
+
+ if (unlikely(atomic_read(&current->tracing_graph_pause)))
+ return;
+
+ /*
+ * Protect against fault, even if it shouldn't
+ * happen. This tool is too much intrusive to
+ * ignore such a protection.
+ */
+ asm volatile(
+ "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+ "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+ " movl $0, %[faulted]\n"
+
+ ".section .fixup, \"ax\"\n"
+ "3: movl $1, %[faulted]\n"
+ ".previous\n"
+
+ _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(2b, 3b)
+
+ : [parent_replaced] "=r" (parent), [old] "=r" (old),
+ [faulted] "=r" (faulted)
+ : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+ : "memory"
+ );
+
+ if (unlikely(faulted)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ return;
+ }
+
+ if (unlikely(!__kernel_text_address(old))) {
+ ftrace_graph_stop();
+ *parent = old;
+ WARN_ON(1);
+ return;
+ }
+
+ calltime = cpu_clock(raw_smp_processor_id());
+
+ if (push_return_trace(old, calltime,
+ self_addr, &trace.depth) == -EBUSY) {
+ *parent = old;
+ return;
+ }
+
+ trace.func = self_addr;
+
+ /* Only trace if the calling function expects to */
+ if (!ftrace_graph_entry(&trace)) {
+ current->curr_ret_stack--;
+ *parent = old;
+ }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e95..2bced78b0b8e 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
extern struct genapic apic_flat;
extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
genapic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
}
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 9eca5ba7a6b1..c0262791bda4 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -25,7 +25,7 @@
#include <acpi/acpi_bus.h>
#endif
-static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return 1;
}
@@ -170,7 +170,7 @@ struct genapic apic_flat = {
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
-static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
#ifdef CONFIG_ACPI
/*
@@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
* is an example).
*/
if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
+ }
#endif
return 0;
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index e4bf2cc0d743..f6a2c8eb48a6 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -12,7 +12,7 @@
DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
-static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (cpu_has_x2apic)
return 1;
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 8f1343df2627..d042211768b7 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -19,7 +19,7 @@ static int set_x2apic_phys_mode(char *arg)
}
early_param("x2apic_phys", set_x2apic_phys_mode);
-static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (cpu_has_x2apic && x2apic_phys)
return 1;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 33581d94a90e..dece17289731 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,14 +10,17 @@
#include <linux/kernel.h>
#include <linux/threads.h>
+#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/sched.h>
-#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/hardirq.h>
+#include <linux/timer.h>
+#include <linux/proc_fs.h>
+#include <asm/current.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
@@ -30,7 +33,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
-static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (!strcmp(oem_id, "SGI")) {
if (!strcmp(oem_table_id, "UVL"))
@@ -341,12 +344,12 @@ static __init void map_mmioh_high(int max_pnode)
static __init void uv_rtc_init(void)
{
- long status, ticks_per_sec, drift;
+ long status;
+ u64 ticks_per_sec;
- status =
- x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
- &drift);
- if (status != 0 || ticks_per_sec < 100000) {
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
+ &ticks_per_sec);
+ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
printk(KERN_WARNING
"unable to determine platform RTC clock frequency, "
"guessing.\n");
@@ -356,7 +359,119 @@ static __init void uv_rtc_init(void)
sn_rtc_cycles_per_second = ticks_per_sec;
}
-static bool uv_system_inited;
+/*
+ * percpu heartbeat timer
+ */
+static void uv_heartbeat(unsigned long ignored)
+{
+ struct timer_list *timer = &uv_hub_info->scir.timer;
+ unsigned char bits = uv_hub_info->scir.state;
+
+ /* flip heartbeat bit */
+ bits ^= SCIR_CPU_HEARTBEAT;
+
+ /* is this cpu idle? */
+ if (idle_cpu(raw_smp_processor_id()))
+ bits &= ~SCIR_CPU_ACTIVITY;
+ else
+ bits |= SCIR_CPU_ACTIVITY;
+
+ /* update system controller interface reg */
+ uv_set_scir_bits(bits);
+
+ /* enable next timer period */
+ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+static void __cpuinit uv_heartbeat_enable(int cpu)
+{
+ if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+
+ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ setup_timer(timer, uv_heartbeat, cpu);
+ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ add_timer_on(timer, cpu);
+ uv_cpu_hub_info(cpu)->scir.enabled = 1;
+ }
+
+ /* check boot cpu */
+ if (!uv_cpu_hub_info(0)->scir.enabled)
+ uv_heartbeat_enable(0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __cpuinit uv_heartbeat_disable(int cpu)
+{
+ if (uv_cpu_hub_info(cpu)->scir.enabled) {
+ uv_cpu_hub_info(cpu)->scir.enabled = 0;
+ del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ }
+ uv_set_cpu_scir_bits(cpu, 0xff);
+}
+
+/*
+ * cpu hotplug notifier
+ */
+static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ uv_heartbeat_enable(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ uv_heartbeat_disable(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+ hotcpu_notifier(uv_scir_cpu_notify, 0);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+static __init int uv_init_heartbeat(void)
+{
+ int cpu;
+
+ if (is_uv_system())
+ for_each_online_cpu(cpu)
+ uv_heartbeat_enable(cpu);
+ return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ hotplug not supported yet
+ */
+void __cpuinit uv_cpu_init(void)
+{
+ /* CPU 0 initilization will be done via uv_system_init. */
+ if (!uv_blade_info)
+ return;
+
+ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
void __init uv_system_init(void)
{
@@ -383,16 +498,16 @@ void __init uv_system_init(void)
printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
- uv_blade_info = alloc_bootmem_pages(bytes);
+ uv_blade_info = kmalloc(bytes, GFP_KERNEL);
get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
- uv_node_to_blade = alloc_bootmem_pages(bytes);
+ uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
memset(uv_node_to_blade, 255, bytes);
bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
- uv_cpu_to_blade = alloc_bootmem_pages(bytes);
+ uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
memset(uv_cpu_to_blade, 255, bytes);
blade = 0;
@@ -412,6 +527,9 @@ void __init uv_system_init(void)
gnode_upper = (((unsigned long)node_id.s.node_id) &
~((1 << n_val) - 1)) << m_val;
+ uv_bios_init();
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
+ &sn_coherency_id, &sn_region_size);
uv_rtc_init();
for_each_present_cpu(cpu) {
@@ -422,8 +540,7 @@ void __init uv_system_init(void)
uv_blade_info[blade].nr_possible_cpus++;
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- uv_cpu_hub_info(cpu)->lowmem_remap_top =
- lowmem_redir_base + lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
uv_cpu_hub_info(cpu)->n_val = m_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -433,7 +550,8 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
+ uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+ uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
@@ -448,21 +566,8 @@ void __init uv_system_init(void)
map_mmr_high(max_pnode);
map_config_high(max_pnode);
map_mmioh_high(max_pnode);
- uv_system_inited = true;
-}
-
-/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * ZZZ hotplug not supported yet
- */
-void __cpuinit uv_cpu_init(void)
-{
- BUG_ON(!uv_system_inited);
-
- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
+ uv_cpu_init();
+ uv_scir_register_cpu_notifier();
+ proc_mkdir("sgi_uv", NULL);
}
-
-
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897e..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
- printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
/* Fixup: bios puts an EBDA in the top 64K segment */
/* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e3..ac108d1fe182 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/trampoline.h>
void __init i386_start_kernel(void)
{
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f90649..388e05a5fc17 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,6 +24,7 @@
#include <asm/kdebug.h>
#include <asm/e820.h>
#include <asm/bios_ebda.h>
+#include <asm/trampoline.h>
/* boot cpu pda */
static struct x8664_pda _boot_cpu_pda __read_mostly;
@@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
+ reserve_trampoline_memory();
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index acf62fc233da..845ea097383e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,51 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hpet.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/cpu.h>
#include <linux/pm.h>
+#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/hpet.h>
#include <asm/i8253.h>
-#include <asm/io.h>
+#include <asm/hpet.h>
-#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
+#define FSEC_PER_NSEC 1000000L
+
+#define HPET_DEV_USED_BIT 2
+#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID 0x8
+#define HPET_DEV_FSB_CAP 0x1000
+#define HPET_DEV_PERI_CAP 0x2000
+
+#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
-unsigned long hpet_address;
-static void __iomem *hpet_virt_address;
+unsigned long hpet_address;
+#ifdef CONFIG_PCI_MSI
+static unsigned long hpet_num_timers;
+#endif
+static void __iomem *hpet_virt_address;
+
+struct hpet_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int flags;
+ char name[10];
+};
unsigned long hpet_readl(unsigned long a)
{
@@ -59,7 +81,7 @@ static inline void hpet_clear_mapping(void)
static int boot_hpet_disable;
int hpet_force_user;
-static int __init hpet_setup(char* str)
+static int __init hpet_setup(char *str)
{
if (str) {
if (!strncmp("disable", str, 7))
@@ -80,7 +102,7 @@ __setup("nohpet", disable_hpet);
static inline int is_hpet_capable(void)
{
- return (!boot_hpet_disable && hpet_address);
+ return !boot_hpet_disable && hpet_address;
}
/*
@@ -102,6 +124,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
static void hpet_reserve_platform_timers(unsigned long id)
{
struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,10 +136,10 @@ static void hpet_reserve_platform_timers(unsigned long id)
nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- memset(&hd, 0, sizeof (hd));
- hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
+ memset(&hd, 0, sizeof(hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet;
+ hd.hd_nirqs = nrtimers;
hpet_reserve_timer(&hd, 0);
#ifdef CONFIG_HPET_EMULATE_RTC
@@ -130,10 +155,12 @@ static void hpet_reserve_platform_timers(unsigned long id)
hd.hd_irq[1] = HPET_LEGACY_RTC;
for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
+ hd.hd_irq[i] = (readl(&timer->hpet_config) &
+ Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
}
+ hpet_reserve_msi_timers(&hd);
+
hpet_alloc(&hd);
}
@@ -227,71 +254,422 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_setup_msi_irq(unsigned int irq);
+
+static void hpet_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt, int timer)
{
unsigned long cfg, cmp, now;
uint64_t delta;
- switch(mode) {
+ switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
- delta >>= hpet_clockevent.shift;
+ delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
+ delta >>= evt->shift;
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned long) delta;
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
/*
* The first write after writing TN_SETVAL to the
* config register sets the counter value, the second
* write sets the period.
*/
- hpet_writel(cmp, HPET_T0_CMP);
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
udelay(1);
- hpet_writel((unsigned long) delta, HPET_T0_CMP);
+ hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
break;
case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_T0_CFG);
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T0_CFG);
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
break;
case CLOCK_EVT_MODE_RESUME:
- hpet_enable_legacy_int();
+ if (timer == 0) {
+ hpet_enable_legacy_int();
+ } else {
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_setup_msi_irq(hdev->irq);
+ disable_irq(hdev->irq);
+ irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ enable_irq(hdev->irq);
+ }
break;
}
}
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt, int timer)
{
u32 cnt;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_T0_CMP);
+ hpet_writel(cnt, HPET_Tn_CMP(timer));
/*
* We need to read back the CMP register to make sure that
* what we wrote hit the chip before we compare it to the
* counter.
*/
- WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
+ WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt);
return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
+static void hpet_legacy_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ hpet_set_mode(mode, evt, 0);
+}
+
+static int hpet_legacy_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+static struct hpet_dev *hpet_devs;
+
+void hpet_msi_unmask(unsigned int irq)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+ unsigned long cfg;
+
+ /* unmask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg |= HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_mask(unsigned int irq)
+{
+ unsigned long cfg;
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ /* mask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg &= ~HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+}
+
+void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+{
+ struct hpet_dev *hdev = get_irq_data(irq);
+
+ msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
+ msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
+ msg->address_hi = 0;
+}
+
+static void hpet_msi_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ hpet_set_mode(mode, evt, hdev->num);
+}
+
+static int hpet_msi_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ return hpet_next_event(delta, evt, hdev->num);
+}
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ if (arch_setup_hpet_msi(irq)) {
+ destroy_irq(irq);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int hpet_assign_irq(struct hpet_dev *dev)
+{
+ unsigned int irq;
+
+ irq = create_irq();
+ if (!irq)
+ return -EINVAL;
+
+ set_irq_data(irq, dev);
+
+ if (hpet_setup_msi_irq(irq))
+ return -EINVAL;
+
+ dev->irq = irq;
+ return 0;
+}
+
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+ struct hpet_dev *dev = (struct hpet_dev *)data;
+ struct clock_event_device *hevt = &dev->evt;
+
+ if (!hevt->event_handler) {
+ printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+ dev->num);
+ return IRQ_HANDLED;
+ }
+
+ hevt->event_handler(hevt);
+ return IRQ_HANDLED;
+}
+
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+
+ if (request_irq(dev->irq, hpet_interrupt_handler,
+ IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev))
+ return -1;
+
+ disable_irq(dev->irq);
+ irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ enable_irq(dev->irq);
+
+ printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+ dev->name, dev->irq);
+
+ return 0;
+}
+
+/* This should be called in specific @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+ struct clock_event_device *evt = &hdev->evt;
+ uint64_t hpet_freq;
+
+ WARN_ON(cpu != smp_processor_id());
+ if (!(hdev->flags & HPET_DEV_VALID))
+ return;
+
+ if (hpet_setup_msi_irq(hdev->irq))
+ return;
+
+ hdev->cpu = cpu;
+ per_cpu(cpu_hpet_dev, cpu) = hdev;
+ evt->name = hdev->name;
+ hpet_setup_irq(hdev);
+ evt->irq = hdev->irq;
+
+ evt->rating = 110;
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hdev->flags & HPET_DEV_PERI_CAP)
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+
+ evt->set_mode = hpet_msi_set_mode;
+ evt->set_next_event = hpet_msi_next_event;
+ evt->shift = 32;
+
+ /*
+ * The period is a femto seconds value. We need to calculate the
+ * scaled math multiplication factor for nanosecond to hpet tick
+ * conversion.
+ */
+ hpet_freq = 1000000000000000ULL;
+ do_div(hpet_freq, hpet_period);
+ evt->mult = div_sc((unsigned long) hpet_freq,
+ NSEC_PER_SEC, evt->shift);
+ /* Calculate the max delta */
+ evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
+ /* 5 usec minimum reprogramming delta. */
+ evt->min_delta_ns = 5000;
+
+ evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ unsigned int id;
+ unsigned int num_timers;
+ unsigned int num_timers_used = 0;
+ int i;
+
+ id = hpet_readl(HPET_ID);
+
+ num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+ num_timers++; /* Value read out starts from 0 */
+
+ hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
+ if (!hpet_devs)
+ return;
+
+ hpet_num_timers = num_timers;
+
+ for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
+ struct hpet_dev *hdev = &hpet_devs[num_timers_used];
+ unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+
+ /* Only consider HPET timer with MSI support */
+ if (!(cfg & HPET_TN_FSB_CAP))
+ continue;
+
+ hdev->flags = 0;
+ if (cfg & HPET_TN_PERIODIC_CAP)
+ hdev->flags |= HPET_DEV_PERI_CAP;
+ hdev->num = i;
+
+ sprintf(hdev->name, "hpet%d", i);
+ if (hpet_assign_irq(hdev))
+ continue;
+
+ hdev->flags |= HPET_DEV_FSB_CAP;
+ hdev->flags |= HPET_DEV_VALID;
+ num_timers_used++;
+ if (num_timers_used == num_possible_cpus())
+ break;
+ }
+
+ printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+ num_timers, num_timers_used);
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ int i;
+
+ if (!hpet_devs)
+ return;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+
+ hd->hd_irq[hdev->num] = hdev->irq;
+ hpet_reserve_timer(hd, hdev->num);
+ }
+}
+#endif
+
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+ int i;
+
+ if (!hpet_devs)
+ return NULL;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+ if (test_and_set_bit(HPET_DEV_USED_BIT,
+ (unsigned long *)&hdev->flags))
+ continue;
+ return hdev;
+ }
+ return NULL;
+}
+
+struct hpet_work_struct {
+ struct delayed_work work;
+ struct completion complete;
+};
+
+static void hpet_work(struct work_struct *w)
+{
+ struct hpet_dev *hdev;
+ int cpu = smp_processor_id();
+ struct hpet_work_struct *hpet_work;
+
+ hpet_work = container_of(w, struct hpet_work_struct, work.work);
+
+ hdev = hpet_get_unused_timer();
+ if (hdev)
+ init_one_hpet_msi_clockevent(hdev, cpu);
+
+ complete(&hpet_work->complete);
+}
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct hpet_work_struct work;
+ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_ONLINE:
+ INIT_DELAYED_WORK(&work.work, hpet_work);
+ init_completion(&work.complete);
+ /* FIXME: add schedule_work_on() */
+ schedule_delayed_work_on(cpu, &work.work, 0);
+ wait_for_completion(&work.complete);
+ break;
+ case CPU_DEAD:
+ if (hdev) {
+ free_irq(hdev->irq, hdev);
+ hdev->flags &= ~HPET_DEV_USED;
+ per_cpu(cpu_hpet_dev, cpu) = NULL;
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+ return 0;
+}
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ return;
+}
+#endif
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ return NOTIFY_OK;
+}
+
+#endif
+
/*
* Clock source related code
*/
@@ -427,13 +805,15 @@ int __init hpet_enable(void)
if (id & HPET_ID_LEGSUP) {
hpet_legacy_clockevent_register();
+ hpet_msi_capability_lookup(2);
return 1;
}
+ hpet_msi_capability_lookup(0);
return 0;
out_nohpet:
hpet_clear_mapping();
- boot_hpet_disable = 1;
+ hpet_address = 0;
return 0;
}
@@ -445,6 +825,8 @@ out_nohpet:
*/
static __init int hpet_late_init(void)
{
+ int cpu;
+
if (boot_hpet_disable)
return -ENODEV;
@@ -454,12 +836,20 @@ static __init int hpet_late_init(void)
hpet_address = force_hpet_address;
hpet_enable();
- if (!hpet_virt_address)
- return -ENODEV;
}
+ if (!hpet_virt_address)
+ return -ENODEV;
+
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ for_each_online_cpu(cpu) {
+ hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
+ }
+
+ /* This notifier should be called after workqueue is ready */
+ hotcpu_notifier(hpet_cpuhp_notify, -20);
+
return 0;
}
fs_initcall(hpet_late_init);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index dd7ebee446af..43cec6bdda63 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -5,7 +5,7 @@
#include <asm/desc.h>
#include <asm/ftrace.h>
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
/* mcount is defined in assembly */
EXPORT_SYMBOL(mcount);
#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f20608d4ca8..b0f61f0dcd0a 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -58,7 +58,7 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __init init_thread_xstate(void)
+void __cpuinit init_thread_xstate(void)
{
if (!HAVE_HWFP) {
xstate_size = sizeof(struct i387_soft_struct);
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c1..d39918076bb4 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
-EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
/*
* Initial thread structure.
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 02063ae042f7..f6ea94b74da1 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,17 +27,21 @@
#include <linux/sched.h>
#include <linux/pci.h>
#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
#include <linux/acpi.h>
+#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/msi.h>
#include <linux/htirq.h>
-#include <linux/dmar.h>
-#include <linux/jiffies.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif
#include <linux/bootmem.h>
#include <linux/dmar.h>
+#include <linux/hpet.h>
#include <asm/idle.h>
#include <asm/io.h>
@@ -46,27 +50,108 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/timer.h>
#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+#include <asm/setup.h>
#include <asm/irq_remapping.h>
+#include <asm/hpet.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_irq.h>
#include <mach_ipi.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#define __apicdebuginit(type) static type __init
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/* I/O APIC entries */
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+ struct irq_pin_list *pin;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+ printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+ return pin;
+}
+
struct irq_cfg {
+ struct irq_pin_list *irq_2_pin;
cpumask_t domain;
cpumask_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ u8 move_desc_pending : 1;
+#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
+#endif
[0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -85,63 +170,190 @@ static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
};
-static int assign_irq_vector(int irq, cpumask_t mask);
+void __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ int count;
+ int i;
-int first_system_vector = 0xfe;
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+ for (i = 0; i < count; i++) {
+ desc = irq_to_desc(i);
+ desc->chip_data = &cfg[i];
+ }
+}
-int sis_apic_bug; /* not actually supported, dummy for compile */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ struct irq_cfg *cfg = NULL;
+ struct irq_desc *desc;
-static int no_timer_check;
+ desc = irq_to_desc(irq);
+ if (desc)
+ cfg = desc->chip_data;
-static int disable_timer_pin_1 __initdata;
+ return cfg;
+}
-int timer_through_8259 __initdata;
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+{
+ struct irq_cfg *cfg;
+ int node;
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+ node = cpu_to_node(cpu);
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
+ return cfg;
+}
-/* I/O APIC RTE contents at the OS boot up */
-struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
-/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
+ cfg = desc->chip_data;
+ if (!cfg) {
+ desc->chip_data = get_one_free_irq_cfg(cpu);
+ if (!desc->chip_data) {
+ printk(KERN_ERR "can not alloc irq_cfg\n");
+ BUG_ON(1);
+ }
+ }
+}
-/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-/* # of MP IRQ source entries */
-int mp_irq_entries;
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+ struct irq_pin_list *old_entry, *head, *tail, *entry;
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+ cfg->irq_2_pin = NULL;
+ old_entry = old_cfg->irq_2_pin;
+ if (!old_entry)
+ return;
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry)
+ return;
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ head = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ while (old_entry) {
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ entry = head;
+ while (entry) {
+ head = entry->next;
+ kfree(entry);
+ entry = head;
+ }
+ /* still use the old one */
+ return;
+ }
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ tail->next = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ }
+
+ tail->next = NULL;
+ cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry, *next;
+
+ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+ return;
+
+ entry = old_cfg->irq_2_pin;
+
+ while (entry) {
+ next = entry->next;
+ kfree(entry);
+ entry = next;
+ }
+ old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+ struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
+ struct irq_cfg *old_cfg;
+
+ cfg = get_one_free_irq_cfg(cpu);
+
+ if (!cfg)
+ return;
+
+ desc->chip_data = cfg;
+
+ old_cfg = old_desc->chip_data;
+
+ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+ init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+ kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+ struct irq_cfg *old_cfg, *cfg;
+
+ old_cfg = old_desc->chip_data;
+ cfg = desc->chip_data;
+
+ if (old_cfg == cfg)
+ return;
+
+ if (old_cfg) {
+ free_irq_2_pin(old_cfg, cfg);
+ free_irq_cfg(old_cfg);
+ old_desc->chip_data = NULL;
+ }
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+ struct irq_cfg *cfg = desc->chip_data;
+
+ if (!cfg->move_in_progress) {
+ /* it means that domain is not changed */
+ if (!cpus_intersects(desc->affinity, mask))
+ cfg->move_desc_pending = 1;
+ }
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
-static struct irq_pin_list {
- short apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+#endif
struct io_apic {
unsigned int index;
@@ -172,27 +384,32 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
/*
* Re-write a value: to be used for read-modify-write
* cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
*/
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- entry = irq_2_pin + irq;
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
int pin;
- pin = entry->pin;
- if (pin == -1)
+ if (!entry)
break;
+ pin = entry->pin;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -201,45 +418,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
}
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
return false;
}
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- readl(&io_apic->data);
-}
-
-#define __DO_ACTION(R, ACTION, FINAL) \
- \
-{ \
- int pin; \
- struct irq_pin_list *entry = irq_2_pin + irq; \
- \
- BUG_ON(irq >= NR_IRQS); \
- for (;;) { \
- unsigned int reg; \
- pin = entry->pin; \
- if (pin == -1) \
- break; \
- reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
- reg ACTION; \
- io_apic_modify(entry->apic, reg); \
- FINAL; \
- if (!entry->next) \
- break; \
- entry = irq_2_pin + entry->next; \
- } \
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -296,83 +481,118 @@ static void ioapic_mask_entry(int apic, int pin)
}
#ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
- BUG_ON(irq >= NR_IRQS);
+ entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
+
+ if (!entry)
+ break;
+
apic = entry->apic;
pin = entry->pin;
- if (pin == -1)
- break;
+#ifdef CONFIG_INTR_REMAP
/*
* With interrupt-remapping, destination information comes
* from interrupt-remapping table entry.
*/
if (!irq_remapped(irq))
io_apic_write(apic, 0x11 + pin*2, dest);
+#else
+ io_apic_write(apic, 0x11 + pin*2, dest);
+#endif
reg = io_apic_read(apic, 0x10 + pin*2);
reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
- io_apic_modify(apic, reg);
+ io_apic_modify(apic, 0x10 + pin*2, reg);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
}
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
+
+static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
cpumask_t tmp;
+ unsigned int irq;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
- if (assign_irq_vector(irq, mask))
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
-
/*
* Only the high 8 bits are valid.
*/
dest = SET_APIC_LOGICAL_ID(dest);
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ __target_IO_APIC_irq(irq, dest, cfg);
+ desc->affinity = mask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#endif
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ set_ioapic_affinity_irq_desc(desc, mask);
+}
+#endif /* CONFIG_SMP */
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_pin_list *entry;
- BUG_ON(irq >= NR_IRQS);
- while (entry->next)
- entry = irq_2_pin + entry->next;
+ entry = cfg->irq_2_pin;
+ if (!entry) {
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+ apic, pin);
+ return;
+ }
+ cfg->irq_2_pin = entry;
+ entry->apic = apic;
+ entry->pin = pin;
+ return;
+ }
+
+ while (entry->next) {
+ /* not again, please */
+ if (entry->apic == apic && entry->pin == pin)
+ return;
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: ran out of irq_2_pin entries!");
+ entry = entry->next;
}
+
+ entry->next = get_one_free_irq_2_pin(cpu);
+ entry = entry->next;
entry->apic = apic;
entry->pin = pin;
}
@@ -380,53 +600,123 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_pin_list *entry = irq_2_pin + irq;
+ struct irq_pin_list *entry = cfg->irq_2_pin;
+ int replaced = 0;
- while (1) {
+ while (entry) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
- }
- if (!entry->next)
+ replaced = 1;
+ /* every one is different, right? */
break;
- entry = irq_2_pin + entry->next;
+ }
+ entry = entry->next;
}
+
+ /* why? call replace before add? */
+ if (!replaced)
+ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
}
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ int pin;
+ struct irq_pin_list *entry;
-#define DO_ACTION(name,R,ACTION, FINAL) \
- \
- static void name##_IO_APIC_irq (unsigned int irq) \
- __DO_ACTION(R, ACTION, FINAL)
+ for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+ unsigned int reg;
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
+ }
+}
-/* mask = 1 */
-DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
-/* mask = 0 */
-DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
+#ifdef CONFIG_X86_64
+void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
-static void mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+}
+#else /* CONFIG_X86_32 */
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+}
+#endif /* CONFIG_X86_32 */
+
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
+ BUG_ON(!cfg);
+
spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
+ __mask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unmask_IO_APIC_irq_desc(desc);
+}
+
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
@@ -450,6 +740,68 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
+#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+void send_IPI_self(int vector)
+{
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP && CONFIG_X86_32*/
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_INTR_REMAP
+/* I/O APIC RTE contents at the OS boot up */
+static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+
/*
* Saves and masks all the unmasked IO-APIC RTE's
*/
@@ -474,7 +826,7 @@ int save_mask_IO_APIC_setup(void)
kzalloc(sizeof(struct IO_APIC_route_entry) *
nr_ioapic_registers[apic], GFP_KERNEL);
if (!early_ioapic_entries[apic])
- return -ENOMEM;
+ goto nomem;
}
for (apic = 0; apic < nr_ioapics; apic++)
@@ -488,17 +840,31 @@ int save_mask_IO_APIC_setup(void)
ioapic_write_entry(apic, pin, entry);
}
}
+
return 0;
+
+nomem:
+ while (apic >= 0)
+ kfree(early_ioapic_entries[apic--]);
+ memset(early_ioapic_entries, 0,
+ ARRAY_SIZE(early_ioapic_entries));
+
+ return -ENOMEM;
}
void restore_IO_APIC_setup(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!early_ioapic_entries[apic])
+ break;
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
ioapic_write_entry(apic, pin,
early_ioapic_entries[apic][pin]);
+ kfree(early_ioapic_entries[apic]);
+ early_ioapic_entries[apic] = NULL;
+ }
}
void reinit_intr_remapped_IO_APIC(int intr_remapping)
@@ -512,25 +878,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping)
*/
restore_IO_APIC_setup();
}
-
-int skip_ioapic_setup;
-int ioapic_force;
-
-static int __init parse_noapic(char *str)
-{
- disable_ioapic_setup();
- return 0;
-}
-early_param("noapic", parse_noapic);
-
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
-{
- disable_timer_pin_1 = 1;
- return 1;
-}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
-
+#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -634,22 +982,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
best_guess = irq;
}
}
- BUG_ON(best_guess >= NR_IRQS);
return best_guess;
}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < NR_IRQS_LEGACY) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+#endif
+
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
#define default_ISA_trigger(idx) (0)
#define default_ISA_polarity(idx) (0)
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
+
static int MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mp_srcbus;
@@ -707,6 +1087,36 @@ static int MPBIOS_trigger(int idx)
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
break;
case 1: /* edge */
{
@@ -744,6 +1154,7 @@ static inline int irq_trigger(int idx)
return MPBIOS_trigger(idx);
}
+int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
@@ -765,8 +1176,32 @@ static int pin_2_irq(int idx, int apic, int pin)
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
}
- BUG_ON(irq >= NR_IRQS);
+#endif
+
return irq;
}
@@ -783,7 +1218,7 @@ void unlock_vector_lock(void)
spin_unlock(&vector_lock);
}
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -799,17 +1234,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
unsigned int old_vector;
int cpu;
- struct irq_cfg *cfg;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
+ if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+ return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
cpus_and(mask, mask, cpu_online_map);
- if ((cfg->move_in_progress) || cfg->move_cleanup_count)
- return -EBUSY;
-
old_vector = cfg->vector;
if (old_vector) {
cpumask_t tmp;
@@ -837,8 +1268,13 @@ next:
}
if (unlikely(current_vector == vector))
continue;
+#ifdef CONFIG_X86_64
if (vector == IA32_SYSCALL_VECTOR)
goto next;
+#else
+ if (vector == SYSCALL_VECTOR)
+ goto next;
+#endif
for_each_cpu_mask_nr(new_cpu, new_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
@@ -858,25 +1294,22 @@ next:
return -ENOSPC;
}
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
{
int err;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, mask);
+ err = __assign_irq_vector(irq, cfg, mask);
spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg;
cpumask_t mask;
int cpu, vector;
- BUG_ON((unsigned)irq >= NR_IRQS);
- cfg = &irq_cfg[irq];
BUG_ON(!cfg->vector);
vector = cfg->vector;
@@ -886,6 +1319,20 @@ static void __clear_irq_vector(int irq)
cfg->vector = 0;
cpus_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+ cpus_and(mask, cfg->old_domain, cpu_online_map);
+ for_each_cpu_mask_nr(cpu, mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
}
void __setup_vector_irq(int cpu)
@@ -893,12 +1340,17 @@ void __setup_vector_irq(int cpu)
/* Initialize vector_irq on a new cpu */
/* This function must be called with vector_lock held */
int irq, vector;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
/* Mark the inuse vectors */
- for (irq = 0; irq < NR_IRQS; ++irq) {
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+ cfg = desc->chip_data;
+ if (!cpu_isset(cpu, cfg->domain))
continue;
- vector = irq_cfg[irq].vector;
+ vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
}
/* Mark the free vectors */
@@ -906,7 +1358,9 @@ void __setup_vector_irq(int cpu)
irq = per_cpu(vector_irq, cpu)[vector];
if (irq < 0)
continue;
- if (!cpu_isset(cpu, irq_cfg[irq].domain))
+
+ cfg = irq_cfg(irq);
+ if (!cpu_isset(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -916,16 +1370,46 @@ static struct irq_chip ioapic_chip;
static struct irq_chip ir_ioapic_chip;
#endif
-static void ioapic_register_intr(int irq, unsigned long trigger)
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
{
- if (trigger)
- irq_desc[irq].status |= IRQ_LEVEL;
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ return 1;
+}
+#endif
+
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+{
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ desc->status |= IRQ_LEVEL;
else
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ desc->status &= ~IRQ_LEVEL;
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
- irq_desc[irq].status |= IRQ_MOVE_PCNTXT;
+ desc->status |= IRQ_MOVE_PCNTXT;
if (trigger)
set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
handle_fasteoi_irq,
@@ -936,7 +1420,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
return;
}
#endif
- if (trigger)
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq,
"fasteoi");
@@ -1006,18 +1491,20 @@ static int setup_ioapic_entry(int apic, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
cpumask_t mask;
if (!IO_APIC_IRQ(irq))
return;
+ cfg = desc->chip_data;
+
mask = TARGET_CPUS;
- if (assign_irq_vector(irq, mask))
+ if (assign_irq_vector(irq, cfg, mask))
return;
cpus_and(mask, cfg->domain, mask);
@@ -1034,12 +1521,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
cfg->vector)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic].mp_apicid, pin);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
return;
}
- ioapic_register_intr(irq, trigger);
- if (irq < 16)
+ ioapic_register_intr(irq, desc, trigger);
+ if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
ioapic_write_entry(apic, pin, entry);
@@ -1047,37 +1534,58 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
static void __init setup_IO_APIC_irqs(void)
{
- int apic, pin, idx, irq, first_notcon = 1;
+ int apic, pin, idx, irq;
+ int notcon = 0;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
- idx = find_irq_entry(apic,pin,mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
- continue;
- }
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic].mp_apicid,
+ pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
- irq = pin_2_irq(idx, apic, pin);
- add_pin_to_irq(irq, apic, pin);
+ irq = pin_2_irq(idx, apic, pin);
+#ifdef CONFIG_X86_32
+ if (multi_timer_check(apic, irq))
+ continue;
+#endif
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
+ }
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, apic, pin);
- setup_IO_APIC_irq(apic, pin, irq,
- irq_trigger(idx), irq_polarity(idx));
- }
+ setup_IO_APIC_irq(apic, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
+ }
}
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
+ if (notcon)
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
}
/*
@@ -1088,8 +1596,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
{
struct IO_APIC_route_entry entry;
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
return;
+#endif
memset(&entry, 0, sizeof(entry));
@@ -1124,7 +1634,11 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
unsigned long flags;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ unsigned int irq;
if (apic_verbosity == APIC_QUIET)
return;
@@ -1147,12 +1661,16 @@ __apicdebuginit(void) print_IO_APIC(void)
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1160,11 +1678,27 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
- if (reg_01.bits.version >= 0x10) {
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1193,16 +1727,21 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
+ for_each_irq_desc(irq, desc) {
+ struct irq_pin_list *entry;
+
+ if (!desc)
+ continue;
+ cfg = desc->chip_data;
+ entry = cfg->irq_2_pin;
+ if (!entry)
continue;
- printk(KERN_DEBUG "IRQ%d ", i);
+ printk(KERN_DEBUG "IRQ%d ", irq);
for (;;) {
printk("-> %d:%d", entry->apic, entry->pin);
if (!entry->next)
break;
- entry = irq_2_pin + entry->next;
+ entry = entry->next;
}
printk("\n");
}
@@ -1236,7 +1775,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
__apicdebuginit(void) print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
- unsigned long icr;
+ u64 icr;
if (apic_verbosity == APIC_QUIET)
return;
@@ -1253,20 +1792,31 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
v = apic_read(APIC_TASKPRI);
printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ }
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ }
v = apic_read(APIC_SPIV);
printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
@@ -1277,8 +1827,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC IRR field:\n");
print_APIC_bitfield(APIC_IRR);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
icr = apic_icr_read();
printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
@@ -1312,7 +1867,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
__apicdebuginit(void) print_all_local_APICs(void)
{
- on_each_cpu(print_local_APIC, NULL, 1);
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ preempt_enable();
}
__apicdebuginit(void) print_PIC(void)
@@ -1359,17 +1919,22 @@ __apicdebuginit(int) print_all_ICs(void)
fs_initcall(print_all_ICs);
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
void __init enable_IO_APIC(void)
{
union IO_APIC_reg_01 reg_01;
int i8259_apic, i8259_pin;
- int i, apic;
+ int apic;
unsigned long flags;
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
+#ifdef CONFIG_X86_32
+ int i;
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+#endif
/*
* The number of IO-APIC IRQ registers (== #pins):
@@ -1399,6 +1964,10 @@ void __init enable_IO_APIC(void)
}
found_i8259:
/* Look to see what if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic
+ * the i8259 probably is not connected the ioapic but give the
+ * mptable a chance anyway.
+ */
i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
@@ -1458,6 +2027,133 @@ void disable_IO_APIC(void)
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+ return;
+
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(phys_id_present_map,
+ mp_ioapics[apic].mp_apicid)) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ apic, mp_ioapics[apic].mp_apicid);
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ mp_ioapics[apic].mp_apicid = i;
+ } else {
+ physid_mask_t tmp;
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mp_ioapics[apic].mp_apicid);
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mp_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mp_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1471,6 +2167,9 @@ static int __init timer_irq_works(void)
unsigned long t1 = jiffies;
unsigned long flags;
+ if (no_timer_check)
+ return 1;
+
local_save_flags(flags);
local_irq_enable();
/* Let ten ticks pass... */
@@ -1518,22 +2217,26 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
{
int was_pending = 0;
unsigned long flags;
+ struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
}
- __unmask_IO_APIC_irq(irq);
+ cfg = irq_cfg(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
+#ifdef CONFIG_X86_64
static int ioapic_retrigger_irq(unsigned int irq)
{
- struct irq_cfg *cfg = &irq_cfg[irq];
+
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
@@ -1542,6 +2245,14 @@ static int ioapic_retrigger_irq(unsigned int irq)
return 1;
}
+#else
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+ send_IPI_self(irq_cfg(irq)->vector);
+
+ return 1;
+}
+#endif
/*
* Level and edge triggered IO-APIC interrupts need different handling,
@@ -1578,32 +2289,37 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
* as simple as edge triggered migration and we can do the irq migration
* with a simple atomic update to IO-APIC RTE.
*/
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_cfg *cfg;
cpumask_t tmp, cleanup_mask;
struct irte irte;
- int modify_ioapic_rte = desc->status & IRQ_LEVEL;
+ int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
+ unsigned int irq;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
+ irq = desc->irq;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
+ modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
+ __target_IO_APIC_irq(irq, dest, cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1622,18 +2338,19 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
cfg->move_in_progress = 0;
}
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
}
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
{
int ret = -1;
+ struct irq_cfg *cfg = desc->chip_data;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
- if (io_apic_level_ack_pending(irq)) {
+ if (io_apic_level_ack_pending(cfg)) {
/*
- * Interrupt in progress. Migrating irq now will change the
+ * Interrupt in progress. Migrating irq now will change the
* vector information in the IO-APIC RTE and that will confuse
* the EOI broadcast performed by cpu.
* So, delay the irq migration to the next instance.
@@ -1643,23 +2360,27 @@ static int migrate_irq_remapped_level(int irq)
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq(irq, irq_desc[irq].pending_mask);
+ migrate_ioapic_irq_desc(desc, desc->pending_mask);
ret = 0;
- irq_desc[irq].status &= ~IRQ_MOVE_PENDING;
- cpus_clear(irq_desc[irq].pending_mask);
+ desc->status &= ~IRQ_MOVE_PENDING;
+ cpus_clear(desc->pending_mask);
unmask:
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
+
return ret;
}
static void ir_irq_migration(struct work_struct *work)
{
- int irq;
+ unsigned int irq;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
- for (irq = 0; irq < NR_IRQS; irq++) {
- struct irq_desc *desc = irq_desc + irq;
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -1671,8 +2392,7 @@ static void ir_irq_migration(struct work_struct *work)
continue;
}
- desc->chip->set_affinity(irq,
- irq_desc[irq].pending_mask);
+ desc->chip->set_affinity(irq, desc->pending_mask);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -1681,22 +2401,29 @@ static void ir_irq_migration(struct work_struct *work)
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
{
- if (irq_desc[irq].status & IRQ_LEVEL) {
- irq_desc[irq].status |= IRQ_MOVE_PENDING;
- irq_desc[irq].pending_mask = mask;
- migrate_irq_remapped_level(irq);
+ if (desc->status & IRQ_LEVEL) {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = mask;
+ migrate_irq_remapped_level_desc(desc);
return;
}
- migrate_ioapic_irq(irq, mask);
+ migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#endif
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
+
ack_APIC_irq();
exit_idle();
irq_enter();
@@ -1707,11 +2434,15 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
- if (irq >= NR_IRQS)
+
+ if (irq == -1)
continue;
- desc = irq_desc + irq;
- cfg = irq_cfg + irq;
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ cfg = irq_cfg(irq);
spin_lock(&desc->lock);
if (!cfg->move_cleanup_count)
goto unlock;
@@ -1728,19 +2459,40 @@ unlock:
irq_exit();
}
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_desc *desc = *descp;
+ struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress))
+ if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ if (likely(!cfg->move_desc_pending))
+ return;
+
+ /* domain has not changed, but affinity did */
+ me = smp_processor_id();
+ if (cpu_isset(me, desc->affinity)) {
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+ cfg->move_desc_pending = 0;
+ }
+#endif
return;
+ }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
cpumask_t cleanup_mask;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+#endif
+
cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
cfg->move_cleanup_count = cpus_weight(cleanup_mask);
send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
@@ -1748,8 +2500,9 @@ static void irq_complete_move(unsigned int irq)
}
}
#else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
+
#ifdef CONFIG_INTR_REMAP
static void ack_x2apic_level(unsigned int irq)
{
@@ -1760,28 +2513,66 @@ static void ack_x2apic_edge(unsigned int irq)
{
ack_x2APIC_irq();
}
+
#endif
static void ack_apic_edge(unsigned int irq)
{
- irq_complete_move(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ irq_complete_move(&desc);
move_native_irq(irq);
ack_APIC_irq();
}
+atomic_t irq_mis_count;
+
static void ack_apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+#ifdef CONFIG_X86_32
+ unsigned long v;
+ int i;
+#endif
+ struct irq_cfg *cfg;
int do_unmask_irq = 0;
- irq_complete_move(irq);
+ irq_complete_move(&desc);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
}
#endif
+#ifdef CONFIG_X86_32
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ cfg = desc->chip_data;
+ i = cfg->vector;
+
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+#endif
+
/*
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
@@ -1816,35 +2607,46 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(irq))
+ cfg = desc->chip_data;
+ if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
+ }
+
+#ifdef CONFIG_X86_32
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(cfg);
+ __unmask_and_level_IO_APIC_irq(cfg);
+ spin_unlock(&ioapic_lock);
}
+#endif
}
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
+ .name = "IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_apic_edge,
+ .eoi = ack_apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
+ .set_affinity = set_ioapic_affinity_irq,
#endif
.retrigger = ioapic_retrigger_irq,
};
#ifdef CONFIG_INTR_REMAP
static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_x2apic_edge,
- .eoi = ack_x2apic_level,
+ .name = "IR-IO-APIC",
+ .startup = startup_ioapic_irq,
+ .mask = mask_IO_APIC_irq,
+ .unmask = unmask_IO_APIC_irq,
+ .ack = ack_x2apic_edge,
+ .eoi = ack_x2apic_level,
#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
+ .set_affinity = set_ir_ioapic_affinity_irq,
#endif
.retrigger = ioapic_retrigger_irq,
};
@@ -1853,6 +2655,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = {
static inline void init_IO_APIC_traps(void)
{
int irq;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
/*
* NOTE! The local APIC isn't very good at handling
@@ -1865,39 +2669,47 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+
+ cfg = desc->chip_data;
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < 16)
+ if (irq < NR_IRQS_LEGACY)
make_8259A_irq(irq);
else
/* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
+ desc->chip = &no_irq_chip;
}
}
}
-static void unmask_lapic_irq(unsigned int irq)
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void mask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
@@ -1909,9 +2721,9 @@ static struct irq_chip lapic_chip __read_mostly = {
.ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
{
- irq_desc[irq].status &= ~IRQ_LEVEL;
+ desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
}
@@ -1919,19 +2731,19 @@ static void lapic_register_intr(int irq)
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
- printk(KERN_INFO "activating NMI Watchdog ...");
+ */
+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
- printk(" done.\n");
+ apic_printk(APIC_VERBOSE, " done.\n");
}
/*
@@ -1948,12 +2760,17 @@ static inline void __init unlock_ExtINT_logic(void)
unsigned char save_control, save_freq_select;
pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
apic = find_isa_irq_apic(8, mp_INT);
- if (pin == -1)
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
return;
+ }
entry0 = ioapic_read_entry(apic, pin);
-
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -1988,35 +2805,60 @@ static inline void __init unlock_ExtINT_logic(void)
ioapic_write_entry(apic, pin, entry0);
}
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+int timer_through_8259 __initdata;
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
*
- * FIXME: really need to revamp this for modern platforms only.
+ * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg + 0;
+ struct irq_desc *desc = irq_to_desc(0);
+ struct irq_cfg *cfg = desc->chip_data;
+ int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ unsigned int ver;
int no_pin1 = 0;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, TARGET_CPUS);
+ assign_irq_vector(0, cfg, TARGET_CPUS);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC.
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
+#ifdef CONFIG_X86_32
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2035,8 +2877,10 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
+#endif
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2050,11 +2894,11 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
+ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
- unmask_IO_APIC_irq(0);
- if (!no_timer_check && timer_irq_works()) {
+ unmask_IO_APIC_irq_desc(desc);
+ if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
enable_8259A_irq(0);
@@ -2063,8 +2907,10 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
+#ifdef CONFIG_INTR_REMAP
if (intr_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
+#endif
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2077,9 +2923,9 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq(0);
+ unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2104,11 +2950,14 @@ static inline void __init check_timer(void)
"through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
+#ifdef CONFIG_X86_32
+ timer_ack = 0;
+#endif
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0);
+ lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2140,13 +2989,6 @@ out:
local_irq_restore(flags);
}
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
/*
* Traditionally ISA IRQ2 is the cascade IRQ, and is not available
* to devices. However there may be an I/O APIC pin available for
@@ -2164,25 +3006,49 @@ __setup("no_timer_check", notimercheck);
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
-#define PIC_IRQS (1<<2)
+#define PIC_IRQS (1 << PIC_CASCADE_IR)
void __init setup_IO_APIC(void)
{
+#ifdef CONFIG_X86_32
+ enable_IO_APIC();
+#else
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
+#endif
io_apic_irqs = ~PIC_IRQS;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+#ifdef CONFIG_X86_32
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+#endif
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
}
+/*
+ * Called after all the initialization is done. If we didnt find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if (sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
@@ -2270,43 +3136,80 @@ device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
-int create_irq(void)
+unsigned int create_irq_nr(unsigned int irq_want)
{
/* Allocate an unused irq */
- int irq;
- int new;
+ unsigned int irq;
+ unsigned int new;
unsigned long flags;
+ struct irq_cfg *cfg_new = NULL;
+ int cpu = boot_cpu_id;
+ struct irq_desc *desc_new = NULL;
- irq = -ENOSPC;
+ irq = 0;
spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
+ for (new = irq_want; new < NR_IRQS; new++) {
if (platform_legacy_irq(new))
continue;
- if (irq_cfg[new].vector != 0)
+
+ desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ if (!desc_new) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", new);
+ continue;
+ }
+ cfg_new = desc_new->chip_data;
+
+ if (cfg_new->vector != 0)
continue;
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
irq = new;
break;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq >= 0) {
+ if (irq > 0) {
dynamic_irq_init(irq);
+ /* restore it, in case dynamic_irq_init clear it */
+ if (desc_new)
+ desc_new->chip_data = cfg_new;
}
return irq;
}
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+int create_irq(void)
+{
+ unsigned int irq_want;
+ int irq;
+
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
+
+ if (irq == 0)
+ irq = -1;
+
+ return irq;
+}
+
void destroy_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ /* store it, in case dynamic_irq_cleanup clear it */
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
+ /* connect back irq_cfg */
+ if (desc)
+ desc->chip_data = cfg;
#ifdef CONFIG_INTR_REMAP
free_irte(irq);
#endif
spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
}
@@ -2316,13 +3219,14 @@ void destroy_irq(unsigned int irq)
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
unsigned dest;
cpumask_t tmp;
+ cfg = irq_cfg(irq);
tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ err = assign_irq_vector(irq, cfg, tmp);
if (err)
return err;
@@ -2383,7 +3287,8 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
#ifdef CONFIG_SMP
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
@@ -2392,23 +3297,25 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if (cpus_empty(tmp))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
- read_msi_msg(irq, &msg);
+ read_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
+ write_msi_msg_desc(desc, &msg);
+ desc->affinity = mask;
}
-
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
@@ -2416,7 +3323,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
*/
static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
unsigned int dest;
cpumask_t tmp, cleanup_mask;
struct irte irte;
@@ -2428,9 +3336,12 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2454,8 +3365,9 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
cfg->move_in_progress = 0;
}
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
}
+
#endif
#endif /* CONFIG_SMP */
@@ -2507,14 +3419,14 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
if (index < 0) {
printk(KERN_ERR
"Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
+ pci_name(dev));
return -ENOSPC;
}
return index;
}
#endif
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
int ret;
struct msi_msg msg;
@@ -2523,12 +3435,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
if (ret < 0)
return ret;
- set_irq_msi(irq, desc);
+ set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
/*
* irq migration in process context
*/
@@ -2538,16 +3450,21 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
#endif
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
return 0;
}
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
{
- int irq, ret;
+ unsigned int irq;
+ int ret;
+ unsigned int irq_want;
- irq = create_irq();
- if (irq < 0)
- return irq;
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
+ if (irq == 0)
+ return -1;
#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
@@ -2558,7 +3475,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
goto error;
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0) {
destroy_irq(irq);
return ret;
@@ -2574,18 +3491,23 @@ error:
int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int irq, ret, sub_handle;
- struct msi_desc *desc;
+ unsigned int irq;
+ int ret, sub_handle;
+ struct msi_desc *msidesc;
+ unsigned int irq_want;
+
#ifdef CONFIG_INTR_REMAP
struct intel_iommu *iommu = 0;
int index = 0;
#endif
+ irq_want = nr_irqs_gsi;
sub_handle = 0;
- list_for_each_entry(desc, &dev->msi_list, list) {
- irq = create_irq();
- if (irq < 0)
- return irq;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want);
+ irq_want++;
+ if (irq == 0)
+ return -1;
#ifdef CONFIG_INTR_REMAP
if (!intr_remapping_enabled)
goto no_ir;
@@ -2615,7 +3537,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -2636,7 +3558,8 @@ void arch_teardown_msi_irq(unsigned int irq)
#ifdef CONFIG_SMP
static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
@@ -2645,9 +3568,12 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
if (cpus_empty(tmp))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
@@ -2659,8 +3585,9 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip dmar_msi_type = {
@@ -2689,6 +3616,71 @@ int arch_setup_dmar_msi(unsigned int irq)
}
#endif
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
+ struct msi_msg msg;
+ unsigned int dest;
+ cpumask_t tmp;
+
+ cpus_and(tmp, mask, cpu_online_map);
+ if (cpus_empty(tmp))
+ return;
+
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return;
+
+ set_extra_move_desc(desc, mask);
+
+ cpus_and(tmp, cfg->domain, mask);
+ dest = cpu_mask_to_apicid(tmp);
+
+ hpet_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(irq, &msg);
+ desc->affinity = mask;
+}
+
+#endif /* CONFIG_SMP */
+
+struct irq_chip hpet_msi_type = {
+ .name = "HPET_MSI",
+ .unmask = hpet_msi_unmask,
+ .mask = hpet_msi_mask,
+ .ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .set_affinity = hpet_msi_set_affinity,
+#endif
+ .retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_hpet_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg);
+ if (ret < 0)
+ return ret;
+
+ hpet_msi_write(irq, &msg);
+ set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
+ "edge");
+
+ return 0;
+}
+#endif
+
#endif /* CONFIG_PCI_MSI */
/*
* Hypertransport interrupt support
@@ -2713,7 +3705,8 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg;
unsigned int dest;
cpumask_t tmp;
@@ -2721,15 +3714,19 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
if (cpus_empty(tmp))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
+ set_extra_move_desc(desc, mask);
+
cpus_and(tmp, cfg->domain, mask);
dest = cpu_mask_to_apicid(tmp);
target_ht_irq(irq, dest, cfg->vector);
- irq_desc[irq].affinity = mask;
+ desc->affinity = mask;
}
+
#endif
static struct irq_chip ht_irq_chip = {
@@ -2745,12 +3742,13 @@ static struct irq_chip ht_irq_chip = {
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
- struct irq_cfg *cfg = irq_cfg + irq;
+ struct irq_cfg *cfg;
int err;
cpumask_t tmp;
+ cfg = irq_cfg(irq);
tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ err = assign_irq_vector(irq, cfg, tmp);
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
@@ -2777,20 +3775,185 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
set_irq_chip_and_handler_name(irq, &ht_irq_chip,
handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
}
return err;
}
#endif /* CONFIG_HT_IRQ */
+#ifdef CONFIG_X86_64
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ struct irq_cfg *cfg;
+ int mmr_pnode;
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ unsigned long flags;
+ int err;
+
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, *eligible_cpu);
+ if (err != 0)
+ return err;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+ irq_name);
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->vector = cfg->vector;
+ entry->delivery_mode = INT_DELIVERY_MODE;
+ entry->dest_mode = INT_DEST_MODE;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+ return irq;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * longer allowed to be sent.
+ */
+void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+ int mmr_pnode;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+ entry->mask = 1;
+
+ mmr_pnode = uv_blade_to_pnode(mmr_blade);
+ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+#endif /* CONFIG_X86_64 */
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+void __init probe_nr_irqs_gsi(void)
+{
+ int idx;
+ int nr = 0;
+
+ for (idx = 0; idx < nr_ioapics; idx++)
+ nr += io_apic_get_redir_entries(idx) + 1;
+
+ if (nr > nr_irqs_gsi)
+ nr_irqs_gsi = nr;
+}
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-#define IO_APIC_MAX_ID 0xFE
+#ifdef CONFIG_X86_32
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
-int __init io_apic_get_redir_entries (int ioapic)
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (check_apicid_used(apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!check_apicid_used(apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ tmp = apicid_to_cpu_present(apic_id);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2799,25 +3962,37 @@ int __init io_apic_get_redir_entries (int ioapic)
reg_01.raw = io_apic_read(ioapic, 1);
spin_unlock_irqrestore(&ioapic_lock, flags);
- return reg_01.bits.entries;
+ return reg_01.bits.version;
}
-
+#endif
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
+
if (!IO_APIC_IRQ(irq)) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
return -EINVAL;
}
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+ }
- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
return 0;
}
@@ -2853,6 +4028,9 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ cpumask_t mask;
if (skip_ioapic_setup == 1)
return;
@@ -2868,16 +4046,31 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- if (!irq_cfg[irq].vector)
- setup_IO_APIC_irq(ioapic, pin, irq,
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
+ if (!cfg->vector) {
+ setup_IO_APIC_irq(ioapic, pin, irq, desc,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
+ continue;
+
+ }
+
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = desc->affinity;
+ else
+ mask = TARGET_CPUS;
+
#ifdef CONFIG_INTR_REMAP
- else if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
-#endif
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
else
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
+ set_ioapic_affinity_irq_desc(desc, mask);
}
}
@@ -2930,14 +4123,28 @@ void __init ioapic_init_mappings(void)
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mp_apicaddr;
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
} else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
apic_printk(APIC_VERBOSE,
- "mapped IOAPIC to %016lx (%016lx)\n",
+ "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
@@ -2971,4 +4178,3 @@ static int __init ioapic_insert_resources(void)
/* Insert the IO APIC resources after PCI initialization has occured to handle
* IO APICS that are mapped in on a BAR in PCI space. */
late_initcall(ioapic_insert_resources);
-
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index e710289f673e..000000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2908 +0,0 @@
-/*
- * Intel IO-APIC support for multi-Pentium hosts.
- *
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- * Many thanks to Stig Venaas for trying out countless experimental
- * patches and reporting/debugging problems patiently!
- *
- * (c) 1999, Multiple IO-APIC support, developed by
- * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- * further tested and cleaned up by Zach Brown <zab@redhat.com>
- * and Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
- * thanks to Eric Gilmore
- * and Rolf G. Tews
- * for testing these extensively
- * Paul Diefenbaugh : Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/mc146818rtc.h>
-#include <linux/compiler.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/pci.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/jiffies.h> /* time_after() */
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-#include <asm/setup.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-
-#define __apicdebuginit(type) static type __init
-
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-atomic_t irq_mis_count;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
-
-int timer_through_8259 __initdata;
-
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
-
-/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* # of MP IRQ source entries */
-int mp_irq_entries;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-
-static int disable_timer_pin_1 __initdata;
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
- int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
- unsigned int index;
- unsigned int unused[3];
- unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
- return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
-}
-
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
- union entry_union eu;
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- union entry_union eu;
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
- unsigned long flags;
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(apic, pin, e);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
- unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
- static int first_free_entry = NR_IRQS;
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (entry->next)
- entry = irq_2_pin + entry->next;
-
- if (entry->pin != -1) {
- entry->next = first_free_entry;
- entry = irq_2_pin + entry->next;
- if (++first_free_entry >= PIN_MAP_SIZE)
- panic("io_apic.c: whoops");
- }
- entry->apic = apic;
- entry->pin = pin;
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
-
- while (1) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- }
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
-{
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int pin, reg;
-
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- reg &= ~disable;
- reg |= enable;
- io_apic_modify(entry->apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
-}
-
-/* mask = 1 */
-static void __mask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
-}
-
-/* mask = 0 */
-static void __unmask_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
-}
-
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
-{
- __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED);
-}
-
-static void mask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
- struct IO_APIC_route_entry entry;
-
- /* Check delivery_mode to be sure we're not clearing an SMI pin */
- entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
- return;
-
- /*
- * Disable it in the IO-APIC irq-routing table:
- */
- ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC(void)
-{
- int apic, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- clear_IO_APIC_pin(apic, pin);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
-{
- unsigned long flags;
- int pin;
- struct irq_pin_list *entry = irq_2_pin + irq;
- unsigned int apicid_value;
- cpumask_t tmp;
-
- cpus_and(tmp, cpumask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(cpumask, tmp, CPU_MASK_ALL);
-
- apicid_value = cpu_mask_to_apicid(cpumask);
- /* Prepare to do the io_apic_write */
- apicid_value = apicid_value << 24;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (;;) {
- pin = entry->pin;
- if (pin == -1)
- break;
- io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- irq_desc[irq].affinity = cpumask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h> /* kernel_thread() */
-# include <linux/kernel_stat.h> /* kstat */
-# include <linux/slab.h> /* kmalloc() */
-# include <linux/timer.h>
-
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
-#define BALANCED_IRQ_MORE_DELTA (HZ/10)
-#define BALANCED_IRQ_LESS_DELTA (HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
- unsigned long *last_irq;
- unsigned long *irq_delta;
- unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
- (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
-
-static cpumask_t balance_irq_affinity[NR_IRQS] = {
- [0 ... NR_IRQS-1] = CPU_MASK_ALL
-};
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
- unsigned long now, int direction)
-{
- int search_idle = 1;
- int cpu = curr_cpu;
-
- goto inside;
-
- do {
- if (unlikely(cpu == curr_cpu))
- search_idle = 0;
-inside:
- if (direction == 1) {
- cpu++;
- if (cpu >= NR_CPUS)
- cpu = 0;
- } else {
- cpu--;
- if (cpu == -1)
- cpu = NR_CPUS-1;
- }
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu, now)));
-
- return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
- unsigned long now = jiffies;
- cpumask_t allowed_mask;
- unsigned int new_cpu;
-
- if (irqbalance_disabled)
- return;
-
- cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
- new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu)
- set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
- int i, j;
-
- for_each_online_cpu(i) {
- for (j = 0; j < NR_IRQS; j++) {
- if (!irq_desc[j].action)
- continue;
- /* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
- useful_load_threshold)
- continue;
- balance_irq(i, j);
- }
- }
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
-}
-
-static void do_irq_balance(void)
-{
- int i, j;
- unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
- unsigned long move_this_load = 0;
- int max_loaded = 0, min_loaded = 0;
- int load;
- unsigned long useful_load_threshold = balanced_irq_interval + 10;
- int selected_irq;
- int tmp_loaded, first_attempt = 1;
- unsigned long tmp_cpu_irq;
- unsigned long imbalance = 0;
- cpumask_t allowed_mask, target_cpu_mask, tmp;
-
- for_each_possible_cpu(i) {
- int package_index;
- CPU_IRQ(i) = 0;
- if (!cpu_online(i))
- continue;
- package_index = CPU_TO_PACKAGEINDEX(i);
- for (j = 0; j < NR_IRQS; j++) {
- unsigned long value_now, delta;
- /* Is this an active IRQ or balancing disabled ? */
- if (!irq_desc[j].action || irq_balancing_disabled(j))
- continue;
- if (package_index == i)
- IRQ_DELTA(package_index, j) = 0;
- /* Determine the total count per processor per IRQ */
- value_now = (unsigned long) kstat_cpu(i).irqs[j];
-
- /* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i, j);
-
- /* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i, j) = value_now;
-
- /* Ignore IRQs whose rate is less than the clock */
- if (delta < useful_load_threshold)
- continue;
- /* update the load for the processor or package total */
- IRQ_DELTA(package_index, j) += delta;
-
- /* Keep track of the higher numbered sibling as well */
- if (i != package_index)
- CPU_IRQ(i) += delta;
- /*
- * We have sibling A and sibling B in the package
- *
- * cpu_irq[A] = load for cpu A + load for cpu B
- * cpu_irq[B] = load for cpu B
- */
- CPU_IRQ(package_index) += delta;
- }
- }
- /* Find the least loaded processor package */
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (min_cpu_irq > CPU_IRQ(i)) {
- min_cpu_irq = CPU_IRQ(i);
- min_loaded = i;
- }
- }
- max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
- /*
- * Look for heaviest loaded processor.
- * We may come back to get the next heaviest loaded processor.
- * Skip processors with trivial loads.
- */
- tmp_cpu_irq = 0;
- tmp_loaded = -1;
- for_each_online_cpu(i) {
- if (i != CPU_TO_PACKAGEINDEX(i))
- continue;
- if (max_cpu_irq <= CPU_IRQ(i))
- continue;
- if (tmp_cpu_irq < CPU_IRQ(i)) {
- tmp_cpu_irq = CPU_IRQ(i);
- tmp_loaded = i;
- }
- }
-
- if (tmp_loaded == -1) {
- /*
- * In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
- * approach to rotate them around.
- */
- if (!first_attempt && imbalance >= useful_load_threshold) {
- rotate_irqs_among_cpus(useful_load_threshold);
- return;
- }
- goto not_worth_the_effort;
- }
-
- first_attempt = 0; /* heaviest search */
- max_cpu_irq = tmp_cpu_irq; /* load */
- max_loaded = tmp_loaded; /* processor */
- imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /*
- * if imbalance is less than approx 10% of max load, then
- * observe diminishing returns action. - quit
- */
- if (imbalance < (max_cpu_irq >> 3))
- goto not_worth_the_effort;
-
-tryanotherirq:
- /* if we select an IRQ to move that can't go where we want, then
- * see if there is another one to try.
- */
- move_this_load = 0;
- selected_irq = -1;
- for (j = 0; j < NR_IRQS; j++) {
- /* Is this an active IRQ? */
- if (!irq_desc[j].action)
- continue;
- if (imbalance <= IRQ_DELTA(max_loaded, j))
- continue;
- /* Try to find the IRQ that is closest to the imbalance
- * without going over.
- */
- if (move_this_load < IRQ_DELTA(max_loaded, j)) {
- move_this_load = IRQ_DELTA(max_loaded, j);
- selected_irq = j;
- }
- }
- if (selected_irq == -1)
- goto tryanothercpu;
-
- imbalance = move_this_load;
-
- /* For physical_balance case, we accumulated both load
- * values in the one of the siblings cpu_irq[],
- * to use the same code for physical and logical processors
- * as much as possible.
- *
- * NOTE: the cpu_irq[] array holds the sum of the load for
- * sibling A and sibling B in the slot for the lowest numbered
- * sibling (A), _AND_ the load for sibling B in the slot for
- * the higher numbered sibling.
- *
- * We seek the least loaded sibling by making the comparison
- * (A+B)/2 vs B
- */
- load = CPU_IRQ(min_loaded) >> 1;
- for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
- if (load > CPU_IRQ(j)) {
- /* This won't change cpu_sibling_map[min_loaded] */
- load = CPU_IRQ(j);
- min_loaded = j;
- }
- }
-
- cpus_and(allowed_mask,
- cpu_online_map,
- balance_irq_affinity[selected_irq]);
- target_cpu_mask = cpumask_of_cpu(min_loaded);
- cpus_and(tmp, target_cpu_mask, allowed_mask);
-
- if (!cpus_empty(tmp)) {
- /* mark for change destination */
- set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
- /* Since we made a change, come back sooner to
- * check for more variation.
- */
- balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
- return;
- }
- goto tryanotherirq;
-
-not_worth_the_effort:
- /*
- * if we did not find an IRQ to move, then adjust the time interval
- * upward
- */
- balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
- return;
-}
-
-static int balanced_irq(void *unused)
-{
- int i;
- unsigned long prev_balance_time = jiffies;
- long time_remaining = balanced_irq_interval;
-
- /* push everything to CPU 0 to give us a starting point. */
- for (i = 0 ; i < NR_IRQS ; i++) {
- irq_desc[i].pending_mask = cpumask_of_cpu(0);
- set_pending_irq(i, cpumask_of_cpu(0));
- }
-
- set_freezable();
- for ( ; ; ) {
- time_remaining = schedule_timeout_interruptible(time_remaining);
- try_to_freeze();
- if (time_after(jiffies,
- prev_balance_time+balanced_irq_interval)) {
- preempt_disable();
- do_irq_balance();
- prev_balance_time = jiffies;
- time_remaining = balanced_irq_interval;
- preempt_enable();
- }
- }
- return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
- int i;
- struct cpuinfo_x86 *c;
- cpumask_t tmp;
-
- cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
- /* When not overwritten by the command line ask subarchitecture. */
- if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
- irqbalance_disabled = NO_BALANCE_IRQ;
- if (irqbalance_disabled)
- return 0;
-
- /* disable irqbalance completely if there is only one processor online */
- if (num_online_cpus() < 2) {
- irqbalance_disabled = 1;
- return 0;
- }
- /*
- * Enable physical balance only if more than 1 physical processor
- * is present
- */
- if (smp_num_siblings > 1 && !cpus_empty(tmp))
- physical_balance = 1;
-
- for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
- printk(KERN_ERR "balanced_irq_init: out of memory");
- goto failed;
- }
- }
-
- printk(KERN_INFO "Starting balanced_irq\n");
- if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
- return 0;
- printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
- for_each_possible_cpu(i) {
- kfree(irq_cpu_data[i].irq_delta);
- irq_cpu_data[i].irq_delta = NULL;
- kfree(irq_cpu_data[i].last_irq);
- irq_cpu_data[i].last_irq = NULL;
- }
- return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
- irqbalance_disabled = 1;
- return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
-#endif /* CONFIG_SMP */
-
-#ifndef CONFIG_SMP
-void send_IPI_self(int vector)
-{
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP */
-
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
-int skip_ioapic_setup;
-
-static int __init ioapic_pirq_setup(char *str)
-{
- int i, max;
- int ints[MAX_PIRQS+1];
-
- get_options(str, ARRAY_SIZE(ints), ints);
-
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
- max = MAX_PIRQS;
- if (ints[0] < MAX_PIRQS)
- max = ints[0];
-
- for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
- pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
- }
- return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == type &&
- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mp_dstirq == pin)
- return i;
-
- return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
-
- return mp_irqs[i].mp_dstirq;
- }
- return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mp_irqtype == type) &&
- (mp_irqs[i].mp_srcbusirq == irq))
- break;
- }
- if (i < mp_irq_entries) {
- int apic;
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
- return apic;
- }
- }
-
- return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
- "slot:%d, pin:%d.\n", bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mp_srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
- mp_irqs[i].mp_dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mp_irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].mp_srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
- int pin, ioapic, irq, irq_entry;
-
- if (skip_ioapic_setup == 1)
- return;
-
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
- }
-
- }
-}
-#endif
-
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < 16) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int polarity;
-
- /*
- * Determine IRQ line polarity (high active or low active):
- */
- switch (mp_irqs[idx].mp_irqflag & 3) {
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
- return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
- int bus = mp_irqs[idx].mp_srcbus;
- int trigger;
-
- /*
- * Determine IRQ trigger mode (edge or level sensitive):
- */
- switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- }
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
- break;
- }
- }
- return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
- return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
- return MPBIOS_trigger(idx);
-}
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
- int irq, i;
- int bus = mp_irqs[idx].mp_srcbus;
-
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
- if (mp_irqs[idx].mp_dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mp_srcbusirq;
- else {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
-
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
- }
-
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
- }
- return irq;
-}
-
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-
-/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
-
-static int __assign_irq_vector(int irq)
-{
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
- int vector, offset;
-
- BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
-
- if (irq_vector[irq] > 0)
- return irq_vector[irq];
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 8;
- if (vector >= first_system_vector) {
- offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
- }
- if (vector == current_vector)
- return -ENOSPC;
- if (test_and_set_bit(vector, used_vectors))
- goto next;
-
- current_vector = vector;
- current_offset = offset;
- irq_vector[irq] = vector;
-
- return vector;
-}
-
-static int assign_irq_vector(int irq)
-{
- unsigned long flags;
- int vector;
-
- spin_lock_irqsave(&vector_lock, flags);
- vector = __assign_irq_vector(irq);
- spin_unlock_irqrestore(&vector_lock, flags);
-
- return vector;
-}
-
-static struct irq_chip ioapic_chip;
-
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
-{
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL) {
- irq_desc[irq].status |= IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq, "fasteoi");
- } else {
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_edge_irq, "edge");
- }
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
- struct IO_APIC_route_entry entry;
- int apic, pin, idx, irq, first_notcon = 1, vector;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
- cpu_mask_to_apicid(TARGET_CPUS);
-
- idx = find_irq_entry(apic, pin, mp_INT);
- if (idx == -1) {
- if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- " IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mp_apicid,
- pin);
- first_notcon = 0;
- } else
- apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mp_apicid, pin);
- continue;
- }
-
- if (!first_notcon) {
- apic_printk(APIC_VERBOSE, " not connected.\n");
- first_notcon = 1;
- }
-
- entry.trigger = irq_trigger(idx);
- entry.polarity = irq_polarity(idx);
-
- if (irq_trigger(idx)) {
- entry.trigger = 1;
- entry.mask = 1;
- }
-
- irq = pin_2_irq(idx, apic, pin);
- /*
- * skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum
- */
- if (multi_timer_check(apic, irq))
- continue;
- else
- add_pin_to_irq(irq, apic, pin);
-
- if (!apic && !IO_APIC_IRQ(irq))
- continue;
-
- if (IO_APIC_IRQ(irq)) {
- vector = assign_irq_vector(irq);
- entry.vector = vector;
- ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
- if (!apic && (irq < 16))
- disable_8259A_irq(irq);
- }
- ioapic_write_entry(apic, pin, entry);
- }
- }
-
- if (!first_notcon)
- apic_printk(APIC_VERBOSE, " not connected.\n");
-}
-
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
- int vector)
-{
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- entry.dest_mode = INT_DEST_MODE;
- entry.mask = 1; /* mask IRQ now */
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- ioapic_register_intr(0, vector, IOAPIC_EDGE);
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(apic, pin, entry);
-}
-
-
-__apicdebuginit(void) print_IO_APIC(void)
-{
- int apic, i;
- union IO_APIC_reg_00 reg_00;
- union IO_APIC_reg_01 reg_01;
- union IO_APIC_reg_02 reg_02;
- union IO_APIC_reg_03 reg_03;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (i = 0; i < nr_ioapics; i++)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
-
- /*
- * We are a bit conservative about what we expect. We have to
- * know about every hardware change ASAP.
- */
- printk(KERN_INFO "testing the IO APIC.......................\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- reg_01.raw = io_apic_read(apic, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(apic, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(apic, 3);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
-
- /*
- * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
- * but the value of reg_02 is read as the previous read register
- * value, so ignore it if reg_02 == reg_01.
- */
- if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
- }
-
- /*
- * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
- * or reg_03, but the value of reg_0[23] is read as the previous read
- * register value, so ignore it if reg_03 == reg_0[12].
- */
- if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
- reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
- }
-
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
- " Stat Dest Deli Vect: \n");
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X %02X ",
- i,
- entry.dest.logical.logical_dest,
- entry.dest.physical.physical_dest
- );
-
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for (i = 0; i < NR_IRQS; i++) {
- struct irq_pin_list *entry = irq_2_pin + i;
- if (entry->pin < 0)
- continue;
- printk(KERN_DEBUG "IRQ%d ", i);
- for (;;) {
- printk("-> %d:%d", entry->apic, entry->pin);
- if (!entry->next)
- break;
- entry = irq_2_pin + entry->next;
- }
- printk("\n");
- }
-
- printk(KERN_INFO ".................................... done.\n");
-
- return;
-}
-
-__apicdebuginit(void) print_APIC_bitfield(int base)
-{
- unsigned int v;
- int i, j;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
- for (i = 0; i < 8; i++) {
- v = apic_read(base + i*0x10);
- for (j = 0; j < 32; j++) {
- if (v & (1<<j))
- printk("1");
- else
- printk("0");
- }
- printk("\n");
- }
-}
-
-__apicdebuginit(void) print_local_APIC(void *dummy)
-{
- unsigned int v, ver, maxlvt;
- u64 icr;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
- GET_APIC_ID(v));
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
- }
-
- v = apic_read(APIC_EOI);
- printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_bitfield(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_bitfield(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_bitfield(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
-
- icr = apic_icr_read();
- printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
-
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
-
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
- printk("\n");
-}
-
-__apicdebuginit(void) print_all_local_APICs(void)
-{
- on_each_cpu(print_local_APIC, NULL, 1);
-}
-
-__apicdebuginit(void) print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (apic_verbosity == APIC_QUIET)
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b, 0xa0);
- outb(0x0b, 0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a, 0xa0);
- outb(0x0a, 0x20);
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
-
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-__apicdebuginit(int) print_all_ICs(void)
-{
- print_PIC();
- print_all_local_APICs();
- print_IO_APIC();
-
- return 0;
-}
-
-fs_initcall(print_all_ICs);
-
-
-static void __init enable_IO_APIC(void)
-{
- union IO_APIC_reg_01 reg_01;
- int i8259_apic, i8259_pin;
- int i, apic;
- unsigned long flags;
-
- for (i = 0; i < PIN_MAP_SIZE; i++) {
- irq_2_pin[i].pin = -1;
- irq_2_pin[i].next = 0;
- }
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- }
- for (apic = 0; apic < nr_ioapics; apic++) {
- int pin;
- /* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
-
-
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
- }
- }
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
- * the i8259 probably is not connected the ioapic but give the
- * mptable a chance anyway.
- */
- i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
- i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
- /* Trust the MP table if nothing is setup in the hardware */
- if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
- ioapic_i8259.pin = i8259_pin;
- ioapic_i8259.apic = i8259_apic;
- }
- /* Complain if the MP table and the hardware disagree */
- if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
-
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
- clear_IO_APIC();
-}
-
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
-{
- /*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- /*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
- */
- if (ioapic_i8259.pin != -1) {
- struct IO_APIC_route_entry entry;
-
- memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest.physical.physical_dest = read_apic_id();
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
- }
- disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-}
-
-/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc(void)
-{
- union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int apic;
- int i;
- unsigned char old_id;
- unsigned long flags;
-
- if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
- return;
-
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
- /*
- * This is broken; anything with a real cpu count has to
- * circumvent this idiocy regardless.
- */
- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- /*
- * Set the IOAPIC ID to the value stored in the MPC table.
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- /* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mp_apicid;
-
- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
- mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
- }
-
- /*
- * Sanity check, is the ID really free? Every APIC in a
- * system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mp_apicid)) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mp_apicid);
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
- break;
- if (i >= get_physical_broadcast())
- panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mp_apicid = i;
- } else {
- physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mp_ioapics[apic].mp_apicid);
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
- }
-
-
- /*
- * We need to adjust the IRQ routing table
- * if the ID changed.
- */
- if (old_id != mp_ioapics[apic].mp_apicid)
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_dstapic == old_id)
- mp_irqs[i].mp_dstapic
- = mp_ioapics[apic].mp_apicid;
-
- /*
- * Read the right value from the MPC table and
- * write it into the ID register.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mp_apicid);
-
- reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
- printk("could not set ID!\n");
- else
- apic_printk(APIC_VERBOSE, " ok.\n");
- }
-}
-
-int no_timer_check __initdata;
-
-static int __init notimercheck(char *s)
-{
- no_timer_check = 1;
- return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- * There is a nasty bug in some older SMP boards, their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- * - timer IRQ defaults to IO-APIC IRQ
- * - if this function detects that timer IRQs are defunct, then we fall
- * back to ISA timer IRQs
- */
-static int __init timer_irq_works(void)
-{
- unsigned long t1 = jiffies;
- unsigned long flags;
-
- if (no_timer_check)
- return 1;
-
- local_save_flags(flags);
- local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
- local_irq_restore(flags);
-
- /*
- * Expect a few ticks at least, to be sure some possible
- * glue logic does not lock up after one or two first
- * ticks in a non-ExtINT mode. Also the local APIC
- * might have cached one ExtINT interrupt. Finally, at
- * least one tick may be lost due to delays.
- */
- if (time_after(jiffies, t1 + 4))
- return 1;
-
- return 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way as thus we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Startup quirk:
- *
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
- *
- * (We do this for level-triggered IRQs too - it cannot hurt.)
- */
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
- int was_pending = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
- disable_8259A_irq(irq);
- if (i8259A_irq_pending(irq))
- was_pending = 1;
- }
- __unmask_IO_APIC_irq(irq);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return was_pending;
-}
-
-static void ack_ioapic_irq(unsigned int irq)
-{
- move_native_irq(irq);
- ack_APIC_irq();
-}
-
-static void ack_ioapic_quirk_irq(unsigned int irq)
-{
- unsigned long v;
- int i;
-
- move_native_irq(irq);
-/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets). Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless. As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source. The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually. We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt. We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul. --macro
- */
- i = irq_vector[irq];
-
- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-
- ack_APIC_irq();
-
- if (!(v & (1 << (i & 0x1f)))) {
- atomic_inc(&irq_mis_count);
- spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
- spin_unlock(&ioapic_lock);
- }
-}
-
-static int ioapic_retrigger_irq(unsigned int irq)
-{
- send_IPI_self(irq_vector[irq]);
-
- return 1;
-}
-
-static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_ioapic_irq,
- .eoi = ack_ioapic_quirk_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-
-static inline void init_IO_APIC_traps(void)
-{
- int irq;
-
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- for (irq = 0; irq < NR_IRQS ; irq++) {
- if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
- /*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
- */
- if (irq < 16)
- make_8259A_irq(irq);
- else
- /* Strange. Oh, well.. */
- irq_desc[irq].chip = &no_irq_chip;
- }
- }
-}
-
-/*
- * The local APIC irq-chip implementation:
- */
-
-static void ack_lapic_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static void mask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
-static void unmask_lapic_irq(unsigned int irq)
-{
- unsigned long v;
-
- v = apic_read(APIC_LVT0);
- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
-}
-
-static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
-};
-
-static void lapic_register_intr(int irq, int vector)
-{
- irq_desc[irq].status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
- set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only one way of sending
- * a few INTA cycles to 8259As and any associated glue logic. ICR does
- * not support the ExtINT mode, unfortunately. We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA. --macro
- */
-static inline void __init unlock_ExtINT_logic(void)
-{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
- unsigned char save_control, save_freq_select;
-
- pin = find_isa_irq_pin(8, mp_INT);
- if (pin == -1) {
- WARN_ON_ONCE(1);
- return;
- }
- apic = find_isa_irq_apic(8, mp_INT);
- if (apic == -1) {
- WARN_ON_ONCE(1);
- return;
- }
-
- entry0 = ioapic_read_entry(apic, pin);
- clear_IO_APIC_pin(apic, pin);
-
- memset(&entry1, 0, sizeof(entry1));
-
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
- entry1.dest.physical.physical_dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
- entry1.vector = 0;
-
- ioapic_write_entry(apic, pin, entry1);
-
- save_control = CMOS_READ(RTC_CONTROL);
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
- RTC_FREQ_SELECT);
- CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
- i = 100;
- while (i-- > 0) {
- mdelay(10);
- if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
- i -= 10;
- }
-
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
- clear_IO_APIC_pin(apic, pin);
-
- ioapic_write_entry(apic, pin, entry0);
-}
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
- * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- */
-static inline void __init check_timer(void)
-{
- int apic1, pin1, apic2, pin2;
- int no_pin1 = 0;
- int vector;
- unsigned int ver;
- unsigned long flags;
-
- local_irq_save(flags);
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
- /*
- * get/set the timer IRQ vector:
- */
- disable_8259A_irq(0);
- vector = assign_irq_vector(0);
- set_intr_gate(vector, interrupt[0]);
-
- /*
- * As IRQ0 is to be enabled in the 8259A, the virtual
- * wire has to be disabled in the local APIC. Also
- * timer interrupts need to be acknowledged manually in
- * the 8259A for the i82489DX when using the NMI
- * watchdog as that APIC treats NMIs as level-triggered.
- * The AEOI mode will finish them in the 8259A
- * automatically.
- */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
- init_8259A(1);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-
- pin1 = find_isa_irq_pin(0, mp_INT);
- apic1 = find_isa_irq_apic(0, mp_INT);
- pin2 = ioapic_i8259.pin;
- apic2 = ioapic_i8259.apic;
-
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- vector, apic1, pin1, apic2, pin2);
-
- /*
- * Some BIOS writers are clueless and report the ExtINTA
- * I/O APIC input from the cascaded 8259A as the timer
- * interrupt input. So just in case, if only one pin
- * was found above, try it both directly and through the
- * 8259A.
- */
- if (pin1 == -1) {
- pin1 = pin2;
- apic1 = apic2;
- no_pin1 = 1;
- } else if (pin2 == -1) {
- pin2 = pin1;
- apic2 = apic1;
- }
-
- if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
- if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, vector);
- }
- unmask_IO_APIC_irq(0);
- if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- enable_8259A_irq(0);
- }
- if (disable_timer_pin_1 > 0)
- clear_IO_APIC_pin(0, pin1);
- goto out;
- }
- clear_IO_APIC_pin(apic1, pin1);
- if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
-
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
- /*
- * legacy devices should be connected to IO APIC #0
- */
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, vector);
- unmask_IO_APIC_irq(0);
- enable_8259A_irq(0);
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- setup_nmi();
- enable_8259A_irq(0);
- }
- goto out;
- }
- /*
- * Cleanup, just in case ...
- */
- disable_8259A_irq(0);
- clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
- }
-
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
- timer_ack = 0;
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
- lapic_register_intr(0, vector);
- apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
- enable_8259A_irq(0);
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- disable_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
-
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
-
- init_8259A(0);
- make_8259A_irq(0);
- apic_write(APIC_LVT0, APIC_DM_EXTINT);
-
- unlock_ExtINT_logic();
-
- if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
- goto out;
- }
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
- "report. Then try booting with the 'noapic' option.\n");
-out:
- local_irq_restore(flags);
-}
-
-/*
- * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
- * to devices. However there may be an I/O APIC pin available for
- * this interrupt regardless. The pin may be left unconnected, but
- * typically it will be reused as an ExtINT cascade interrupt for
- * the master 8259A. In the MPS case such a pin will normally be
- * reported as an ExtINT interrupt in the MP table. With ACPI
- * there is no provision for ExtINT interrupts, and in the absence
- * of an override it would be treated as an ordinary ISA I/O APIC
- * interrupt, that is edge-triggered and unmasked by default. We
- * used to do this, but it caused problems on some systems because
- * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
- * the same ExtINT cascade interrupt to drive the local APIC of the
- * bootstrap processor. Therefore we refrain from routing IRQ2 to
- * the I/O APIC in all cases now. No actual device should request
- * it anyway. --macro
- */
-#define PIC_IRQS (1 << PIC_CASCADE_IR)
-
-void __init setup_IO_APIC(void)
-{
- int i;
-
- /* Reserve all the system vectors. */
- for (i = first_system_vector; i < NR_VECTORS; i++)
- set_bit(i, used_vectors);
-
- enable_IO_APIC();
-
- io_apic_irqs = ~PIC_IRQS;
-
- printk("ENABLING IO-APIC IRQs\n");
-
- /*
- * Set up IO-APIC IRQ routing.
- */
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
- sync_Arb_IDs();
- setup_IO_APIC_irqs();
- init_IO_APIC_traps();
- check_timer();
-}
-
-/*
- * Called after all the initialization is done. If we didnt find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
- struct sys_device dev;
- struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- entry[i] = ioapic_read_entry(dev->id, i);
-
- return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- unsigned long flags;
- union IO_APIC_reg_00 reg_00;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
- io_apic_write(dev->id, 0, reg_00.raw);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- ioapic_write_entry(dev->id, i, entry[i]);
-
- return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
- .name = "ioapic",
- .suspend = ioapic_suspend,
- .resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
-{
- struct sys_device *dev;
- int i, size, error = 0;
-
- error = sysdev_class_register(&ioapic_sysdev_class);
- if (error)
- return error;
-
- for (i = 0; i < nr_ioapics; i++) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
- if (!mp_ioapic_data[i]) {
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
- dev->cls = &ioapic_sysdev_class;
- error = sysdev_register(dev);
- if (error) {
- kfree(mp_ioapic_data[i]);
- mp_ioapic_data[i] = NULL;
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- }
-
- return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocate and deallocation
- */
-int create_irq(void)
-{
- /* Allocate an unused irq */
- int irq, new, vector = 0;
- unsigned long flags;
-
- irq = -ENOSPC;
- spin_lock_irqsave(&vector_lock, flags);
- for (new = (NR_IRQS - 1); new >= 0; new--) {
- if (platform_legacy_irq(new))
- continue;
- if (irq_vector[new] != 0)
- continue;
- vector = __assign_irq_vector(new);
- if (likely(vector > 0))
- irq = new;
- break;
- }
- spin_unlock_irqrestore(&vector_lock, flags);
-
- if (irq >= 0) {
- set_intr_gate(vector, interrupt[irq]);
- dynamic_irq_init(irq);
- }
- return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
- unsigned long flags;
-
- dynamic_irq_cleanup(irq);
-
- spin_lock_irqsave(&vector_lock, flags);
- clear_bit(irq_vector[irq], used_vectors);
- irq_vector[irq] = 0;
- spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
- int vector;
- unsigned dest;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- dest = cpu_mask_to_apicid(TARGET_CPUS);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((INT_DEST_MODE == 0) ?
-MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(vector);
- }
- return vector;
-}
-
-#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- struct msi_msg msg;
- unsigned int dest;
- cpumask_t tmp;
- int vector;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- vector = assign_irq_vector(irq);
- if (vector < 0)
- return;
-
- dest = cpu_mask_to_apicid(mask);
-
- read_msi_msg(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- write_msi_msg(irq, &msg);
- irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
- struct msi_msg msg;
- int irq, ret;
- irq = create_irq();
- if (irq < 0)
- return irq;
-
- ret = msi_compose_msg(dev, irq, &msg);
- if (ret < 0) {
- destroy_irq(irq);
- return ret;
- }
-
- set_irq_msi(irq, desc);
- write_msi_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
- "edge");
-
- return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
- destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest)
-{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
- unsigned int dest;
- cpumask_t tmp;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- tmp = TARGET_CPUS;
-
- cpus_and(mask, tmp, CPU_MASK_ALL);
-
- dest = cpu_mask_to_apicid(mask);
-
- target_ht_irq(irq, dest);
- irq_desc[irq].affinity = mask;
-}
-#endif
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_ioapic_irq,
-#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
- int vector;
-
- vector = assign_irq_vector(irq);
- if (vector >= 0) {
- struct ht_irq_msg msg;
- unsigned dest;
- cpumask_t tmp;
-
- cpus_clear(tmp);
- cpu_set(vector >> 8, tmp);
- dest = cpu_mask_to_apicid(tmp);
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(vector) |
- ((INT_DEST_MODE == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((INT_DELIVERY_MODE != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
-
- write_ht_irq_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
- }
- return vector;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
-{
- union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
- int i = 0;
-
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
-
- if (physids_empty(apic_id_map))
- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
- apic_id = reg_00.bits.ID;
- }
-
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (check_apicid_used(apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!check_apicid_used(apic_id_map, i))
- break;
- }
-
- if (i == get_physical_broadcast())
- panic("Max apic_id exceeded!\n");
-
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
- apic_id = i;
- }
-
- tmp = apicid_to_cpu_present(apic_id);
- physids_or(apic_id_map, apic_id_map, tmp);
-
- if (reg_00.bits.ID != apic_id) {
- reg_00.bits.ID = apic_id;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /* Sanity check */
- if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
- return -1;
- }
- }
-
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
- return apic_id;
-}
-
-
-int __init io_apic_get_version(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
-}
-
-
-int __init io_apic_get_redir_entries(int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.entries;
-}
-
-
-int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
-{
- struct IO_APIC_route_entry entry;
-
- if (!IO_APIC_IRQ(irq)) {
- printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- /*
- * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
- * Note that we mask (disable) IRQs now -- these get enabled when the
- * corresponding device driver registers for this IRQ.
- */
-
- memset(&entry, 0, sizeof(entry));
-
- entry.delivery_mode = INT_DELIVERY_MODE;
- entry.dest_mode = INT_DEST_MODE;
- entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
- entry.trigger = edge_level;
- entry.polarity = active_high_low;
- entry.mask = 1;
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
-
- entry.vector = assign_irq_vector(irq);
-
- apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
- "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
- edge_level, active_high_low);
-
- ioapic_register_intr(irq, entry.vector, edge_level);
-
- if (!ioapic && (irq < 16))
- disable_8259A_irq(irq);
-
- ioapic_write_entry(ioapic, pin, entry);
-
- return 0;
-}
-
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
-{
- int i;
-
- if (skip_ioapic_setup)
- return -1;
-
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mp_irqtype == mp_INT &&
- mp_irqs[i].mp_srcbusirq == bus_irq)
- break;
- if (i >= mp_irq_entries)
- return -1;
-
- *trigger = irq_trigger(i);
- *polarity = irq_polarity(i);
- return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-static int __init parse_disable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = 1;
- return 0;
-}
-early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
-
-static int __init parse_enable_timer_pin_1(char *arg)
-{
- disable_timer_pin_1 = -1;
- return 0;
-}
-early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
-
-static int __init parse_noapic(char *arg)
-{
- /* disable IO-APIC */
- disable_ioapic_setup();
- return 0;
-}
-early_param("noapic", parse_noapic);
-
-void __init ioapic_init_mappings(void)
-{
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mp_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
-}
-
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000000..3f1d9d18df67
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,192 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/seq_file.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ if (cpu_has_apic)
+ ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_X86_32
+# define irq_stats(x) (&per_cpu(irq_stat, x))
+#else
+# define irq_stats(x) cpu_pda(x)
+#endif
+/*
+ * /proc/interrupts printing:
+ */
+static int show_other_interrupts(struct seq_file *p)
+{
+ int j;
+
+ seq_printf(p, "NMI: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+ seq_printf(p, " Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "LOC: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+ seq_printf(p, " Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "RES: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ seq_printf(p, " Rescheduling interrupts\n");
+ seq_printf(p, "CAL: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, " Function call interrupts\n");
+ seq_printf(p, "TLB: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+ seq_printf(p, " TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "TRM: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+ seq_printf(p, " Thermal event interrupts\n");
+# ifdef CONFIG_X86_64
+ seq_printf(p, "THR: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+ seq_printf(p, " Threshold APIC interrupts\n");
+# endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "SPU: ");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_printf(p, " Spurious interrupts\n");
+#endif
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+ return 0;
+}
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > nr_irqs)
+ return 0;
+
+ if (i == nr_irqs)
+ return show_other_interrupts(p);
+
+ /* print header */
+ if (i == 0) {
+ seq_printf(p, " ");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d", j);
+ seq_putc(p, '\n');
+ }
+
+ desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
+ spin_lock_irqsave(&desc->lock, flags);
+#ifndef CONFIG_SMP
+ any_count = kstat_irqs(i);
+#else
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+#endif
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%3d: ", i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+#endif
+ seq_printf(p, " %8s", desc->chip->name);
+ seq_printf(p, "-%-8s", desc->name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+ sum += irq_stats(cpu)->irq_resched_count;
+ sum += irq_stats(cpu)->irq_call_count;
+ sum += irq_stats(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += irq_stats(cpu)->irq_thermal_count;
+# ifdef CONFIG_X86_64
+ sum += irq_stats(cpu)->irq_threshold_count;
+#endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+ sum += atomic_read(&irq_mis_count);
+#endif
+ return sum;
+}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index b71e02d42f4f..119fc9c8ff7f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But only ack when the APIC is enabled -AK
- */
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
-}
-
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
static int check_stack_overflow(void)
@@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs;
/* high bit used in ret_from_ code */
- int overflow, irq = ~regs->orig_ax;
- struct irq_desc *desc = irq_desc + irq;
+ int overflow;
+ unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;
+ unsigned irq;
- if (unlikely((unsigned)irq >= NR_IRQS)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __func__, irq);
- BUG();
- }
old_regs = set_irq_regs(regs);
irq_enter();
+ irq = __get_cpu_var(vector_irq)[vector];
overflow = check_stack_overflow();
+ desc = irq_to_desc(irq);
+ if (unlikely(!desc)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
+ __func__, irq, vector, smp_processor_id());
+ BUG();
+ }
+
if (!execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
@@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
-
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
-
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", nmi_count(j));
- seq_printf(p, " Non-maskable interrupts\n");
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#endif
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
-#endif
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#if defined(CONFIG_X86_IO_APIC)
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = nmi_count(cpu);
-
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
-#endif
-#ifdef CONFIG_SMP
- sum += per_cpu(irq_stat, cpu).irq_resched_count;
- sum += per_cpu(irq_stat, cpu).irq_call_count;
- sum += per_cpu(irq_stat, cpu).irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += per_cpu(irq_stat, cpu).irq_thermal_count;
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- sum += per_cpu(irq_stat, cpu).irq_spurious_count;
-#endif
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
- return sum;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
@@ -395,20 +237,24 @@ void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
+
+ if (!desc)
+ continue;
if (irq == 2)
continue;
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
- else if (irq_desc[irq].action && !(warned++))
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
+ else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index f065fe9071b9..a174a217eb1a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,34 +13,12 @@
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/delay.h>
+#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
#include <asm/smp.h>
-atomic_t irq_err_count;
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
- printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
- /*
- * Currently unexpected vectors happen only on SMP and APIC.
- * We _must_ ack these because every local APIC has only N
- * irq slots per priority level, and a 'hanging, unacked' IRQ
- * holds up an irq slot - in excessive cases (when multiple
- * unexpected vectors occur) that might lock up the APIC
- * completely.
- * But don't ack when the APIC is disabled. -AK
- */
- if (!disable_apic)
- ack_APIC_irq();
-}
-
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
*
@@ -50,134 +28,17 @@ void ack_bad_irq(unsigned int irq)
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
u64 curbase = (u64)task_stack_page(current);
- static unsigned long warned = -60*HZ;
-
- if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) + 128 &&
- time_after(jiffies, warned + 60*HZ)) {
- printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
- show_stack(NULL,NULL);
- warned = jiffies;
- }
-}
-#endif
-
-/*
- * Generic, controller-independent functions:
- */
-int show_interrupts(struct seq_file *p, void *v)
-{
- int i = *(loff_t *) v, j;
- struct irqaction * action;
- unsigned long flags;
-
- if (i == 0) {
- seq_printf(p, " ");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d",j);
- seq_putc(p, '\n');
- }
+ WARN_ONCE(regs->sp >= curbase &&
+ regs->sp <= curbase + THREAD_SIZE &&
+ regs->sp < curbase + sizeof(struct thread_info) +
+ sizeof(struct pt_regs) + 128,
- if (i < NR_IRQS) {
- unsigned any_count = 0;
-
- spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
- any_count = kstat_irqs(i);
-#else
- for_each_online_cpu(j)
- any_count |= kstat_cpu(j).irqs[i];
-#endif
- action = irq_desc[i].action;
- if (!action && !any_count)
- goto skip;
- seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
- seq_printf(p, "%10u ", kstat_irqs(i));
-#else
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ current->comm, curbase, regs->sp);
#endif
- seq_printf(p, " %8s", irq_desc[i].chip->name);
- seq_printf(p, "-%-8s", irq_desc[i].name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
- seq_putc(p, '\n');
-skip:
- spin_unlock_irqrestore(&irq_desc[i].lock, flags);
- } else if (i == NR_IRQS) {
- seq_printf(p, "NMI: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
- seq_printf(p, "LOC: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
-#ifdef CONFIG_SMP
- seq_printf(p, "RES: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
- seq_printf(p, "CAL: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " Function call interrupts\n");
- seq_printf(p, "TLB: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
- seq_printf(p, "TRM: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
- seq_printf(p, "THR: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
-#endif
- seq_printf(p, "SPU: ");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
- }
- return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
- u64 sum = cpu_pda(cpu)->__nmi_count;
-
- sum += cpu_pda(cpu)->apic_timer_irqs;
-#ifdef CONFIG_SMP
- sum += cpu_pda(cpu)->irq_resched_count;
- sum += cpu_pda(cpu)->irq_call_count;
- sum += cpu_pda(cpu)->irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
- sum += cpu_pda(cpu)->irq_thermal_count;
- sum += cpu_pda(cpu)->irq_threshold_count;
-#endif
- sum += cpu_pda(cpu)->irq_spurious_count;
- return sum;
-}
-
-u64 arch_irq_stat(void)
-{
- return atomic_read(&irq_err_count);
}
/*
@@ -185,9 +46,10 @@ u64 arch_irq_stat(void)
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ struct irq_desc *desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
@@ -197,12 +59,11 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
irq_enter();
irq = __get_cpu_var(vector_irq)[vector];
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
-#endif
- if (likely(irq < NR_IRQS))
- generic_handle_irq(irq);
+ desc = irq_to_desc(irq);
+ if (likely(desc))
+ generic_handle_irq_desc(irq, desc);
else {
if (!disable_apic)
ack_APIC_irq();
@@ -223,42 +84,45 @@ void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;
+ struct irq_desc *desc;
- for (irq = 0; irq < NR_IRQS; irq++) {
+ for_each_irq_desc(irq, desc) {
cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
+ if (!desc)
+ continue;
if (irq == 2)
continue;
/* interrupt's are disabled at this point */
- spin_lock(&irq_desc[irq].lock);
+ spin_lock(&desc->lock);
if (!irq_has_action(irq) ||
- cpus_equal(irq_desc[irq].affinity, map)) {
- spin_unlock(&irq_desc[irq].lock);
+ cpus_equal(desc->affinity, map)) {
+ spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, irq_desc[irq].affinity, map);
+ cpus_and(mask, desc->affinity, map);
if (cpus_empty(mask)) {
break_affinity = 1;
mask = map;
}
- if (irq_desc[irq].chip->mask)
- irq_desc[irq].chip->mask(irq);
+ if (desc->chip->mask)
+ desc->chip->mask(irq);
- if (irq_desc[irq].chip->set_affinity)
- irq_desc[irq].chip->set_affinity(irq, mask);
+ if (desc->chip->set_affinity)
+ desc->chip->set_affinity(irq, mask);
else if (!(warned++))
set_affinity = 0;
- if (irq_desc[irq].chip->unmask)
- irq_desc[irq].chip->unmask(irq);
+ if (desc->chip->unmask)
+ desc->chip->unmask(irq);
- spin_unlock(&irq_desc[irq].lock);
+ spin_unlock(&desc->lock);
if (break_affinity && set_affinity)
printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9200a1e2752d..203384ed2b5d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,7 +68,13 @@ void __init init_ISA_irqs (void)
/*
* 16 old-style INTA-cycle interrupts:
*/
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
}
@@ -83,6 +89,27 @@ static struct irqaction irq2 = {
.name = "cascade",
};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
@@ -98,22 +125,14 @@ void __init native_init_IRQ(void)
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (i >= NR_IRQS)
- break;
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* SYSCALL_VECTOR was reserved in trap_init. */
- if (!test_bit(vector, used_vectors))
- set_intr_gate(vector, interrupt[i]);
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
}
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -128,6 +147,9 @@ void __init native_init_IRQ(void)
/* IPI for single call function */
set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 5b5be9d43c2a..6190e6ef546c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -24,41 +24,6 @@
#include <asm/i8259.h>
/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-/*
- * SMP has a few special interrupts for IPI messages
- */
-
-#define BUILD_IRQ(nr) \
- asmlinkage void IRQ_NAME(nr); \
- asm("\n.text\n.p2align\n" \
- "IRQ" #nr "_interrupt:\n\t" \
- "push $~(" #nr ") ; " \
- "jmp common_interrupt\n" \
- ".previous");
-
-#define BI(x,y) \
- BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
* (these are usually mapped to vectors 0x30-0x3f)
*/
@@ -73,37 +38,6 @@
*
* (these are usually mapped into the 0x30-0xff vector range)
*/
- BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
- IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-
-
/*
* IRQ2 is cascade interrupt to second interrupt controller
@@ -142,23 +76,18 @@ void __init init_ISA_irqs(void)
init_bsp_APIC();
init_8259A(0);
- for (i = 0; i < NR_IRQS; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = NULL;
- irq_desc[i].depth = 1;
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
- if (i < 16) {
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
- } else {
- /*
- * 'high' PCI IRQs filled in on demand
- */
- irq_desc[i].chip = &no_irq_chip;
- }
}
}
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 304d8bad6559..cbc4332a77b2 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -18,7 +18,6 @@ static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca91..e169ae9b6a62 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret;
}
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+ return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+ struct pvclock_vcpu_time_info *src;
+ unsigned long khz;
+ u64 lpj;
+
+ src = &per_cpu(hv_clock, 0);
+ khz = pvclock_tsc_khz(src);
+
+ lpj = ((u64)khz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -100,7 +128,7 @@ static int kvm_register_clock(char *txt)
}
#ifdef CONFIG_X86_LOCAL_APIC
-static void kvm_setup_secondary_clock(void)
+static void __cpuinit kvm_setup_secondary_clock(void)
{
/*
* Now that the first cpu already had this clocksource initialized,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
+ pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
+ kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
}
}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 0732adba05ca..37f420018a41 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -13,6 +13,7 @@
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
+#include <linux/gfp.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -25,15 +26,6 @@
#include <asm/system.h>
#include <asm/cacheflush.h>
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u32 kexec_pgd[1024] PAGE_ALIGNED;
-#ifdef CONFIG_X86_PAE
-static u32 kexec_pmd0[1024] PAGE_ALIGNED;
-static u32 kexec_pmd1[1024] PAGE_ALIGNED;
-#endif
-static u32 kexec_pte0[1024] PAGE_ALIGNED;
-static u32 kexec_pte1[1024] PAGE_ALIGNED;
-
static void set_idt(void *newidt, __u16 limit)
{
struct desc_ptr curidt;
@@ -76,6 +68,76 @@ static void load_segments(void)
#undef __STR
}
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+ free_page((unsigned long)image->arch.pgd);
+#ifdef CONFIG_X86_PAE
+ free_page((unsigned long)image->arch.pmd0);
+ free_page((unsigned long)image->arch.pmd1);
+#endif
+ free_page((unsigned long)image->arch.pte0);
+ free_page((unsigned long)image->arch.pte1);
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+ image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+#ifdef CONFIG_X86_PAE
+ image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+ image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+ !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+ !image->arch.pte0 || !image->arch.pte1) {
+ machine_kexec_free_page_tables(image);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+ pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+ unsigned long vaddr, unsigned long paddr)
+{
+ pud_t *pud;
+
+ pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+ void *control_page;
+ pmd_t *pmd = 0;
+
+ control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd0;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte0,
+ (unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd1;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte1,
+ __pa(control_page), __pa(control_page));
+}
+
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
@@ -87,12 +149,20 @@ static void load_segments(void)
* reboot code buffer to allow us to avoid allocations
* later.
*
- * Make control page executable.
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
*/
int machine_kexec_prepare(struct kimage *image)
{
+ int error;
+
if (nx_enabled)
set_pages_x(image->control_code_page, 1);
+ error = machine_kexec_alloc_page_tables(image);
+ if (error)
+ return error;
+ machine_kexec_prepare_page_tables(image);
return 0;
}
@@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image)
{
if (nx_enabled)
set_pages_nx(image->control_code_page, 1);
+ machine_kexec_free_page_tables(image);
}
/*
@@ -150,19 +221,11 @@ void machine_kexec(struct kimage *image)
relocate_kernel_ptr = control_page;
page_list[PA_CONTROL_PAGE] = __pa(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_PGD] = __pa(kexec_pgd);
- page_list[VA_PGD] = (unsigned long)kexec_pgd;
-#ifdef CONFIG_X86_PAE
- page_list[PA_PMD_0] = __pa(kexec_pmd0);
- page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
- page_list[PA_PMD_1] = __pa(kexec_pmd1);
- page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-#endif
- page_list[PA_PTE_0] = __pa(kexec_pte0);
- page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
- page_list[PA_PTE_1] = __pa(kexec_pte1);
- page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
- page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
+ page_list[PA_PGD] = __pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
/* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 7a1f8eeac2c7..c25fdb382292 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
* This driver allows to upgrade microcode on AMD
* family 0x10 and 0x11 processors.
*
- * Licensed unter the terms of the GNU General Public
+ * Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
@@ -32,14 +32,14 @@
#include <linux/platform_device.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/microcode.h>
MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
+MODULE_AUTHOR("Peter Oruba");
MODULE_LICENSE("GPL v2");
#define UCODE_MAGIC 0x00414d44
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
#define UCODE_UCODE_TYPE 0x00000001
struct equiv_cpu_entry {
- unsigned int installed_cpu;
- unsigned int fixed_errata_mask;
- unsigned int fixed_errata_compare;
- unsigned int equiv_cpu;
-};
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __attribute__((packed));
struct microcode_header_amd {
- unsigned int data_code;
- unsigned int patch_id;
- unsigned char mc_patch_data_id[2];
- unsigned char mc_patch_data_len;
- unsigned char init_flag;
- unsigned int mc_patch_data_checksum;
- unsigned int nb_dev_id;
- unsigned int sb_dev_id;
- unsigned char processor_rev_id[2];
- unsigned char nb_rev_id;
- unsigned char sb_rev_id;
- unsigned char bios_api_rev;
- unsigned char reserved1[3];
- unsigned int match_reg[8];
-};
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __attribute__((packed));
struct microcode_amd {
struct microcode_header_amd hdr;
unsigned int mpb[0];
};
-#define UCODE_MAX_SIZE (2048)
-#define DEFAULT_UCODE_DATASIZE (896)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define DWSIZE (sizeof(u32))
-/* For now we support a fixed ucode total size only */
-#define get_totalsize(mc) \
- ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
- + MC_HEADER_SIZE)
+#define UCODE_MAX_SIZE 2048
+#define UCODE_CONTAINER_SECTION_HDR 8
+#define UCODE_CONTAINER_HEADER_SIZE 12
/* serialize access to the physical write */
static DEFINE_SPINLOCK(microcode_update_lock);
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ u32 dummy;
memset(csig, 0, sizeof(*csig));
-
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
- cpu);
+ printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
+ "supported\n", cpu, c->x86);
return -1;
}
-
- asm volatile("movl %1, %%ecx; rdmsr"
- : "=a" (csig->rev)
- : "i" (0x0000008B) : "ecx");
-
- printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
- csig->rev);
-
+ rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
+ printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
return 0;
}
static int get_matching_microcode(int cpu, void *mc, int rev)
{
struct microcode_header_amd *mc_header = mc;
- struct pci_dev *nb_pci_dev, *sb_pci_dev;
unsigned int current_cpu_id;
- unsigned int equiv_cpu_id = 0x00;
+ u16 equiv_cpu_id = 0;
unsigned int i = 0;
BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
}
if (!equiv_cpu_id) {
- printk(KERN_ERR "microcode: CPU%d cpu_id "
- "not found in equivalent cpu table \n", cpu);
+ printk(KERN_WARNING "microcode: CPU%d: cpu revision "
+ "not listed in equivalent cpu table\n", cpu);
return 0;
}
- if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
- printk(KERN_ERR
- "microcode: CPU%d patch does not match "
- "(patch is %x, cpu extended is %x) \n",
- cpu, mc_header->processor_rev_id[0],
- (equiv_cpu_id & 0xff));
+ if (mc_header->processor_rev_id != equiv_cpu_id) {
+ printk(KERN_ERR "microcode: CPU%d: patch mismatch "
+ "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
+ cpu, mc_header->processor_rev_id, equiv_cpu_id);
return 0;
}
- if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
- printk(KERN_ERR "microcode: CPU%d patch does not match "
- "(patch is %x, cpu base id is %x) \n",
- cpu, mc_header->processor_rev_id[1],
- ((equiv_cpu_id >> 16) & 0xff));
-
+ /* ucode might be chipset specific -- currently we don't support this */
+ if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
+ printk(KERN_ERR "microcode: CPU%d: loading of chipset "
+ "specific code not yet supported\n", cpu);
return 0;
}
- /* ucode may be northbridge specific */
- if (mc_header->nb_dev_id) {
- nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
- (mc_header->nb_dev_id & 0xff),
- NULL);
- if ((!nb_pci_dev) ||
- (mc_header->nb_rev_id != nb_pci_dev->revision)) {
- printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
- pci_dev_put(nb_pci_dev);
- return 0;
- }
- pci_dev_put(nb_pci_dev);
- }
-
- /* ucode may be southbridge specific */
- if (mc_header->sb_dev_id) {
- sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
- (mc_header->sb_dev_id & 0xff),
- NULL);
- if ((!sb_pci_dev) ||
- (mc_header->sb_rev_id != sb_pci_dev->revision)) {
- printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
- pci_dev_put(sb_pci_dev);
- return 0;
- }
- pci_dev_put(sb_pci_dev);
- }
-
if (mc_header->patch_id <= rev)
return 0;
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
static void apply_microcode_amd(int cpu)
{
unsigned long flags;
- unsigned int eax, edx;
- unsigned int rev;
+ u32 rev, dummy;
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
struct microcode_amd *mc_amd = uci->mc;
- unsigned long addr;
/* We should bind the task to the CPU */
BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
return;
spin_lock_irqsave(&microcode_update_lock, flags);
-
- addr = (unsigned long)&mc_amd->hdr.data_code;
- edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
- eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
-
- asm volatile("movl %0, %%ecx; wrmsr" :
- : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
-
+ wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* get patch id after patching */
- asm volatile("movl %1, %%ecx; rdmsr"
- : "=a" (rev)
- : "i" (0x0000008B) : "ecx");
-
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
spin_unlock_irqrestore(&microcode_update_lock, flags);
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n", cpu_num,
- mc_amd->hdr.patch_id, rev);
+ printk(KERN_ERR "microcode: CPU%d: update failed "
+ "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
return;
}
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x \n",
- cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+ printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
+ cpu, rev);
uci->cpu_sig.rev = rev;
}
-static void * get_next_ucode(u8 *buf, unsigned int size,
- int (*get_ucode_data)(void *, const void *, size_t),
- unsigned int *mc_size)
+static int get_ucode_data(void *to, const u8 *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static void *get_next_ucode(const u8 *buf, unsigned int size,
+ unsigned int *mc_size)
{
unsigned int total_size;
-#define UCODE_CONTAINER_SECTION_HDR 8
u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
void *mc;
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
return NULL;
if (section_hdr[0] != UCODE_UCODE_TYPE) {
- printk(KERN_ERR "microcode: error! "
- "Wrong microcode payload type field\n");
+ printk(KERN_ERR "microcode: error: invalid type field in "
+ "container file section header\n");
return NULL;
}
total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
- printk(KERN_INFO "microcode: size %u, total_size %u\n",
- size, total_size);
+ printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
+ size, total_size);
if (total_size > size || total_size > UCODE_MAX_SIZE) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ printk(KERN_ERR "microcode: error: size mismatch\n");
return NULL;
}
mc = vmalloc(UCODE_MAX_SIZE);
if (mc) {
memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
+ if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
+ total_size)) {
vfree(mc);
mc = NULL;
} else
*mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
}
-#undef UCODE_CONTAINER_SECTION_HDR
return mc;
}
-static int install_equiv_cpu_table(u8 *buf,
- int (*get_ucode_data)(void *, const void *, size_t))
+static int install_equiv_cpu_table(const u8 *buf)
{
-#define UCODE_CONTAINER_HEADER_SIZE 12
u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
unsigned int *buf_pos = (unsigned int *)container_hdr;
unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
size = buf_pos[2];
if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- printk(KERN_ERR "microcode: error! "
- "Wrong microcode equivalnet cpu table\n");
+ printk(KERN_ERR "microcode: error: invalid type field in "
+ "container file section header\n");
return 0;
}
equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
if (!equiv_cpu_table) {
- printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
+ printk(KERN_ERR "microcode: failed to allocate "
+ "equivalent CPU table\n");
return 0;
}
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
}
return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
-#undef UCODE_CONTAINER_HEADER_SIZE
}
static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
}
}
-static int generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
+static int generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+ const u8 *ucode_ptr = data;
+ void *new_mc = NULL;
+ void *mc;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover;
unsigned long offset;
- offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
+ offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
- printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+ printk(KERN_ERR "microcode: failed to create "
+ "equivalent cpu table\n");
return -EINVAL;
}
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
unsigned int uninitialized_var(mc_size);
struct microcode_header_amd *mc_header;
- mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
+ mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
if (!mc)
break;
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
vfree(new_mc);
new_rev = mc_header->patch_id;
new_mc = mc;
- } else
+ } else
vfree(mc);
ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
if (uci->mc)
vfree(uci->mc);
uci->mc = new_mc;
- pr_debug("microcode: CPU%d found a matching microcode update with"
- " version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
+ pr_debug("microcode: CPU%d found a matching microcode "
+ "update with version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
} else
vfree(new_mc);
}
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
return (int)leftover;
}
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
static int request_microcode_fw(int cpu, struct device *device)
{
const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
ret = request_firmware(&firmware, fw_name, device);
if (ret) {
- printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
+ printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
return ret;
}
- ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
- &get_ucode_fw);
+ ret = generic_load_microcode(cpu, firmware->data, firmware->size);
release_firmware(firmware);
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
static int request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
- "is not supported\n");
+ printk(KERN_INFO "microcode: AMD microcode update via "
+ "/dev/cpu/microcode not supported\n");
return -1;
}
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
{
return &microcode_amd_ops;
}
+
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 936d8d55f230..c9b721ba968c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
#define MICROCODE_VERSION "2.00"
-struct microcode_ops *microcode_ops;
+static struct microcode_ops *microcode_ops;
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#endif
/* fake device for request_firmware */
-struct platform_device *microcode_pdev;
+static struct platform_device *microcode_pdev;
static ssize_t reload_store(struct sys_device *dev,
struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
.name = "microcode",
};
-static void microcode_fini_cpu(int cpu)
+static void __microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- mutex_lock(&microcode_mutex);
microcode_ops->microcode_fini_cpu(cpu);
uci->valid = 0;
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ mutex_lock(&microcode_mutex);
+ __microcode_fini_cpu(cpu);
mutex_unlock(&microcode_mutex);
}
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
* to this cpu (a bit of paranoia):
*/
if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
- microcode_fini_cpu(cpu);
+ __microcode_fini_cpu(cpu);
+ printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
+ cpu);
return -1;
}
- if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
- microcode_fini_cpu(cpu);
+ if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
+ __microcode_fini_cpu(cpu);
+ printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
+ cpu);
/* Should we look for a new ucode here? */
return 1;
}
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
return 0;
}
-void microcode_update_cpu(int cpu)
+static void microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int err = 0;
@@ -480,8 +489,8 @@ static int __init microcode_init(void)
printk(KERN_INFO
"Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>"
- " <peter.oruba@amd.com>\n");
+ " <tigran@aivazian.fsnet.co.uk>,"
+ " Peter Oruba\n");
return 0;
}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a21784..b7f4c929e615 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+ unsigned long flags;
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->pf = 1 << ((val[1] >> 18) & 7);
}
+ /* serialize access to the physical write to MSR 0x79 */
+ spin_lock_irqsave(&microcode_update_lock, flags);
+
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+ spin_unlock_irqrestore(&microcode_update_lock, flags);
+
pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
csig->sig, csig->pf, csig->rev);
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
uci->mc = NULL;
}
-struct microcode_ops microcode_intel_ops = {
+static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index f98f4e1dba09..45e3b69808ba 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -586,23 +586,23 @@ static void __init __get_smp_config(unsigned int early)
{
struct intel_mp_floating *mpf = mpf_found;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
- }
+ if (!mpf)
+ return;
+
if (acpi_lapic && early)
return;
+
/*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
+ * MPS doesn't support hyperthreading, aka only have
+ * thread 0 apic id in MPS table
*/
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ if (acpi_lapic && acpi_ioapic)
return;
- } else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 2e2af5d18191..82a7c7ed6d45 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
{
struct device *dev;
- dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
- NULL, "msr%d", cpu);
+ dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ "msr%d", cpu);
return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07f1c2c..8bd1bf9622a7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -131,6 +131,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
atomic_dec(&nmi_active);
}
+static void __acpi_nmi_disable(void *__unused)
+{
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
@@ -179,8 +184,12 @@ int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
error:
- if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
- disable_8259A_irq(0);
+ if (nmi_watchdog == NMI_IO_APIC) {
+ if (!timer_through_8259)
+ disable_8259A_irq(0);
+ on_each_cpu(__acpi_nmi_disable, NULL, 1);
+ }
+
#ifdef CONFIG_X86_32
timer_ack = 0;
#endif
@@ -199,12 +208,17 @@ static int __init setup_nmi_watchdog(char *str)
++str;
}
- get_option(&str, &nmi);
-
- if (nmi >= NMI_INVALID)
- return 0;
+ if (!strncmp(str, "lapic", 5))
+ nmi_watchdog = NMI_LOCAL_APIC;
+ else if (!strncmp(str, "ioapic", 6))
+ nmi_watchdog = NMI_IO_APIC;
+ else {
+ get_option(&str, &nmi);
+ if (nmi >= NMI_INVALID)
+ return 0;
+ nmi_watchdog = nmi;
+ }
- nmi_watchdog = nmi;
return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
@@ -285,11 +299,6 @@ void acpi_nmi_enable(void)
on_each_cpu(__acpi_nmi_enable, NULL, 1);
}
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
/*
* Disable timer based NMIs on all CPUs:
*/
@@ -340,6 +349,8 @@ void stop_apic_nmi_watchdog(void *unused)
return;
if (nmi_watchdog == NMI_LOCAL_APIC)
lapic_watchdog_stop();
+ else
+ __acpi_nmi_disable(NULL);
__get_cpu_var(wd_enabled) = 0;
atomic_dec(&nmi_active);
}
@@ -465,6 +476,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
#ifdef CONFIG_SYSCTL
+static void enable_ioapic_nmi_watchdog_single(void *unused)
+{
+ __get_cpu_var(wd_enabled) = 1;
+ atomic_inc(&nmi_active);
+ __acpi_nmi_enable(NULL);
+}
+
+static void enable_ioapic_nmi_watchdog(void)
+{
+ on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
+ touch_nmi_watchdog();
+}
+
+static void disable_ioapic_nmi_watchdog(void)
+{
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
+}
+
static int __init setup_unknown_nmi_panic(char *str)
{
unknown_nmi_panic = 1;
@@ -507,6 +536,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
enable_lapic_nmi_watchdog();
else
disable_lapic_nmi_watchdog();
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_ioapic_nmi_watchdog();
+ else
+ disable_ioapic_nmi_watchdog();
} else {
printk(KERN_WARNING
"NMI watchdog doesn't know what hardware to touch\n");
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e0..0deea37a53cf 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
#include <asm/numaq.h>
#include <asm/topology.h>
#include <asm/processor.h>
-#include <asm/mpspec.h>
+#include <asm/genapic.h>
#include <asm/e820.h>
#include <asm/setup.h>
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
return 1;
}
+static int __init numaq_update_genapic(void)
+{
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
+
+ return 0;
+}
+
static struct x86_quirks numaq_x86_quirks __initdata = {
.arch_pre_time_init = numaq_pre_time_init,
.arch_time_init = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
.mpc_oem_pci_bus = mpc_oem_pci_bus,
.smp_read_mpc_oem = smp_read_mpc_oem,
.setup_ioapic_ids = numaq_setup_ioapic_ids,
+ .update_genapic = numaq_update_genapic,
};
void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1dd..95777b0faa73 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
#include <asm/paravirt.h>
-static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+static inline void
+default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_lock(lock);
}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 080d1d27f37a..d28bbdc35e4e 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
#endif /* CONFIG_IOMMU_DEBUG */
-static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
-{
- unsigned int npages;
-
- npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
- npages >>= PAGE_SHIFT;
-
- return npages;
-}
-
static inline int translation_enabled(struct iommu_table *tbl)
{
/* only PHBs with translation enabled have an IOMMU table */
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev,
if (dmalen == 0)
break;
- npages = num_dma_pages(dma, dmalen);
+ npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
iommu_free(tbl, dma, npages);
}
}
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
BUG_ON(!sg_page(s));
vaddr = (unsigned long) sg_virt(s);
- npages = num_dma_pages(vaddr, s->length);
+ npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
if (entry == bad_dma_address) {
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
struct iommu_table *tbl = find_iommu_table(dev);
uaddr = (unsigned long)vaddr;
- npages = num_dma_pages(uaddr, size);
+ npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
return iommu_alloc(dev, tbl, vaddr, npages, direction);
}
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
- npages = num_dma_pages(dma_handle, size);
+ npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
iommu_free(tbl, dma_handle, npages);
}
@@ -1577,7 +1567,7 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtol(p, &endp, 0);
+ bridge = simple_strtoul(p, &endp, 0);
if (p == endp)
break;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 0a3824e837b4..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#endif
void __init pci_iommu_alloc(void)
{
+#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
+#endif
+
/*
* The order of these functions is important for
* fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
pci_swiotlb_init();
}
-unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
-{
- unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-
- return size >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(iommu_num_pages);
-#endif
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
}
if (!strncmp(p, "biomerge", 8)) {
- iommu_bio_merge = 4096;
iommu_merge = 1;
force_iommu = 1;
}
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected."
- "Disabling DAC.\n");
+ printk(KERN_INFO
+ "PCI: VIA PCI bridge detected. Disabling DAC.\n");
forbid_dac = 1;
}
}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 145f1c83369f..a35eaa379ff6 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
iommu_area_free(iommu_gart_bitmap, offset, size);
+ if (offset >= next_bit)
+ next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
@@ -231,7 +233,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
- unsigned long npages = iommu_num_pages(phys_mem, size);
+ unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
int i;
@@ -285,7 +287,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
return;
iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
- npages = iommu_num_pages(dma_addr, size);
+ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
CLEAR_LEAK(iommu_page + i);
@@ -368,7 +370,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
}
addr = phys_addr;
- pages = iommu_num_pages(s->offset, s->length);
+ pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
SET_LEAK(iommu_page);
@@ -451,7 +453,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
seg_size += s->length;
need = nextneed;
- pages += iommu_num_pages(s->offset, s->length);
+ pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -743,10 +745,8 @@ void __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
- printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+ if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
return;
- }
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index c4ce0332759e..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
#include <linux/pci.h>
#include <linux/cache.h>
#include <linux/module.h>
+#include <linux/swiotlb.h>
+#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
#include <asm/iommu.h>
@@ -11,6 +13,31 @@
int swiotlb __read_mostly;
+void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+{
+ return alloc_bootmem_low_pages(size);
+}
+
+void *swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+ return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+
+dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
+{
+ return paddr;
+}
+
+phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+ return baddr;
+}
+
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+ return 0;
+}
+
static dma_addr_t
swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
int direction)
@@ -18,9 +45,21 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
}
+static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *vaddr;
+
+ vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
+ if (vaddr)
+ return vaddr;
+
+ return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
+}
+
struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
- .alloc_coherent = swiotlb_alloc_coherent,
+ .alloc_coherent = x86_swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
.map_single = swiotlb_map_single_phys,
.unmap_single = swiotlb_unmap_single,
@@ -38,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
+#ifdef CONFIG_X86_64
if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
+#endif
if (swiotlb_force)
swiotlb = 1;
if (swiotlb) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..e68bb9e30864 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,13 +1,16 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <asm/idle.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/ftrace.h>
#include <asm/system.h>
+#include <asm/apic.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -100,6 +103,9 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -112,6 +118,7 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(&it);
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -122,6 +129,21 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_local_APIC();
+
+ for (;;) {
+ if (hlt_works(smp_processor_id()))
+ halt();
+ }
+}
+
static void do_nothing(void *unused)
{
}
@@ -154,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
+ trace_power_end(&it);
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
+ struct power_trace it;
if (!need_resched()) {
+ trace_power_start(&it, POWER_CSTATE, 1);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(&it);
} else
local_irq_enable();
}
@@ -183,9 +212,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
+ trace_power_end(&it);
}
/*
@@ -270,7 +303,7 @@ static void c1e_idle(void)
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
c1e_detected = 1;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..3ba155d24884 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -59,6 +60,7 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/smp.h>
+#include <asm/ds.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -250,14 +252,8 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(current->thread.ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(current->thread.ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void flush_thread(void)
@@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
+
+ ds_copy_thread(p, current);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
return err;
}
@@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- unsigned long ds_prev = 0;
- unsigned long ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
- }
- return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
- struct thread_struct *next, unsigned long debugctl)
-{
- return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = update_debugctl(prev, next, prev->debugctlmsr);
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 749d5f888d4d..efb0396e19bf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -40,6 +40,7 @@
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/ftrace.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -53,6 +54,7 @@
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
+#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);
@@ -64,6 +66,13 @@ void idle_notifier_register(struct notifier_block *n)
{
atomic_notifier_chain_register(&idle_notifier, n);
}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
void enter_idle(void)
{
@@ -240,14 +249,8 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
-#ifdef CONFIG_X86_DS
- /* Free any DS contexts that have not been properly released. */
- if (unlikely(t->ds_ctx)) {
- /* we clear debugctl to make sure DS is not used. */
- update_debugctlmsr(0);
- ds_free(t->ds_ctx);
- }
-#endif /* CONFIG_X86_DS */
+
+ ds_exit_thread(current);
}
void flush_thread(void)
@@ -377,6 +380,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (err)
goto out;
}
+
+ ds_copy_thread(p, me);
+
+ clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+ p->thread.debugctlmsr = 0;
+
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -475,35 +484,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
- unsigned long debugctl;
prev = &prev_p->thread,
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
- {
- unsigned long ds_prev = 0, ds_next = 0;
-
- if (prev->ds_ctx)
- ds_prev = (unsigned long)prev->ds_ctx->ds;
- if (next->ds_ctx)
- ds_next = (unsigned long)next->ds_ctx->ds;
-
- if (ds_next != ds_prev) {
- /*
- * We clear debugctl to make sure DS
- * is not in use when we change it:
- */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, ds_next);
- }
- }
-#endif /* CONFIG_X86_DS */
-
- if (next->debugctlmsr != debugctl)
+ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+ ds_switch_to(prev_p, next_p);
+ else if (next->debugctlmsr != prev->debugctlmsr)
update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -538,14 +526,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-
-#ifdef CONFIG_X86_PTRACE_BTS
- if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
- if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
- ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -556,8 +536,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
*/
-struct task_struct *
+__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..0a5df5f82fb9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target,
}
#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
- /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
- unsigned char sizeof_bts;
- /* the size of a field in the BTS record in bytes */
- unsigned char sizeof_field;
- /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
- unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
- bts_from = 0,
- bts_to,
- bts_flags,
-
- bts_escape = (unsigned long)-1,
- bts_qual = bts_to,
- bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
- base += (bts_cfg.sizeof_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
- base += (bts_cfg.sizeof_field * field);;
- (*(unsigned long *)base) = val;
-}
-
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
- memset(out, 0, sizeof(*out));
- if (bts_get(raw, bts_from) == bts_escape) {
- out->qualifier = bts_get(raw, bts_qual);
- out->variant.jiffies = bts_get(raw, bts_jiffies);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = bts_get(raw, bts_from);
- out->variant.lbr.to_ip = bts_get(raw, bts_to);
- }
-}
-
static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
- struct bts_struct ret;
- const void *bts_record;
- size_t bts_index, bts_end;
+ const struct bts_trace *trace;
+ struct bts_struct bts;
+ const unsigned char *at;
int error;
- error = ds_get_bts_end(child, &bts_end);
- if (error < 0)
- return error;
-
- if (bts_end <= index)
- return -EINVAL;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- error = ds_get_bts_index(child, &bts_index);
- if (error < 0)
- return error;
+ at = trace->ds.top - ((index + 1) * trace->ds.size);
+ if ((void *)at < trace->ds.begin)
+ at += (trace->ds.n * trace->ds.size);
- /* translate the ptrace bts index into the ds bts index */
- bts_index += bts_end - (index + 1);
- if (bts_end <= bts_index)
- bts_index -= bts_end;
+ if (!trace->read)
+ return -EOPNOTSUPP;
- error = ds_access_bts(child, bts_index, &bts_record);
+ error = trace->read(child->bts, at, &bts);
if (error < 0)
return error;
- ptrace_bts_translate_record(&ret, bts_record);
-
- if (copy_to_user(out, &ret, sizeof(ret)))
+ if (copy_to_user(out, &bts, sizeof(bts)))
return -EFAULT;
- return sizeof(ret);
+ return sizeof(bts);
}
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- struct bts_struct ret;
- const unsigned char *raw;
- size_t end, i;
- int error;
+ const struct bts_trace *trace;
+ const unsigned char *at;
+ int error, drained = 0;
- error = ds_get_bts_index(child, &end);
- if (error < 0)
- return error;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- if (size < (end * sizeof(struct bts_struct)))
+ if (!trace->read)
+ return -EOPNOTSUPP;
+
+ if (size < (trace->ds.top - trace->ds.begin))
return -EIO;
- error = ds_access_bts(child, 0, (const void **)&raw);
- if (error < 0)
- return error;
+ for (at = trace->ds.begin; (void *)at < trace->ds.top;
+ out++, drained++, at += trace->ds.size) {
+ struct bts_struct bts;
+ int error;
- for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
- ptrace_bts_translate_record(&ret, raw);
+ error = trace->read(child->bts, at, &bts);
+ if (error < 0)
+ return error;
- if (copy_to_user(out, &ret, sizeof(ret)))
+ if (copy_to_user(out, &bts, sizeof(bts)))
return -EFAULT;
}
- error = ds_clear_bts(child);
+ memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
+
+ error = ds_reset_bts(child->bts);
if (error < 0)
return error;
- return end;
+ return drained;
}
-static void ptrace_bts_ovfl(struct task_struct *child)
+static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
{
- send_sig(child->thread.bts_ovfl_signal, child, 0);
+ child->bts_buffer = alloc_locked_buffer(size);
+ if (!child->bts_buffer)
+ return -ENOMEM;
+
+ child->bts_size = size;
+
+ return 0;
+}
+
+static void ptrace_bts_free_buffer(struct task_struct *child)
+{
+ free_locked_buffer(child->bts_buffer, child->bts_size);
+ child->bts_buffer = NULL;
+ child->bts_size = 0;
}
static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int error = 0;
-
- error = -EOPNOTSUPP;
- if (!bts_cfg.sizeof_bts)
- goto errout;
+ unsigned int flags = 0;
- error = -EIO;
if (cfg_size < sizeof(cfg))
- goto errout;
+ return -EIO;
- error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- goto errout;
+ return -EFAULT;
- error = -EINVAL;
- if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
- !(cfg.flags & PTRACE_BTS_O_ALLOC))
- goto errout;
+ if (child->bts) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
+ }
- if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = NULL;
- unsigned int sig = 0;
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+ if (!cfg.signal)
+ return -EINVAL;
- /* we ignore the error in case we were not tracing child */
- (void)ds_release_bts(child);
+ return -EOPNOTSUPP;
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- goto errout;
+ child->thread.bts_ovfl_signal = cfg.signal;
+ }
- sig = cfg.signal;
- ovfl = ptrace_bts_ovfl;
- }
+ if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
+ (cfg.size != child->bts_size)) {
+ int error;
- error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
- if (error < 0)
- goto errout;
+ ptrace_bts_free_buffer(child);
- child->thread.bts_ovfl_signal = sig;
+ error = ptrace_bts_allocate_buffer(child, cfg.size);
+ if (error < 0)
+ return error;
}
- error = -EINVAL;
- if (!child->thread.ds_ctx && cfg.flags)
- goto errout;
-
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
- else
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ flags |= BTS_USER;
if (cfg.flags & PTRACE_BTS_O_SCHED)
- set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- else
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ flags |= BTS_TIMESTAMPS;
- error = sizeof(cfg);
+ child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
+ /* ovfl = */ NULL, /* th = */ (size_t)-1,
+ flags);
+ if (IS_ERR(child->bts)) {
+ int error = PTR_ERR(child->bts);
-out:
- if (child->thread.debugctlmsr)
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- else
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ ptrace_bts_free_buffer(child);
+ child->bts = NULL;
- return error;
+ return error;
+ }
-errout:
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- goto out;
+ return sizeof(cfg);
}
static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ const struct bts_trace *trace;
struct ptrace_bts_config cfg;
- size_t end;
- const void *base, *max;
- int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- error = ds_get_bts_end(child, &end);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ 0, &base);
- if (error < 0)
- return error;
-
- error = ds_access_bts(child, /* index = */ end, &max);
- if (error < 0)
- return error;
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = (max - base);
+ cfg.size = trace->ds.end - trace->ds.begin;
cfg.signal = child->thread.bts_ovfl_signal;
cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ if (trace->ds.flags & BTS_USER)
cfg.flags |= PTRACE_BTS_O_TRACE;
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ if (trace->ds.flags & BTS_TIMESTAMPS)
cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,110 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
-static int ptrace_bts_write_record(struct task_struct *child,
- const struct bts_struct *in)
+static int ptrace_bts_clear(struct task_struct *child)
{
- unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+ const struct bts_trace *trace;
- BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- memset(bts_record, 0, bts_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+ memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- case BTS_BRANCH:
- bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
- bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
- break;
+ return ds_reset_bts(child->bts);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- bts_set(bts_record, bts_from, bts_escape);
- bts_set(bts_record, bts_qual, in->qualifier);
- bts_set(bts_record, bts_jiffies, in->variant.jiffies);
- break;
+static int ptrace_bts_size(struct task_struct *child)
+{
+ const struct bts_trace *trace;
- default:
- return -EINVAL;
- }
+ trace = ds_read_bts(child->bts);
+ if (!trace)
+ return -EPERM;
- /* The writing task will be the switched-to task on a context
- * switch. It needs to write into the switched-from task's BTS
- * buffer. */
- return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+ return (trace->ds.top - trace->ds.begin) / trace->ds.size;
}
-void ptrace_bts_take_timestamp(struct task_struct *tsk,
- enum bts_qualifier qualifier)
+static void ptrace_bts_fork(struct task_struct *tsk)
{
- struct bts_struct rec = {
- .qualifier = qualifier,
- .variant.jiffies = jiffies_64
- };
-
- ptrace_bts_write_record(tsk, &rec);
+ tsk->bts = NULL;
+ tsk->bts_buffer = NULL;
+ tsk->bts_size = 0;
+ tsk->thread.bts_ovfl_signal = 0;
}
-static const struct bts_configuration bts_cfg_netburst = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
+static void ptrace_bts_untrace(struct task_struct *child)
+{
+ if (unlikely(child->bts)) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
+
+ /* We cannot update total_vm and locked_vm since
+ child's mm is already gone. But we can reclaim the
+ memory. */
+ kfree(child->bts_buffer);
+ child->bts_buffer = NULL;
+ child->bts_size = 0;
+ }
+}
-static const struct bts_configuration bts_cfg_pentium_m = {
- .sizeof_bts = sizeof(long) * 3,
- .sizeof_field = sizeof(long),
- .debugctl_mask = (1<<6)|(1<<7)
-};
+static void ptrace_bts_detach(struct task_struct *child)
+{
+ if (unlikely(child->bts)) {
+ ds_release_bts(child->bts);
+ child->bts = NULL;
-static const struct bts_configuration bts_cfg_core2 = {
- .sizeof_bts = 8 * 3,
- .sizeof_field = 8,
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
+ ptrace_bts_free_buffer(child);
+ }
+}
+#else
+static inline void ptrace_bts_fork(struct task_struct *tsk) {}
+static inline void ptrace_bts_detach(struct task_struct *child) {}
+static inline void ptrace_bts_untrace(struct task_struct *child) {}
+#endif /* CONFIG_X86_PTRACE_BTS */
-static inline void bts_configure(const struct bts_configuration *cfg)
+void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
{
- bts_cfg = *cfg;
+ ptrace_bts_fork(child);
}
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+void x86_ptrace_untrace(struct task_struct *child)
{
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0xD:
- case 0xE: /* Pentium M */
- bts_configure(&bts_cfg_pentium_m);
- break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
- bts_configure(&bts_cfg_core2);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- case 0xF:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- bts_configure(&bts_cfg_netburst);
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
- break;
- default:
- /* sorry, don't know about them */
- break;
- }
+ ptrace_bts_untrace(child);
}
-#endif /* CONFIG_X86_PTRACE_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -972,15 +844,7 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
-#ifdef CONFIG_X86_PTRACE_BTS
- (void)ds_release_bts(child);
-
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-#endif /* CONFIG_X86_PTRACE_BTS */
+ ptrace_bts_detach(child);
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1112,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ NULL);
+ ret = ptrace_bts_size(child);
break;
case PTRACE_BTS_GET:
@@ -1121,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ds_clear_bts(child);
+ ret = ptrace_bts_clear(child);
break;
case PTRACE_BTS_DRAIN:
@@ -1384,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
+#ifdef CONFIG_X86_PTRACE_BTS
+ case PTRACE_BTS_CONFIG:
+ case PTRACE_BTS_STATUS:
+ case PTRACE_BTS_SIZE:
+ case PTRACE_BTS_GET:
+ case PTRACE_BTS_CLEAR:
+ case PTRACE_BTS_DRAIN:
+#endif /* CONFIG_X86_PTRACE_BTS */
return arch_ptrace(child, request, addr, data);
default:
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
return dst->version;
}
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+ u64 pv_tsc_khz = 1000000ULL << 32;
+
+ do_div(pv_tsc_khz, src->tsc_to_system_mul);
+ if (src->tsc_shift < 0)
+ pv_tsc_khz <<= -src->tsc_shift;
+ else
+ pv_tsc_khz >>= src->tsc_shift;
+ return pv_tsc_khz;
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index f6a11b9b1f98..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
if (!(word & (1 << 13))) {
dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
"disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
- irqbalance_disable("");
-#endif
noirqdebug_setup("");
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
@@ -171,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+ ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
ich_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f4c93f1cfc19..61f718df6eec 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -21,6 +21,9 @@
# include <asm/iommu.h>
#endif
+#include <mach_ipi.h>
+
+
/*
* Power off function, if any
*/
@@ -29,18 +32,17 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-/*
- * Keyboard reset and triple fault may result in INIT, not RESET, which
- * doesn't work when we're in vmx root mode. Try ACPI first.
- */
-enum reboot_type reboot_type = BOOT_ACPI;
+enum reboot_type reboot_type = BOOT_KBD;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
static int reboot_cpu = -1;
#endif
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
warm Don't set the cold reboot flag
cold Set the cold reboot flag
bios Reboot by jumping through the BIOS (only for X86_32)
@@ -49,6 +51,7 @@ static int reboot_cpu = -1;
kbd Use the keyboard controller. cold reset (default)
acpi Use the RESET_REG in the FADT
efi Use efi reset_system runtime service
+ pci Use the so-called "PCI reset register", CF9
force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
@@ -83,6 +86,7 @@ static int __init reboot_setup(char *str)
case 'k':
case 't':
case 'e':
+ case 'p':
reboot_type = *str;
break;
@@ -173,6 +177,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 330",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -399,12 +412,27 @@ static void native_machine_emergency_restart(void)
reboot_type = BOOT_KBD;
break;
-
case BOOT_EFI:
if (efi_enabled)
- efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+ efi.reset_system(reboot_mode ?
+ EFI_RESET_WARM :
+ EFI_RESET_COLD,
EFI_SUCCESS, 0, NULL);
+ reboot_type = BOOT_KBD;
+ break;
+
+ case BOOT_CF9:
+ port_cf9_safe = true;
+ /* fall through */
+ case BOOT_CF9_COND:
+ if (port_cf9_safe) {
+ u8 cf9 = inb(0xcf9) & ~6;
+ outb(cf9|2, 0xcf9); /* Request hard reset */
+ udelay(50);
+ outb(cf9|6, 0xcf9); /* Actually do the reset */
+ udelay(50);
+ }
reboot_type = BOOT_KBD;
break;
}
@@ -465,6 +493,11 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
+ /* stop other cpus and apics */
+ machine_shutdown();
+
+ /* stop this cpu */
+ stop_this_cpu(NULL);
}
static void native_machine_power_off(void)
@@ -518,3 +551,95 @@ void machine_crash_shutdown(struct pt_regs *regs)
machine_ops.crash_shutdown(regs);
}
#endif
+
+
+#if defined(CONFIG_SMP)
+
+/* This keeps a track of which one is crashing cpu. */
+static int crashing_cpu;
+static nmi_shootdown_cb shootdown_callback;
+
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ cpu = raw_smp_processor_id();
+
+ /* Don't do anything if this handler is invoked on crashing cpu.
+ * Otherwise, system will completely hang. Crashing cpu can get
+ * an NMI if system was initially booted with nmi_watchdog parameter.
+ */
+ if (cpu == crashing_cpu)
+ return NOTIFY_STOP;
+ local_irq_disable();
+
+ shootdown_callback(cpu, (struct die_args *)data);
+
+ atomic_dec(&waiting_for_crash_ipi);
+ /* Assume hlt works */
+ halt();
+ for (;;)
+ cpu_relax();
+
+ return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(safe_smp_processor_id(), mask);
+ if (!cpus_empty(mask))
+ send_IPI_mask(mask, NMI_VECTOR);
+}
+
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
+/* Halt all other CPUs, calling the specified function on each of them
+ *
+ * This function can be used to halt all other CPUs on crash
+ * or emergency reboot time. The function passed as parameter
+ * will be called inside a NMI handler on all CPUs.
+ */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ unsigned long msecs;
+ local_irq_disable();
+
+ /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ crashing_cpu = safe_smp_processor_id();
+
+ shootdown_callback = callback;
+
+ atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+ /* Would it be better to replace the trap vector here? */
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
+ /* Ensure the new callback function is set before sending
+ * out the NMI
+ */
+ wmb();
+
+ smp_send_nmi_allbutself();
+
+ msecs = 1000; /* Wait at most a second for the other cpus to stop */
+ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+ mdelay(1);
+ msecs--;
+ }
+
+ /* Leave the nmi callback set */
+}
+#else /* !CONFIG_SMP */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ /* No other CPUs to shoot down */
+}
+#endif
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 6f50664b2ba5..a160f3119725 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -10,15 +10,12 @@
#include <asm/page.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
-#include <asm/pgtable.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 2)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define PAE_PGD_ATTR (_PAGE_PRESENT)
/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
@@ -39,7 +36,6 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .align PAGE_SIZE
.globl relocate_kernel
relocate_kernel:
/* Save the CPU context, used for jumping back */
@@ -60,117 +56,6 @@ relocate_kernel:
movl %cr4, %eax
movl %eax, CR4(%edi)
-#ifdef CONFIG_X86_PAE
- /* map the control page at its virtual address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0xc0000000, %eax
- shrl $27, %eax
- addl %edi, %eax
-
- movl PTR(PA_PMD_0)(%ebp), %edx
- orl $PAE_PGD_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PMD_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x3fe00000, %eax
- shrl $18, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_0)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x001ff000, %eax
- shrl $9, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- /* identity map the control page at its physical address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0xc0000000, %eax
- shrl $27, %eax
- addl %edi, %eax
-
- movl PTR(PA_PMD_1)(%ebp), %edx
- orl $PAE_PGD_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PMD_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x3fe00000, %eax
- shrl $18, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_1)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x001ff000, %eax
- shrl $9, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-#else
- /* map the control page at its virtual address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0xffc00000, %eax
- shrl $20, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_0)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x003ff000, %eax
- shrl $10, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- /* identity map the control page at its physical address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0xffc00000, %eax
- shrl $20, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_1)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x003ff000, %eax
- shrl $10, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-#endif
-
-relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
movl 20+4(%esp), %ebx /* page_list */
movl 20+8(%esp), %ebp /* list of pages */
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 0a23b5795b25..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
cmos_minutes = CMOS_READ(RTC_MINUTES);
if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- BCD_TO_BIN(cmos_minutes);
+ cmos_minutes = bcd2bin(cmos_minutes);
/*
* since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
if (abs(real_minutes - cmos_minutes) < 30) {
if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- BIN_TO_BCD(real_seconds);
- BIN_TO_BCD(real_minutes);
+ real_seconds = bin2bcd(real_seconds);
+ real_minutes = bin2bcd(real_minutes);
}
CMOS_WRITE(real_seconds,RTC_SECONDS);
CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
- BCD_TO_BIN(sec);
- BCD_TO_BIN(min);
- BCD_TO_BIN(hour);
- BCD_TO_BIN(day);
- BCD_TO_BIN(mon);
- BCD_TO_BIN(year);
+ sec = bcd2bin(sec);
+ min = bcd2bin(min);
+ hour = bcd2bin(hour);
+ day = bcd2bin(day);
+ mon = bcd2bin(mon);
+ year = bcd2bin(year);
}
if (century) {
- BCD_TO_BIN(century);
+ century = bcd2bin(century);
year += century * 100;
printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2255782e8d4b..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -93,11 +93,13 @@
#include <asm/desc.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <mach_apic.h>
#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
#include <asm/percpu.h>
#include <asm/topology.h>
@@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void)
* @size: Size of the crashkernel memory to reserve.
* Returns the base address on success, and -1ULL on failure.
*/
+static
unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
{
const unsigned long long alignment = 16<<20; /* 16M */
@@ -561,7 +564,13 @@ static void __init reserve_standard_io_resources(void)
}
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+
+#ifdef CONFIG_CRASH_DUMP
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel. This option will be passed
* by kexec loader to the capture kernel.
@@ -577,161 +586,24 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static struct x86_quirks default_x86_quirks __initdata;
-
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
-
-/*
- * Some BIOSes seem to corrupt the low 64k of memory during events
- * like suspend/resume and unplugging an HDMI cable. Reserve all
- * remaining free memory in that area and fill it with a distinct
- * pattern.
- */
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
-#define MAX_SCAN_AREAS 8
-
-static int __read_mostly memory_corruption_check = -1;
-
-static unsigned __read_mostly corruption_check_size = 64*1024;
-static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
-static int num_scan_areas;
-
-
-static int set_corruption_check(char *arg)
+static int __init default_update_genapic(void)
{
- char *end;
-
- memory_corruption_check = simple_strtol(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check", set_corruption_check);
-
-static int set_corruption_check_period(char *arg)
-{
- char *end;
-
- corruption_check_period = simple_strtoul(arg, &end, 10);
-
- return (*end == 0) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check_period", set_corruption_check_period);
-
-static int set_corruption_check_size(char *arg)
-{
- char *end;
- unsigned size;
-
- size = memparse(arg, &end);
-
- if (*end == '\0')
- corruption_check_size = size;
-
- return (size == corruption_check_size) ? 0 : -EINVAL;
-}
-early_param("memory_corruption_check_size", set_corruption_check_size);
-
-
-static void __init setup_bios_corruption_check(void)
-{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
-
- if (memory_corruption_check == -1) {
- memory_corruption_check =
-#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
- 1
-#else
- 0
+#ifdef CONFIG_X86_SMP
+# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
+# endif
#endif
- ;
- }
-
- if (corruption_check_size == 0)
- memory_corruption_check = 0;
-
- if (!memory_corruption_check)
- return;
-
- corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
-
- while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
-
- if (addr == 0)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- if (size == 0)
- break;
-
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
-
- /* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
-
- addr += size;
- }
-
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
- num_scan_areas);
- update_e820();
-}
-
-static struct timer_list periodic_check_timer;
-void check_for_bios_corruption(void)
-{
- int i;
- int corruption = 0;
-
- if (!memory_corruption_check)
- return;
-
- for(i = 0; i < num_scan_areas; i++) {
- unsigned long *addr = __va(scan_areas[i].addr);
- unsigned long size = scan_areas[i].size;
-
- for(; size; addr++, size -= sizeof(unsigned long)) {
- if (!*addr)
- continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
- corruption = 1;
- *addr = 0;
- }
- }
-
- WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
-}
-
-static void periodic_check_for_corruption(unsigned long data)
-{
- check_for_bios_corruption();
- mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
+ return 0;
}
-void start_periodic_check_for_corruption(void)
-{
- if (!memory_corruption_check || corruption_check_period == 0)
- return;
-
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+static struct x86_quirks default_x86_quirks __initdata = {
+ .update_genapic = default_update_genapic,
+};
- init_timer(&periodic_check_timer);
- periodic_check_timer.function = &periodic_check_for_corruption;
- periodic_check_for_corruption(0);
-}
-#endif
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+#ifdef CONFIG_X86_RESERVE_LOW_64K
static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
{
printk(KERN_NOTICE
@@ -743,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
return 0;
}
+#endif
/* List of systems that have known low memory corruption BIOS problems */
static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -758,7 +631,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
.callback = dmi_low_memory_corruption,
.ident = "Phoenix BIOS",
.matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
},
},
#endif
@@ -788,6 +661,9 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
+ /* VMI may relocate the fixmap; do this before touching ioremap area */
+ vmi_init();
+
early_cpu_init();
early_ioremap_init();
@@ -874,13 +750,8 @@ void __init setup_arch(char **cmdline_p)
check_efer();
#endif
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
- /*
- * Must be before kernel pagetables are setup
- * or fixmap area is touched.
- */
- vmi_init();
-#endif
+ /* Must be before kernel pagetables are setup */
+ vmi_activate();
/* after early param, so could get panic from serial */
reserve_early_setup_data();
@@ -903,6 +774,12 @@ void __init setup_arch(char **cmdline_p)
dmi_check_system(bad_bios_dmi_table);
+ /*
+ * VMware detection requires dmi to be available, so this
+ * needs to be done after dmi_scan_machine, for the BP.
+ */
+ init_hypervisor(&boot_cpu_data);
+
#ifdef CONFIG_X86_32
probe_roms();
#endif
@@ -1067,6 +944,7 @@ void __init setup_arch(char **cmdline_p)
#endif
prefill_possible_map();
+
#ifdef CONFIG_X86_64
init_cpu_to_node();
#endif
@@ -1074,6 +952,9 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ /* need to wait for io_apic is mapped */
+ probe_nr_irqs_gsi();
+
kvm_guest_init();
e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0e67f72d9316..ae0c0d3bb770 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,25 +140,30 @@ static void __init setup_cpu_pda_map(void)
*/
void __init setup_per_cpu_areas(void)
{
- ssize_t size = PERCPU_ENOUGH_ROOM;
+ ssize_t size, old_size;
char *ptr;
int cpu;
+ unsigned long align = 1;
/* Setup cpu_pda map */
setup_cpu_pda_map();
/* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
+ old_size = PERCPU_ENOUGH_ROOM;
+ align = max_t(unsigned long, PAGE_SIZE, align);
+ size = roundup(old_size, align);
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
size);
for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
#else
int node = early_cpu_to_node(cpu);
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = alloc_bootmem_pages(size);
+ ptr = __alloc_bootmem(size, align,
+ __pa(MAX_DMA_ADDRESS));
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
@@ -167,7 +172,8 @@ void __init setup_per_cpu_areas(void)
cpu, __pa(ptr));
}
else {
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+ __pa(MAX_DMA_ADDRESS));
if (ptr)
printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
cpu, node, __pa(ptr));
@@ -175,7 +181,6 @@ void __init setup_per_cpu_areas(void)
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
}
printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -213,7 +218,7 @@ static void __init setup_node_to_cpumask_map(void)
/* allocate the map */
map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
- pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+ pr_debug("Node to cpumask map at %p for %d nodes\n",
map, nr_node_ids);
/* node_to_cpumask() will now work */
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce4..000000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifdef CONFIG_X86_32
-struct sigframe {
- char __user *pretcode;
- int sig;
- struct sigcontext sc;
- /*
- * fpstate is unused. fpstate is moved/allocated after
- * retcode[] below. This movement allows to have the FP state and the
- * future state extensions (xsave) stay together.
- * And at the same time retaining the unused fpstate, prevents changing
- * the offset of extramask[] in the sigframe and thus prevent any
- * legacy application accessing/modifying it.
- */
- struct _fpstate fpstate_unused;
- unsigned long extramask[_NSIG_WORDS-1];
- char retcode[8];
- /* fp state follows here */
-};
-
-struct rt_sigframe {
- char __user *pretcode;
- int sig;
- struct siginfo __user *pinfo;
- void __user *puc;
- struct siginfo info;
- struct ucontext uc;
- char retcode[8];
- /* fp state follows here */
-};
-#else
-struct rt_sigframe {
- char __user *pretcode;
- struct ucontext uc;
- struct siginfo info;
- /* fp state follows here */
-};
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f22..89bb7668041d 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
/*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-2002 x86-64 support by Andi Kleen
*/
-#include <linux/list.h>
-#include <linux/personality.h>
-#include <linux/binfmts.h>
-#include <linux/suspend.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/ptrace.h>
#include <linux/signal.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
#include <linux/errno.h>
-#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/ptrace.h>
#include <linux/tracehook.h>
-#include <linux/elf.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#include <asm/ia32_unistd.h>
+#include <asm/mce.h>
+#endif /* CONFIG_X86_64 */
+
#include <asm/syscall.h>
#include <asm/syscalls.h>
-#include "sigframe.h"
+#include <asm/sigframe.h>
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -45,74 +50,6 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
-
- return -ERESTARTNOHAND;
-}
-
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
-
- if (act) {
- old_sigset_t mask;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
- return -EFAULT;
-
- __get_user(new_ka.sa.sa_flags, &act->sa_flags);
- __get_user(mask, &act->sa_mask);
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
- return -EFAULT;
-
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- }
-
- return ret;
-}
-
-asmlinkage int sys_sigaltstack(unsigned long bx)
-{
- /*
- * This is needed to make gcc realize it doesn't own the
- * "struct pt_regs"
- */
- struct pt_regs *regs = (struct pt_regs *)&bx;
- const stack_t __user *uss = (const stack_t __user *)bx;
- stack_t __user *uoss = (stack_t __user *)regs->cx;
-
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-
#define COPY(x) { \
err |= __get_user(regs->x, &sc->x); \
}
@@ -123,7 +60,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
regs->seg = tmp; \
}
-#define COPY_SEG_STRICT(seg) { \
+#define COPY_SEG_CPL3(seg) { \
unsigned short tmp; \
err |= __get_user(tmp, &sc->seg); \
regs->seg = tmp | 3; \
@@ -135,9 +72,6 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
loadsegment(seg, tmp); \
}
-/*
- * Do a signal return; undo the signal stack.
- */
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned long *pax)
@@ -149,14 +83,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
+#ifdef CONFIG_X86_32
GET_SEG(gs);
COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
+#endif /* CONFIG_X86_32 */
+
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
- COPY_SEG_STRICT(cs);
- COPY_SEG_STRICT(ss);
+
+#ifdef CONFIG_X86_64
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_32
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
+#else /* !CONFIG_X86_32 */
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ COPY_SEG_CPL3(cs);
+#endif /* CONFIG_X86_32 */
err |= __get_user(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -169,102 +125,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
return err;
}
-asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
-{
- struct sigframe __user *frame;
- struct pt_regs *regs;
- unsigned long ax;
- sigset_t set;
-
- regs = (struct pt_regs *) &__unused;
- frame = (struct sigframe __user *)(regs->sp - 8);
-
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
- && __copy_from_user(&set.sig[1], &frame->extramask,
- sizeof(frame->extramask))))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (restore_sigcontext(regs, &frame->sc, &ax))
- goto badframe;
- return ax;
-
-badframe:
- if (show_unhandled_signals && printk_ratelimit()) {
- printk("%s%s[%d] bad frame in sigreturn frame:"
- "%p ip:%lx sp:%lx oeax:%lx",
- task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
- current->comm, task_pid_nr(current), frame, regs->ip,
- regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
- }
-
- force_sig(SIGSEGV, current);
-
- return 0;
-}
-
-static long do_rt_sigreturn(struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
-
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
-
- return ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
-
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
-{
- struct pt_regs *regs = (struct pt_regs *)&__unused;
-
- return do_rt_sigreturn(regs);
-}
-
-/*
- * Set up a signal frame.
- */
static int
setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask)
{
- int tmp, err = 0;
+ int err = 0;
- err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
- savesegment(gs, tmp);
- err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+#ifdef CONFIG_X86_32
+ {
+ unsigned int tmp;
+ savesegment(gs, tmp);
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+ }
+ err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
+#endif /* CONFIG_X86_32 */
+
err |= __put_user(regs->di, &sc->di);
err |= __put_user(regs->si, &sc->si);
err |= __put_user(regs->bp, &sc->bp);
@@ -273,19 +151,33 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
err |= __put_user(regs->dx, &sc->dx);
err |= __put_user(regs->cx, &sc->cx);
err |= __put_user(regs->ax, &sc->ax);
+#ifdef CONFIG_X86_64
+ err |= __put_user(regs->r8, &sc->r8);
+ err |= __put_user(regs->r9, &sc->r9);
+ err |= __put_user(regs->r10, &sc->r10);
+ err |= __put_user(regs->r11, &sc->r11);
+ err |= __put_user(regs->r12, &sc->r12);
+ err |= __put_user(regs->r13, &sc->r13);
+ err |= __put_user(regs->r14, &sc->r14);
+ err |= __put_user(regs->r15, &sc->r15);
+#endif /* CONFIG_X86_64 */
+
err |= __put_user(current->thread.trap_no, &sc->trapno);
err |= __put_user(current->thread.error_code, &sc->err);
err |= __put_user(regs->ip, &sc->ip);
+#ifdef CONFIG_X86_32
err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
err |= __put_user(regs->flags, &sc->flags);
err |= __put_user(regs->sp, &sc->sp_at_signal);
err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
+#else /* !CONFIG_X86_32 */
+ err |= __put_user(regs->flags, &sc->flags);
+ err |= __put_user(regs->cs, &sc->cs);
+ err |= __put_user(0, &sc->gs);
+ err |= __put_user(0, &sc->fs);
+#endif /* CONFIG_X86_32 */
- tmp = save_i387_xstate(fpstate);
- if (tmp < 0)
- err = 1;
- else
- err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
+ err |= __put_user(fpstate, &sc->fpstate);
/* non-iBCS2 extensions.. */
err |= __put_user(mask, &sc->oldmask);
@@ -295,6 +187,32 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
}
/*
+ * Set up a signal frame.
+ */
+#ifdef CONFIG_X86_32
+static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+} __attribute__((packed)) retcode = {
+ 0xb858, /* popl %eax; movl $..., %eax */
+ __NR_sigreturn,
+ 0x80cd, /* int $0x80 */
+};
+
+static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+} __attribute__((packed)) rt_retcode = {
+ 0xb8, /* movl $..., %eax */
+ __NR_rt_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0
+};
+
+/*
* Determine which stack to use..
*/
static inline void __user *
@@ -328,6 +246,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (used_math()) {
sp = sp - sig_xstate_size;
*fpstate = (struct _fpstate *) sp;
+ if (save_i387_xstate(*fpstate) < 0)
+ return (void __user *)-1L;
}
sp -= frame_size;
@@ -383,9 +303,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
* reasons and because gdb uses it as a signature to notice
* signal handler stack frames.
*/
- err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
- err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
- err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+ err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
if (err)
return -EFAULT;
@@ -454,9 +372,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
* reasons and because gdb uses it as a signature to notice
* signal handler stack frames.
*/
- err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
- err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
- err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+ err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
if (err)
return -EFAULT;
@@ -475,23 +391,293 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return 0;
}
+#else /* !CONFIG_X86_32 */
+/*
+ * Determine which stack to use..
+ */
+static void __user *
+get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
+{
+ /* Default to using normal stack - redzone*/
+ sp -= 128;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(sp) == 0)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ }
+
+ return (void __user *)round_down(sp - size, 64);
+}
+
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ int err = 0;
+ struct task_struct *me = current;
+
+ if (used_math()) {
+ fp = get_stack(ka, regs->sp, sig_xstate_size);
+ frame = (void __user *)round_down(
+ (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
+
+ if (save_i387_xstate(fp) < 0)
+ return -EFAULT;
+ } else
+ frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, info))
+ return -EFAULT;
+ }
+
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->sp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ka->sa.sa_flags & SA_RESTORER) {
+ err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+ } else {
+ /* could use a vstub here */
+ return -EFAULT;
+ }
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ka->sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /* Set up the CS register to run signal handlers in 64-bit mode,
+ even if the handler happens to be interrupting 32-bit code. */
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_32
+/*
+ * Atomically swap in the new signal mask, and wait for a signal.
+ */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+ current->saved_sigmask = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_restore_sigmask();
+
+ return -ERESTARTNOHAND;
+}
+
+asmlinkage int
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+ struct old_sigaction __user *oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+ return -EFAULT;
+
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ __get_user(mask, &act->sa_mask);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+ return -EFAULT;
+
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_32
+asmlinkage int sys_sigaltstack(unsigned long bx)
+{
+ /*
+ * This is needed to make gcc realize it doesn't own the
+ * "struct pt_regs"
+ */
+ struct pt_regs *regs = (struct pt_regs *)&bx;
+ const stack_t __user *uss = (const stack_t __user *)bx;
+ stack_t __user *uoss = (stack_t __user *)regs->cx;
+
+ return do_sigaltstack(uss, uoss, regs->sp);
+}
+#else /* !CONFIG_X86_32 */
+asmlinkage long
+sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+ struct pt_regs *regs)
+{
+ return do_sigaltstack(uss, uoss, regs->sp);
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#ifdef CONFIG_X86_32
+asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
+{
+ struct sigframe __user *frame;
+ struct pt_regs *regs;
+ unsigned long ax;
+ sigset_t set;
+
+ regs = (struct pt_regs *) &__unused;
+ frame = (struct sigframe __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->sc, &ax))
+ goto badframe;
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "sigreturn");
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+static long do_rt_sigreturn(struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ unsigned long ax;
+ sigset_t set;
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ goto badframe;
+
+ if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
+ goto badframe;
+
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage int sys_rt_sigreturn(struct pt_regs regs)
+{
+ return do_rt_sigreturn(&regs);
+}
+#else /* !CONFIG_X86_32 */
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+ return do_rt_sigreturn(regs);
+}
+#endif /* CONFIG_X86_32 */
/*
* OK, we're invoking a handler:
*/
static int signr_convert(int sig)
{
+#ifdef CONFIG_X86_32
struct thread_info *info = current_thread_info();
if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
return info->exec_domain->signal_invmap[sig];
+#endif /* CONFIG_X86_32 */
return sig;
}
+#ifdef CONFIG_X86_32
+
#define is_ia32 1
#define ia32_setup_frame __setup_frame
#define ia32_setup_rt_frame __setup_rt_frame
+#else /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_IA32_EMULATION
+#define is_ia32 test_thread_flag(TIF_IA32)
+#else /* !CONFIG_IA32_EMULATION */
+#define is_ia32 0
+#endif /* CONFIG_IA32_EMULATION */
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs *regs);
+
+#endif /* CONFIG_X86_32 */
+
static int
setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
@@ -592,7 +778,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
return 0;
}
+#ifdef CONFIG_X86_32
#define NR_restart_syscall __NR_restart_syscall
+#else /* !CONFIG_X86_32 */
+#define NR_restart_syscall \
+ test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
+#endif /* CONFIG_X86_32 */
+
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +896,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
- printk(KERN_INFO
+ printk("%s"
"%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+ task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db9..000000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- *
- * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
- * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
- * 2000-2002 x86-64 support by Andi Kleen
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/tracehook.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/compiler.h>
-#include <linux/uaccess.h>
-
-#include <asm/processor.h>
-#include <asm/ucontext.h>
-#include <asm/i387.h>
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#include <asm/mce.h>
-#include <asm/syscall.h>
-#include <asm/syscalls.h>
-#include "sigframe.h"
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-#ifdef CONFIG_X86_32
-# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
-#else
-# define FIX_EFLAGS __FIX_EFLAGS
-#endif
-
-asmlinkage long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- struct pt_regs *regs)
-{
- return do_sigaltstack(uss, uoss, regs->sp);
-}
-
-#define COPY(x) { \
- err |= __get_user(regs->x, &sc->x); \
-}
-
-#define COPY_SEG_STRICT(seg) { \
- unsigned short tmp; \
- err |= __get_user(tmp, &sc->seg); \
- regs->seg = tmp | 3; \
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
-{
- void __user *buf;
- unsigned int tmpflags;
- unsigned int err = 0;
-
- /* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
- COPY(r8);
- COPY(r9);
- COPY(r10);
- COPY(r11);
- COPY(r12);
- COPY(r13);
- COPY(r14);
- COPY(r15);
-
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_STRICT(cs);
-
- err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
-
- err |= __get_user(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
-
- err |= __get_user(*pax, &sc->ax);
- return err;
-}
-
-static long do_rt_sigreturn(struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
-
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
-
- return ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
-
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
-{
- return do_rt_sigreturn(regs);
-}
-
-/*
- * Set up a signal frame.
- */
-
-static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
- unsigned long mask, struct task_struct *me)
-{
- int err = 0;
-
- err |= __put_user(regs->cs, &sc->cs);
- err |= __put_user(0, &sc->gs);
- err |= __put_user(0, &sc->fs);
-
- err |= __put_user(regs->di, &sc->di);
- err |= __put_user(regs->si, &sc->si);
- err |= __put_user(regs->bp, &sc->bp);
- err |= __put_user(regs->sp, &sc->sp);
- err |= __put_user(regs->bx, &sc->bx);
- err |= __put_user(regs->dx, &sc->dx);
- err |= __put_user(regs->cx, &sc->cx);
- err |= __put_user(regs->ax, &sc->ax);
- err |= __put_user(regs->r8, &sc->r8);
- err |= __put_user(regs->r9, &sc->r9);
- err |= __put_user(regs->r10, &sc->r10);
- err |= __put_user(regs->r11, &sc->r11);
- err |= __put_user(regs->r12, &sc->r12);
- err |= __put_user(regs->r13, &sc->r13);
- err |= __put_user(regs->r14, &sc->r14);
- err |= __put_user(regs->r15, &sc->r15);
- err |= __put_user(me->thread.trap_no, &sc->trapno);
- err |= __put_user(me->thread.error_code, &sc->err);
- err |= __put_user(regs->ip, &sc->ip);
- err |= __put_user(regs->flags, &sc->flags);
- err |= __put_user(mask, &sc->oldmask);
- err |= __put_user(me->thread.cr2, &sc->cr2);
-
- return err;
-}
-
-/*
- * Determine which stack to use..
- */
-
-static void __user *
-get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
-{
- unsigned long sp;
-
- /* Default to using normal stack - redzone*/
- sp = regs->sp - 128;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
- sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- return (void __user *)round_down(sp - size, 64);
-}
-
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- int err = 0;
- struct task_struct *me = current;
-
- if (used_math()) {
- fp = get_stack(ka, regs, sig_xstate_size);
- frame = (void __user *)round_down(
- (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
- if (save_i387_xstate(fp) < 0)
- return -EFAULT;
- } else
- frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
- return -EFAULT;
- }
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
- err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
- __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
- } else
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- /* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
- } else {
- /* could use a vstub here */
- return -EFAULT;
- }
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
- regs->cs = __USER_CS;
-
- return 0;
-}
-
-/*
- * OK, we're invoking a handler
- */
-static int signr_convert(int sig)
-{
- return sig;
-}
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else
-#define is_ia32 0
-#endif
-
-static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- int usig = signr_convert(sig);
- int ret;
-
- /* Set up the stack frame */
- if (is_ia32) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
- else
- ret = ia32_setup_frame(usig, ka, set, regs);
- } else
- ret = __setup_rt_frame(sig, ka, info, set, regs);
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
- }
-
- return ret;
-}
-
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
-{
- int ret;
-
- /* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
- /* If so, check system call restarting.. */
- switch (syscall_get_error(current, regs)) {
- case -ERESTART_RESTARTBLOCK:
- case -ERESTARTNOHAND:
- regs->ax = -EINTR;
- break;
-
- case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
- regs->ax = -EINTR;
- break;
- }
- /* fallthrough */
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
- break;
- }
- }
-
- /*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
- */
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
-
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
-
- if (ret)
- return ret;
-
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
-#endif
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
-}
-
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- */
-static void do_signal(struct pt_regs *regs)
-{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
- sigset_t *oldset;
-
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
- /*
- * Re-enable any watchpoints before delivering the
- * signal to user space. The processor register will
- * have been cleared if the watchpoint triggered
- * inside the kernel.
- */
- if (current->thread.debugreg7)
- set_debugreg(current->thread.debugreg7, 7);
-
- /* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
- return;
- }
-
- /* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
- /* Restart the system call - no handlers present */
- switch (syscall_get_error(current, regs)) {
- case -ERESTARTNOHAND:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
- break;
-
- case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
- regs->ip -= 2;
- break;
- }
- }
-
- /*
- * If there's no signal to deliver, we just put the saved sigmask
- * back.
- */
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
- }
-}
-
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_user();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
- /* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
-
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
-}
-
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
-
- if (show_unhandled_signals && printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm, me->pid, where, frame,
- regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
- }
-
- force_sig(SIGSEGV, me);
-}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8f..7e558db362c1 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask)
send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_local_APIC();
- if (hlt_works(smp_processor_id()))
- for (;;) halt();
- for (;;);
-}
-
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
@@ -178,11 +165,7 @@ static void native_smp_send_stop(void)
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -190,11 +173,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
@@ -203,11 +182,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8c3aca7cb343..f8500c969442 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
#include <asm/mtrr.h>
#include <asm/vmi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
#include <linux/mc146818rtc.h>
#include <mach_apic.h>
@@ -282,19 +283,19 @@ static void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static int __cpuinitdata unsafe_smp;
+
/*
* Activate a secondary processor.
*/
-static void __cpuinit start_secondary(void *unused)
+notrace static void __cpuinit start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
* fragile that we want to limit the things done here to the
* most necessary things.
*/
-#ifdef CONFIG_VMI
vmi_bringup();
-#endif
cpu_init();
preempt_disable();
smp_callin();
@@ -397,7 +398,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
goto valid_k7;
/* If we get here, not a certified SMP capable AMD system. */
- add_taint(TAINT_UNSAFE_SMP);
+ unsafe_smp = 1;
}
valid_k7:
@@ -414,12 +415,10 @@ static void __cpuinit smp_checks(void)
* Don't taint if we are running SMP kernel on a single non-MP
* approved Athlon
*/
- if (tainted & TAINT_UNSAFE_SMP) {
- if (num_online_cpus())
- printk(KERN_INFO "WARNING: This combination of AMD"
- "processors is not suitable for SMP.\n");
- else
- tainted &= ~TAINT_UNSAFE_SMP;
+ if (unsafe_smp && num_online_cpus() > 1) {
+ printk(KERN_INFO "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ add_taint(TAINT_UNSAFE_SMP);
}
}
@@ -536,17 +535,17 @@ static void impress_friends(void)
pr_debug("Before bogocount - setting activated=1.\n");
}
-static inline void __inquire_remote_apic(int apicid)
+void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
int timeout;
u32 status;
- printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+ printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
+ printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
/*
* Wait for idle.
@@ -575,14 +574,13 @@ static inline void __inquire_remote_apic(int apicid)
}
}
-#ifdef WAKE_SECONDARY_VIA_NMI
/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt;
@@ -599,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -614,11 +612,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_NMI */
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
@@ -737,7 +733,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_INIT */
struct create_idle {
struct work_struct work;
@@ -874,7 +869,7 @@ do_rest:
start_ip = setup_trampoline();
/* So we see what's up */
- printk(KERN_INFO "Booting processor %d/%d ip %lx\n",
+ printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
cpu, apicid, start_ip);
/*
@@ -893,9 +888,11 @@ do_rest:
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
- */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
+ */
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
}
/*
@@ -1084,8 +1081,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
#endif
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
- "by the BIOS.\n", hard_smp_processor_id());
+ printk(KERN_WARNING
+ "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ hard_smp_processor_id());
+
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
#include <asm/stacktrace.h>
static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp;
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] =
+ frame.ret_addr;
+ }
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+ }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 3d1be4f0fac5..de87d6008295 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_UNISTD_64_H
#include <asm/unistd_64.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
-#undef ASM_X86__UNISTD_64_H
+#undef _ASM_X86_UNISTD_64_H
typedef void (*sys_call_ptr_t)(void);
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea2..65309e4cb1c0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
/* Keep nmi watchdog up to date */
- per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+ inc_irq_stat(irq0_irqs);
#ifdef CONFIG_X86_IO_APIC
if (timer_ack) {
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c216..891e7a7c4334 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
}
EXPORT_SYMBOL(profile_pc);
-irqreturn_t timer_interrupt(int irq, void *dev_id)
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- add_pda(irq0_irqs, 1);
+ inc_irq_stat(irq0_irqs);
global_clock_event->event_handler(global_clock_event);
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
break;
no_ctr_free = (i == 4);
if (no_ctr_free) {
+ WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
+ "cpu_khz value may be incorrect.\n");
i = 3;
rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index e00534b33534..8da059f949be 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
*/
void leave_mm(int cpu)
{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
+ cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
* BUG();
*/
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
- __get_cpu_var(irq_stat).irq_tlb_count++;
+ inc_irq_stat(irq_tlb_count);
}
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -154,6 +153,12 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
flush_mm = mm;
flush_va = va;
cpus_or(flush_cpumask, cpumask, flush_cpumask);
+
+ /*
+ * Make the above memory operations globally visible before
+ * sending the IPI.
+ */
+ smp_mb();
/*
* We have to send the IPI only to
* CPUs affected.
@@ -232,7 +237,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index dcbf7a1159ea..29887d7081a9 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
out:
ack_APIC_irq();
cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
+ inc_irq_stat(irq_tlb_count);
}
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -183,6 +183,11 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
/*
+ * Make the above memory operations globally visible before
+ * sending the IPI.
+ */
+ smp_mb();
+ /*
* We have to send the IPI only to
* CPUs affected.
*/
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8b8c0d6640fa..6a00e5faaa74 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
* This code is released under the GNU General Public License version 2 or
* later.
*/
-#include <linux/mc146818rtc.h>
+#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
@@ -566,14 +566,10 @@ static int __init uv_ptc_init(void)
if (!is_uv_system())
return 0;
- if (!proc_mkdir("sgi_uv", NULL))
- return -EINVAL;
-
proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
if (!proc_uv_ptc) {
printk(KERN_ERR "unable to create %s proc entry\n",
UV_PTC_BASENAME);
- remove_proc_entry("sgi_uv", NULL);
return -EINVAL;
}
proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024d..808031a5ba19 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
#include <linux/io.h>
#include <asm/trampoline.h>
+#include <asm/e820.h>
/* ready for x86_64 and x86 */
unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
+void __init reserve_trampoline_memory(void)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
+#endif
+ /* Has to be in very low memory so we can execute real-mode AP code. */
+ reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
+ "TRAMPOLINE");
+}
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
*/
unsigned long setup_trampoline(void)
{
- memcpy(trampoline_base, trampoline_data,
- trampoline_end - trampoline_data);
+ memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e062974cce34..141907ab6e22 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code)
{
nmi_enter();
-#ifdef CONFIG_X86_32
- { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
-#else
- add_pda(__nmi_count, 1);
-#endif
+ inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
@@ -664,7 +660,7 @@ void math_error(void __user *ip)
{
struct task_struct *task;
siginfo_t info;
- unsigned short cwd, swd;
+ unsigned short cwd, swd, err;
/*
* Save the info for the exception handler and clear the error.
@@ -675,7 +671,6 @@ void math_error(void __user *ip)
task->thread.error_code = 0;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
info.si_addr = ip;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +684,31 @@ void math_error(void __user *ip)
*/
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
+
+ err = swd & ~cwd & 0x3f;
+
#ifdef CONFIG_X86_32
+ if (!err)
return;
#endif
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
+
+ if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
+ } else if (err & 0x004) { /* Divide by Zero */
info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
+ } else if (err & 0x008) { /* Overflow */
info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ info.si_code = FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
- break;
+ } else {
+ info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
}
force_sig_info(SIGFPE, &info, task);
}
@@ -931,14 +923,6 @@ do_device_not_available(struct pt_regs *regs, long error)
}
#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_MCE
-dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error)
-{
- conditional_sti(regs);
- machine_check_vector(regs, error);
-}
-#endif
-
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 161bb850fc47..599e58168631 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
+#include <asm/hypervisor.h>
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int tsc_disabled = -1;
+static int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -55,7 +57,7 @@ u64 native_sched_clock(void)
rdtscll(this_offset);
/* return the value in ns */
- return cycles_2_ns(this_offset);
+ return __cycles_2_ns(this_offset);
}
/* We need to define a real function for sched_clock, to override the
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int __init tsc_setup(char *str)
+{
+ if (!strcmp(str, "reliable"))
+ tsc_clocksource_reliable = 1;
+ return 1;
+}
+
+__setup("tsc=", tsc_setup);
+
#define MAX_RETRIES 5
#define SMI_TRESHOLD 50000
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
+ unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
+ tsc_khz = get_hypervisor_tsc_freq();
+ if (tsc_khz) {
+ printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
+ return tsc_khz;
+ }
+
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
{}
};
-/*
- * Geode_LX - the OLPC CPU has a possibly a very reliable TSC
- */
+static void __init check_system_tsc_reliable(void)
+{
#ifdef CONFIG_MGEODE_LX
-/* RTSC counts during suspend */
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
-
-static void __init check_geode_tsc_reliable(void)
-{
unsigned long res_low, res_high;
rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
if (res_low & RTSC_SUSP)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-}
-#else
-static inline void check_geode_tsc_reliable(void) { }
+ tsc_clocksource_reliable = 1;
#endif
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ tsc_clocksource_reliable = 1;
+}
/*
* Make an educated guess if the TSC is trustworthy and synchronized
@@ -759,7 +773,7 @@ __cpuinit int unsynchronized_tsc(void)
if (!cpu_has_tsc || tsc_unstable)
return 1;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_SMP
if (apic_is_clustered_box())
return 1;
#endif
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
{
clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
clocksource_tsc.shift);
+ if (tsc_clocksource_reliable)
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
@@ -813,10 +829,6 @@ void __init tsc_init(void)
cpu_khz = calibrate_cpu();
#endif
- lpj = ((u64)tsc_khz * 1000);
- do_div(lpj, HZ);
- lpj_fine = lpj;
-
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
@@ -836,6 +848,10 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
+ lpj = ((u64)tsc_khz * 1000);
+ do_div(lpj, HZ);
+ lpj_fine = lpj;
+
use_tsc_delay();
/* Check and install the TSC clocksource */
dmi_check_system(bad_tsc_dmi_table);
@@ -843,7 +859,7 @@ void __init tsc_init(void)
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
- check_geode_tsc_reliable();
+ check_system_tsc_reliable();
init_tsc_clocksource();
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9ffb01c31c40..bf36328f6ef9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,9 @@ static __cpuinit void check_tsc_warp(void)
cycles_t start, now, prev, end;
int i;
+ rdtsc_barrier();
start = get_cycles();
+ rdtsc_barrier();
/*
* The measurement runs for 20 msecs:
*/
@@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(void)
*/
__raw_spin_lock(&sync_lock);
prev = last_tsc;
+ rdtsc_barrier();
now = get_cycles();
+ rdtsc_barrier();
last_tsc = now;
__raw_spin_unlock(&sync_lock);
@@ -108,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (unsynchronized_tsc())
return;
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ printk(KERN_INFO
+ "Skipping synchronization checks as TSC is reliable.\n");
+ return;
+ }
+
printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
smp_processor_id(), cpu);
@@ -161,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
{
int cpus = 2;
- if (unsynchronized_tsc())
+ if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
return;
/*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 000000000000..aeef529917e4
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ functions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/uv/uv_irq.h>
+
+static void uv_noop(unsigned int irq)
+{
+}
+
+static unsigned int uv_noop_ret(unsigned int irq)
+{
+ return 0;
+}
+
+static void uv_ack_apic(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+struct irq_chip uv_irq_chip = {
+ .name = "UV-CORE",
+ .startup = uv_noop_ret,
+ .shutdown = uv_noop,
+ .enable = uv_noop,
+ .disable = uv_noop,
+ .ack = uv_noop,
+ .mask = uv_noop,
+ .unmask = uv_noop,
+ .eoi = uv_ack_apic,
+ .end = uv_noop,
+};
+
+/*
+ * Set up a mapping of an available irq and vector, and enable the specified
+ * MMR that defines the MSI that is to be sent to the specified CPU when an
+ * interrupt is raised.
+ */
+int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
+ unsigned long mmr_offset)
+{
+ int irq;
+ int ret;
+
+ irq = create_irq();
+ if (irq <= 0)
+ return -EBUSY;
+
+ ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
+ if (ret != irq)
+ destroy_irq(irq);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_setup_irq);
+
+/*
+ * Tear down a mapping of an irq and vector, and disable the specified MMR that
+ * defined the MSI that was to be sent to the specified CPU when an interrupt
+ * was raised.
+ *
+ * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
+ */
+void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+{
+ arch_disable_uv_irq(mmr_blade, mmr_offset);
+ destroy_irq(irq);
+}
+EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 000000000000..67f9b9dbf800
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
+/*
+ * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/sysdev.h>
+#include <asm/uv/bios.h>
+
+struct kobject *sgi_uv_kobj;
+
+static ssize_t partition_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
+}
+
+static ssize_t coherence_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
+}
+
+static struct kobj_attribute partition_id_attr =
+ __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
+
+static struct kobj_attribute coherence_id_attr =
+ __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
+
+
+static int __init sgi_uv_sysfs_init(void)
+{
+ unsigned long ret;
+
+ if (!sgi_uv_kobj)
+ sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
+ if (!sgi_uv_kobj) {
+ printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
+ return -EINVAL;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
+ return ret;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
+ return ret;
+ }
+
+ return 0;
+}
+
+device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 61a97e616f70..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
static unsigned int startup_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+ if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+ desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
return 0;
@@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
static void end_cobalt_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_desc *desc = irq_to_desc(irq);
spin_lock_irqsave(&cobalt_lock, flags);
- if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+ if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
enable_cobalt_irq(irq);
spin_unlock_irqrestore(&cobalt_lock, flags);
}
@@ -626,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
spin_unlock_irqrestore(&i8259A_lock, flags);
- desc = irq_desc + realirq;
+ desc = irq_to_desc(realirq);
/*
* handle this 'virtual interrupt' as a Cobalt one now.
*/
- kstat_cpu(smp_processor_id()).irqs[realirq]++;
+ kstat_incr_irqs_this_cpu(realirq, desc);
if (likely(desc->action != NULL))
handle_IRQ_event(realirq, desc->action);
@@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
int i;
for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = 0;
- irq_desc[i].depth = 1;
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = 0;
+ desc->depth = 1;
if (i == 0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE0) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_IDE1) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
else if (i == CO_IRQ_8259) {
- irq_desc[i].chip = &piix4_master_irq_type;
+ desc->chip = &piix4_master_irq_type;
}
else if (i < CO_IRQ_APIC0) {
- irq_desc[i].chip = &piix4_virtual_irq_type;
+ desc->chip = &piix4_virtual_irq_type;
}
else if (IS_CO_APIC(i)) {
- irq_desc[i].chip = &cobalt_irq_type;
+ desc->chip = &cobalt_irq_type;
}
}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8b6c393ab9fd..23206ba16874 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
{
}
-#ifdef CONFIG_DEBUG_PAGE_TYPE
-
-#ifdef CONFIG_X86_PAE
-#define MAX_BOOT_PTS (2048+4+1)
-#else
-#define MAX_BOOT_PTS (1024+1)
-#endif
-
-/*
- * During boot, mem_map is not yet available in paging_init, so stash
- * all the boot page allocations here.
- */
-static struct {
- u32 pfn;
- int type;
-} boot_page_allocations[MAX_BOOT_PTS];
-static int num_boot_page_allocations;
-static int boot_allocations_applied;
-
-void vmi_apply_boot_page_allocations(void)
-{
- int i;
- BUG_ON(!mem_map);
- for (i = 0; i < num_boot_page_allocations; i++) {
- struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
- page->type = boot_page_allocations[i].type;
- page->type = boot_page_allocations[i].type &
- ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- }
- boot_allocations_applied = 1;
-}
-
-static void record_page_type(u32 pfn, int type)
-{
- BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
- boot_page_allocations[num_boot_page_allocations].pfn = pfn;
- boot_page_allocations[num_boot_page_allocations].type = type;
- num_boot_page_allocations++;
-}
-
-static void check_zeroed_page(u32 pfn, int type, struct page *page)
-{
- u32 *ptr;
- int i;
- int limit = PAGE_SIZE / sizeof(int);
-
- if (page_address(page))
- ptr = (u32 *)page_address(page);
- else
- ptr = (u32 *)__va(pfn << PAGE_SHIFT);
- /*
- * When cloning the root in non-PAE mode, only the userspace
- * pdes need to be zeroed.
- */
- if (type & VMI_PAGE_CLONE)
- limit = KERNEL_PGD_BOUNDARY;
- for (i = 0; i < limit; i++)
- BUG_ON(ptr[i]);
-}
-
-/*
- * We stash the page type into struct page so we can verify the page
- * types are used properly.
- */
-static void vmi_set_page_type(u32 pfn, int type)
-{
- /* PAE can have multiple roots per page - don't track */
- if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
- return;
-
- if (boot_allocations_applied) {
- struct page *page = pfn_to_page(pfn);
- if (type != VMI_PAGE_NORMAL)
- BUG_ON(page->type);
- else
- BUG_ON(page->type == VMI_PAGE_NORMAL);
- page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- if (type & VMI_PAGE_ZEROED)
- check_zeroed_page(pfn, type, page);
- } else {
- record_page_type(pfn, type);
- }
-}
-
-static void vmi_check_page_type(u32 pfn, int type)
-{
- /* PAE can have multiple roots per page - skip checks */
- if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
- return;
-
- type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
- if (boot_allocations_applied) {
- struct page *page = pfn_to_page(pfn);
- BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
- BUG_ON(type == VMI_PAGE_NORMAL && page->type);
- BUG_ON((type & page->type) == 0);
- }
-}
-#else
-#define vmi_set_page_type(p,t) do { } while (0)
-#define vmi_check_page_type(p,t) do { } while (0)
-#endif
-
#ifdef CONFIG_HIGHPTE
static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
- vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
@@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
* It is called only for swapper_pg_dir, which already has
* data on it.
*/
- vmi_set_page_type(pfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
- vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
- vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
- vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
- vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
/*
@@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn)
static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_set_pte(pte_t *ptep, pte_t pte)
{
/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
}
static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
@@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
#ifdef CONFIG_X86_PAE
const pte_t pte = { .pte = pmdval.pmd };
- vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
#else
const pte_t pte = { pmdval.pud.pgd.pgd };
- vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
#endif
vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
}
@@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
}
@@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
}
static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
const pte_t pte = { .pte = 0 };
- vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
static void vmi_pmd_clear(pmd_t *pmd)
{
const pte_t pte = { .pte = 0 };
- vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
}
#endif
@@ -960,8 +841,6 @@ static inline int __init activate_vmi(void)
void __init vmi_init(void)
{
- unsigned long flags;
-
if (!vmi_rom)
probe_vmi_rom();
else
@@ -973,13 +852,21 @@ void __init vmi_init(void)
reserve_top_address(-vmi_rom->virtual_top);
- local_irq_save(flags);
- activate_vmi();
-
#ifdef CONFIG_X86_IO_APIC
/* This is virtual hardware; timer routing is wired correctly */
no_timer_check = 1;
#endif
+}
+
+void vmi_activate(void)
+{
+ unsigned long flags;
+
+ if (!vmi_rom)
+ return;
+
+ local_irq_save(flags);
+ activate_vmi();
local_irq_restore(flags & X86_EFLAGS_IF);
}
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe289..254ee07f8635 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
void __init vmi_time_init(void)
{
+ unsigned int cpu;
/* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
vmi_time_init_clockevent();
setup_irq(0, &vmi_clock_action);
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
+ IRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
_etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
+ IRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
_etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 7766d36983fc..a688f3bfaec2 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -78,7 +78,7 @@ static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
static void __init set_vsmp_pv_ops(void)
{
- void *address;
+ void __iomem *address;
unsigned int cap, ctl, cfg;
/* set vSMP magic bits to indicate vSMP capable kernel */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -128,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
gettimeofday(tv,NULL);
return;
}
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
now = vread();
+ rdtsc_barrier();
+
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b545f371b5f5..695e426aa354 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -12,7 +12,7 @@
#include <asm/desc.h>
#include <asm/ftrace.h>
-#ifdef CONFIG_FTRACE
+#ifdef CONFIG_FUNCTION_TRACER
/* mcount is defined in assembly */
EXPORT_SYMBOL(mcount);
#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9abac8a9d823..15c3e6999182 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -248,7 +248,7 @@ clear:
* This will be saved when ever the FP and extended state context is
* saved on the user stack during the signal handler delivery to the user.
*/
-void prepare_fx_sw_frame(void)
+static void prepare_fx_sw_frame(void)
{
int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
FP_XSTATE_MAGIC2_SIZE;
@@ -310,7 +310,7 @@ static void __init setup_xstate_init(void)
/*
* Enable and initialize the xsave feature.
*/
-void __init xsave_cntxt_init(void)
+void __ref xsave_cntxt_init(void)
{
unsigned int eax, ebx, ecx, edx;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ce3251ce5504..b81125f0bdee 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -20,6 +20,8 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ # for device assignment:
+ depends on PCI
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select ANON_INODES
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f40..c02343594b4d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
#
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o)
+ coalesced_mmio.o irq_comm.o)
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a9124..59ebd37ad79e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
- }
- pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
- pt->scheduled = ktime_to_ns(pt->timer.expires);
+ hrtimer_add_expires_ns(&pt->timer, pt->period);
+ pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
+ if (pt->period)
+ ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
return (pt->period == 0 ? 0 : 1);
}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+ if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
-
return 0;
}
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ spin_lock(&ps->inject_lock);
+ if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ atomic_inc(&ps->pit_timer.pending);
+ ps->irq_ack = 1;
+ spin_unlock(&ps->inject_lock);
+}
+
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
@@ -246,7 +257,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
timer = &pit->pit_state.pit_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_kpit_timer *pt)
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
hrtimer_cancel(&pt->timer);
}
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
pt->period = (is_period == 0) ? 0 : interval;
pt->timer.function = pit_timer_fn;
atomic_set(&pt->pending, 0);
+ ps->irq_ack = 1;
hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(&ps->pit_timer, val, 0);
+ create_pit_timer(ps, val, 0);
break;
case 2:
case 3:
- create_pit_timer(&ps->pit_timer, val, 1);
+ create_pit_timer(ps, val, 1);
break;
default:
destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
mutex_unlock(&pit->pit_state.lock);
atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.inject_pending = 1;
+ pit->pit_state.irq_ack = 1;
}
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -532,8 +545,17 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
if (!pit)
return NULL;
+ mutex_lock(&kvm->lock);
+ pit->irq_source_id = kvm_request_irq_source_id(kvm);
+ mutex_unlock(&kvm->lock);
+ if (pit->irq_source_id < 0) {
+ kfree(pit);
+ return NULL;
+ }
+
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
/* Initialize PIO device */
pit->dev.read = pit_ioport_read;
@@ -555,6 +577,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit_state->pit = pit;
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
@@ -570,6 +595,7 @@ void kvm_free_pit(struct kvm *kvm)
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
+ kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
kfree(kvm->arch.vpit);
}
@@ -578,10 +604,8 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
mutex_lock(&kvm->lock);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
- kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
- kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
mutex_unlock(&kvm->lock);
}
@@ -592,37 +616,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm_kpit_state *ps;
if (vcpu && pit) {
+ int inject = 0;
ps = &pit->pit_state;
- /* Try to inject pending interrupts when:
- * 1. Pending exists
- * 2. Last interrupt was accepted or waited for too long time*/
- if (atomic_read(&ps->pit_timer.pending) &&
- (ps->inject_pending ||
- (jiffies - ps->last_injected_time
- >= KVM_MAX_PIT_INTR_INTERVAL))) {
- ps->inject_pending = 0;
- __inject_pit_timer_intr(kvm);
- ps->last_injected_time = jiffies;
- }
- }
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
- struct kvm_arch *arch = &vcpu->kvm->arch;
- struct kvm_kpit_state *ps;
-
- if (vcpu && arch->vpit) {
- ps = &arch->vpit->pit_state;
- if (atomic_read(&ps->pit_timer.pending) &&
- (((arch->vpic->pics[0].imr & 1) == 0 &&
- arch->vpic->pics[0].irq_base == vec) ||
- (arch->vioapic->redirtbl[0].fields.vector == vec &&
- arch->vioapic->redirtbl[0].fields.mask != 1))) {
- ps->inject_pending = 1;
- atomic_dec(&ps->pit_timer.pending);
- ps->channels[0].count_load_time = ktime_get();
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
}
+ spin_unlock(&ps->inject_lock);
+ if (inject)
+ __inject_pit_timer_intr(kvm);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c4..4178022b97aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
int irq;
s64 period; /* unit: ns */
s64 scheduled;
- ktime_t last_update;
atomic_t pending;
};
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- bool inject_pending; /* if inject pending interrupts */
- unsigned long last_injected_time;
+ spinlock_t inject_lock;
+ unsigned long irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
@@ -44,6 +44,7 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
+ int irq_source_id;
};
#define KVM_PIT_BASE_ADDRESS 0x40
@@ -54,7 +55,6 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
struct kvm_pit *kvm_create_pit(struct kvm *kvm);
void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa46..17e41e165f1a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
#include <linux/kvm_host.h>
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+ struct kvm_pic *s = pic_irqchip(kvm);
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
+}
+
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
*/
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
+ s->isr |= 1 << irq;
if (s->auto_eoi) {
if (s->rotate_on_auto_eoi)
s->priority_add = (irq + 1) & 7;
- } else
- s->isr |= (1 << irq);
+ pic_clear_isr(s, irq);
+ }
/*
* We don't clear a level sensitive interrupt here
*/
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
s->irr &= ~(1 << irq);
}
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
+ struct kvm_pic *s = pic_irqchip(kvm);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
+ kvm_notify_acked_irq(kvm, irq);
return intno;
}
void kvm_pic_reset(struct kvm_kpic_state *s)
{
+ int irq, irqbase;
+ struct kvm *kvm = s->pics_state->irq_request_opaque;
+ struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+
+ if (s == &s->pics_state->pics[0])
+ irqbase = 0;
+ else
+ irqbase = 8;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+ if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+ if (s->irr & (1 << irq) || s->isr & (1 << irq))
+ kvm_notify_acked_irq(kvm, irq+irqbase);
+ }
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
s->isr = 0;
+ s->isr_ack = 0xff;
s->priority_add = 0;
s->irq_base = 0;
s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 3:
irq = val & 7;
- s->isr &= ~(1 << irq);
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 7:
irq = val & 7;
- s->isr &= ~(1 << irq);
s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
s->pics_state->pics[0].irr &= ~(1 << 2);
}
s->irr &= ~(1 << ret);
- s->isr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
} else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->vcpus[0];
+ struct kvm_pic *s = pic_irqchip(kvm);
+ int irq = pic_get_irq(&s->pics[0]);
- pic_irqchip(kvm)->output = level;
- if (vcpu)
+ s->output = level;
+ if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+ s->pics[0].isr_ack &= ~(1 << irq);
kvm_vcpu_kick(vcpu);
+ }
}
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f664..c019b8edcdb7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm);
s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(s);
+ vector = kvm_pic_read_irq(v->kvm);
}
}
return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
- kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48bb..f17c8f5bbf31 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
+ u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
+ void (*ack_notifier)(void *opaque, int irq);
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000000..1ff819dce7d3
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f67..0fc3cab48943 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
+#include "kvm_cache_regs.h"
#include "irq.h"
#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- kvm_vcpu_kick(vcpu);
- else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
- }
+ kvm_vcpu_kick(vcpu);
result = (orig_irr == 0);
break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
- printk(KERN_DEBUG
- "Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
}
-
break;
case APIC_DM_STARTUP:
- printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- if (waitqueue_active(&vcpu->wq))
- wake_up_interruptible(&vcpu->wq);
+ kvm_vcpu_kick(vcpu);
}
break;
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
-
+ int trigger_mode;
/*
* Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+ kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
- kvm_x86_ops->cache_regs(vcpu);
- run->tpr_access.rip = vcpu->arch.rip;
+ run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
* Refer SDM 8.4.1
*/
if (len != 4 || alignment) {
- if (printk_ratelimit())
- printk(KERN_ERR "apic write: bad size=%d %lx\n",
- len, (long)address);
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n",
+ len, (long)address);
return;
}
@@ -947,15 +941,12 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
- if (waitqueue_active(q)) {
- apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ if (waitqueue_active(q))
wake_up_interruptible(q);
- }
+
if (apic_lvtt_period(apic)) {
result = 1;
- apic->timer.dev.expires = ktime_add_ns(
- apic->timer.dev.expires,
- apic->timer.period);
+ hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
}
return result;
}
@@ -1124,7 +1115,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
timer = &apic->timer.dev;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22a..410ddbc1aa2e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
+struct kvm_shadow_walk {
+ int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+ u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+ int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
@@ -305,7 +314,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 1);
+ rmap_desc_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr;
+ int ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return 0;
+ return ret;
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
if (vma && is_vm_hugetlb_page(vma))
- return 1;
+ ret = 1;
+ up_read(&current->mm->mmap_sem);
- return 0;
+ return ret;
}
static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected)
kvm_flush_remote_tlbs(kvm);
-
- account_shadowed(kvm, gfn);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ __set_bit(index, sp->unsync_child_bitmap);
+ sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
sp->spt[i] = shadow_trap_nonpresent_pte;
}
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx) \
+ for (idx = find_first_bit(bitmap, 512); \
+ idx < 512; \
+ idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_unsync_walk *walker)
+{
+ int i, ret;
+
+ if (!sp->unsync_children)
+ return 0;
+
+ for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ ret = mmu_unsync_walk(child, walker);
+ if (ret)
+ return ret;
+ __clear_bit(i, sp->unsync_child_bitmap);
+ }
+
+ if (child->unsync) {
+ ret = walker->entry(child, walker);
+ __clear_bit(i, sp->unsync_child_bitmap);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ sp->unsync_children = 0;
+
+ return 0;
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ rmap_write_protect(vcpu->kvm, sp->gfn);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ return 0;
+}
+
+struct sync_walker {
+ struct kvm_vcpu *vcpu;
+ struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+ walker);
+ struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+ kvm_sync_page(vcpu, sp);
+ return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ struct sync_walker walker = {
+ .walker = { .entry = mmu_sync_fn, },
+ .vcpu = vcpu,
+ };
+
+ while (mmu_unsync_walk(sp, &walker.walker))
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
+ struct hlist_node *node, *tmp;
role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && sp->role.word == role.word) {
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+
+ if (sp->role.word != role.word)
+ continue;
+
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
pgprintk("%s: found\n", __func__);
return sp;
}
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
- if (!metaphysical)
+ if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn);
+ account_shadowed(vcpu->kvm, gfn);
+ }
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}
+static int walk_shadow(struct kvm_shadow_walk *walker,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ hpa_t shadow_addr;
+ int level;
+ int r;
+ u64 *sptep;
+ unsigned index;
+
+ shadow_addr = vcpu->arch.mmu.root_hpa;
+ level = vcpu->arch.mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr &= PT64_BASE_ADDR_MASK;
+ --level;
+ }
+
+ while (level >= PT_PAGE_TABLE_LEVEL) {
+ index = SHADOW_PT_INDEX(addr, level);
+ sptep = ((u64 *)__va(shadow_addr)) + index;
+ r = walker->entry(walker, vcpu, addr, sptep, level);
+ if (r)
+ return r;
+ shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+ --level;
+ }
+ return 0;
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
rmap_remove(kvm, &pt[i]);
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
return;
}
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
}
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
kvm->vcpus[i]->arch.last_pte_updated = NULL;
}
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
- ++kvm->stat.mmu_shadow_zapped;
while (sp->multimapped || sp->parent_pte) {
if (!sp->multimapped)
parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_put_page(sp, parent_pte);
set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
}
+}
+
+struct zap_walker {
+ struct kvm_unsync_walk walker;
+ struct kvm *kvm;
+ int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+ walker);
+ kvm_mmu_zap_page(zap_walk->kvm, sp);
+ zap_walk->zapped = 1;
+ return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct zap_walker walker = {
+ .walker = { .entry = mmu_zap_fn, },
+ .kvm = kvm,
+ .zapped = 0,
+ };
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+ mmu_unsync_walk(sp, &walker.walker);
+ return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int ret;
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+ kvm_flush_remote_tlbs(kvm);
+ if (!sp->role.invalid && !sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- if (!sp->role.metaphysical && !sp->role.invalid)
- unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
} else {
- int invalid = sp->role.invalid;
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
sp->role.invalid = 1;
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
- if (!sp->role.metaphysical && !invalid)
- unaccount_shadowed(kvm, sp->gfn);
}
kvm_mmu_reset_last_pte_updated(kvm);
+ return ret;
}
/*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
r = 1;
+ if (kvm_mmu_zap_page(kvm, sp))
+ n = bucket->first;
}
return r;
}
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, &sp->slot_bitmap);
}
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+ int i;
+ u64 *pt = sp->spt;
+
+ if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] == shadow_notrap_nonpresent_pte)
+ set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ }
+}
+
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return NULL;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
return page;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- u64 spte;
- int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node, *n;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
- write_fault, user_fault, gfn);
+ index = kvm_page_table_hashfn(sp->gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ /* don't unsync if pagetable is shadowed with multiple roles */
+ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+ if (s->gfn != sp->gfn || s->role.metaphysical)
+ continue;
+ if (s->role.word != sp->role.word)
+ return 1;
+ }
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+ mmu_convert_notrap(sp);
+ return 0;
+}
- if (is_rmap_pte(*shadow_pte)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (largepage && !is_large_pte(*shadow_pte)) {
- struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *shadow;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
- } else {
- if (largepage)
- was_rmapped = is_large_pte(*shadow_pte);
- else
- was_rmapped = 1;
- }
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (shadow->unsync)
+ return 0;
+ if (can_unsync && oos_shadow)
+ return kvm_unsync_page(vcpu, shadow);
+ return 1;
}
+ return 0;
+}
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pte_access, int user_fault,
+ int write_fault, int dirty, int largepage,
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool can_unsync)
+{
+ u64 spte;
+ int ret = 0;
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
*/
spte = shadow_base_present_pte | shadow_dirty_mask;
if (!speculative)
- pte_access |= PT_ACCESSED_MASK;
+ spte |= shadow_accessed_mask;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
+
+ if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ ret = 1;
+ spte = shadow_trap_nonpresent_pte;
+ goto set_pte;
+ }
spte |= PT_WRITABLE_MASK;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow ||
- (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
+ ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte)) {
+ if (is_writeble_pte(spte))
spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
- if (write_fault)
- *ptwrite = 1;
}
}
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __func__, spte);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
- (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
set_shadow_pte(shadow_pte, spte);
- if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
- && (spte & PT_PRESENT_MASK))
+ return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
+{
+ int was_rmapped = 0;
+ int was_writeble = is_writeble_pte(*shadow_pte);
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %lx\n",
+ __func__, *shadow_pte, pt_access,
+ write_fault, user_fault, gfn);
+
+ if (is_rmap_pte(*shadow_pte)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ pgprintk("hfn old %lx new %lx\n",
+ spte_to_pfn(*shadow_pte), pfn);
+ rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
+ }
+ }
+ if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+ dirty, largepage, gfn, pfn, speculative, true)) {
+ if (write_fault)
+ *ptwrite = 1;
+ kvm_x86_ops->tlb_flush(vcpu);
+ }
+
+ pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ is_large_pte(*shadow_pte)? "2MB" : "4kB",
+ is_present_pte(*shadow_pte)?"RW":"R", gfn,
+ *shadow_pte, shadow_pte);
+ if (!was_rmapped && is_large_pte(*shadow_pte))
++vcpu->kvm->stat.lpages;
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn,
- int level)
-{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
- int pt_write = 0;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(v, level);
- u64 *table;
-
- ASSERT(VALID_PAGE(table_addr));
- table = __va(table_addr);
+struct direct_shadow_walk {
+ struct kvm_shadow_walk walker;
+ pfn_t pfn;
+ int write;
+ int largepage;
+ int pt_write;
+};
- if (level == 1) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 0, gfn, pfn, false);
- return pt_write;
- }
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+ struct kvm_vcpu *vcpu,
+ u64 addr, u64 *sptep, int level)
+{
+ struct direct_shadow_walk *walk =
+ container_of(_walk, struct direct_shadow_walk, walker);
+ struct kvm_mmu_page *sp;
+ gfn_t pseudo_gfn;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+ 0, walk->write, 1, &walk->pt_write,
+ walk->largepage, gfn, walk->pfn, false);
+ ++vcpu->stat.pf_fixed;
+ return 1;
+ }
- if (largepage && level == 2) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 1, gfn, pfn, false);
- return pt_write;
+ if (*sptep == shadow_trap_nonpresent_pte) {
+ pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+ 1, ACC_ALL, sptep);
+ if (!sp) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_pfn_clean(walk->pfn);
+ return -ENOMEM;
}
- if (table[index] == shadow_trap_nonpresent_pte) {
- struct kvm_mmu_page *new_table;
- gfn_t pseudo_gfn;
-
- pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
- v, level - 1,
- 1, ACC_ALL, &table[index]);
- if (!new_table) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- set_shadow_pte(&table[index],
- __pa(new_table->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- table_addr = table[index] & PT64_BASE_ADDR_MASK;
+ set_shadow_pte(sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
+ return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn)
+{
+ int r;
+ struct direct_shadow_walk walker = {
+ .walker = { .entry = direct_map_entry, },
+ .pfn = pfn,
+ .largepage = largepage,
+ .write = write,
+ .pt_write = 0,
+ };
+
+ r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+ if (r < 0)
+ return r;
+ return walker.pt_write;
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn_t pfn;
unsigned long mmu_seq;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
- PT32E_ROOT_LEVEL);
+ r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+ largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
+ mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- down_read(&current->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
vcpu->arch.update_pte.largepage = 1;
}
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.metaphysical)
+ if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
continue;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (kvm_mmu_zap_page(vcpu->kvm, sp))
+ n = bucket->first;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1969,6 +2350,16 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_flush_tlb(vcpu);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK;
}
+ kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ node = container_of(kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
@@ -2238,6 +2634,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->tlb_flush(vcpu);
+ set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
return 1;
}
@@ -2291,18 +2688,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret)
{
int r;
- struct kvm_pv_mmu_op_buffer buffer;
+ struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
- buffer.ptr = buffer.buf;
- buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
- buffer.processed = 0;
+ buffer->ptr = buffer->buf;
+ buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+ buffer->processed = 0;
- r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
if (r)
goto out;
- while (buffer.len) {
- r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ while (buffer->len) {
+ r = kvm_pv_mmu_op_one(vcpu, buffer);
if (r < 0)
goto out;
if (r == 0)
@@ -2311,7 +2708,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
r = 1;
out:
- *ret = buffer.processed;
+ *ret = buffer->processed;
return r;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f2..84eee43bbe74 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
+ #define shadow_walker shadow_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
+ #define shadow_walker shadow_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
u32 error_code;
};
+struct shadow_walker {
+ struct kvm_shadow_walk walker;
+ struct guest_walker *guest_walker;
+ int user_fault;
+ int write_fault;
+ int largepage;
+ int *ptwrite;
+ pfn_t pfn;
+ u64 *sptep;
+};
+
static gfn_t gpte_to_gfn(pt_element_t gpte)
{
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
- up_read(&current->mm->mmap_sem);
table = kmap_atomic(page, KM_USER0);
-
ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
kunmap_atomic(table, KM_USER0);
kvm_release_page_dirty(page);
@@ -274,86 +281,90 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *walker,
- int user_fault, int write_fault, int largepage,
- int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+ struct kvm_vcpu *vcpu, u64 addr,
+ u64 *sptep, int level)
{
- hpa_t shadow_addr;
- int level;
- u64 *shadow_ent;
- unsigned access = walker->pt_access;
-
- if (!is_present_pte(walker->ptes[walker->level - 1]))
- return NULL;
-
- shadow_addr = vcpu->arch.mmu.root_hpa;
- level = vcpu->arch.mmu.shadow_root_level;
- if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- shadow_addr &= PT64_BASE_ADDR_MASK;
- --level;
+ struct shadow_walker *sw =
+ container_of(_sw, struct shadow_walker, walker);
+ struct guest_walker *gw = sw->guest_walker;
+ unsigned access = gw->pt_access;
+ struct kvm_mmu_page *shadow_page;
+ u64 spte;
+ int metaphysical;
+ gfn_t table_gfn;
+ int r;
+ pt_element_t curr_pte;
+
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+ sw->user_fault, sw->write_fault,
+ gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+ sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+ false);
+ sw->sptep = sptep;
+ return 1;
}
- for (; ; level--) {
- u32 index = SHADOW_PT_INDEX(addr, level);
- struct kvm_mmu_page *shadow_page;
- u64 shadow_pte;
- int metaphysical;
- gfn_t table_gfn;
-
- shadow_ent = ((u64 *)__va(shadow_addr)) + index;
- if (level == PT_PAGE_TABLE_LEVEL)
- break;
-
- if (largepage && level == PT_DIRECTORY_LEVEL)
- break;
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return 0;
- if (is_shadow_present_pte(*shadow_ent)
- && !is_large_pte(*shadow_ent)) {
- shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
- continue;
- }
+ if (is_large_pte(*sptep)) {
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ rmap_remove(vcpu->kvm, sptep);
+ }
- if (is_large_pte(*shadow_ent))
- rmap_remove(vcpu->kvm, shadow_ent);
-
- if (level - 1 == PT_PAGE_TABLE_LEVEL
- && walker->level == PT_DIRECTORY_LEVEL) {
- metaphysical = 1;
- if (!is_dirty_pte(walker->ptes[level - 1]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
- } else {
- metaphysical = 0;
- table_gfn = walker->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- metaphysical, access,
- shadow_ent);
- if (!metaphysical) {
- int r;
- pt_element_t curr_pte;
- r = kvm_read_guest_atomic(vcpu->kvm,
- walker->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != walker->ptes[level - 2]) {
- kvm_release_pfn_clean(pfn);
- return NULL;
- }
+ if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+ metaphysical = 1;
+ if (!is_dirty_pte(gw->ptes[level - 1]))
+ access &= ~ACC_WRITE_MASK;
+ table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+ } else {
+ metaphysical = 0;
+ table_gfn = gw->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
+ metaphysical, access, sptep);
+ if (!metaphysical) {
+ r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != gw->ptes[level - 2]) {
+ kvm_mmu_put_page(shadow_page, sptep);
+ kvm_release_pfn_clean(sw->pfn);
+ sw->sptep = NULL;
+ return 1;
}
- shadow_addr = __pa(shadow_page->spt);
- shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- set_shadow_pte(shadow_ent, shadow_pte);
}
- mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
- user_fault, write_fault,
- walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, largepage, walker->gfn, pfn, false);
+ spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *sptep = spte;
+ return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+ struct guest_walker *guest_walker,
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, pfn_t pfn)
+{
+ struct shadow_walker walker = {
+ .walker = { .entry = FNAME(shadow_walk_entry), },
+ .guest_walker = guest_walker,
+ .user_fault = user_fault,
+ .write_fault = write_fault,
+ .largepage = largepage,
+ .ptwrite = ptwrite,
+ .pfn = pfn,
+ };
+
+ if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+ return NULL;
+
+ walk_shadow(&walker.walker, vcpu, addr);
- return shadow_ent;
+ return walker.sptep;
}
/*
@@ -407,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
- down_read(&current->mm->mmap_sem);
if (walker.level == PT_DIRECTORY_LEVEL) {
gfn_t large_gfn;
large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +427,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
- up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
@@ -453,6 +462,31 @@ out_unlock:
return 0;
}
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+ struct kvm_vcpu *vcpu, u64 addr,
+ u64 *sptep, int level)
+{
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ if (is_shadow_present_pte(*sptep))
+ rmap_remove(vcpu->kvm, sptep);
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ return 1;
+ }
+ if (!is_shadow_present_pte(*sptep))
+ return 1;
+ return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct shadow_walker walker = {
+ .walker = { .entry = FNAME(shadow_invlpg_entry), },
+ };
+
+ walk_shadow(&walker.walker, vcpu, gva);
+}
+
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
@@ -499,12 +533,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
}
}
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, offset, nr_present;
+
+ offset = nr_present = 0;
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn = sp->gfns[i];
+
+ if (!is_shadow_present_pte(sp->spt[i]))
+ continue;
+
+ pte_gpa = gfn_to_gpa(sp->gfn);
+ pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -EINVAL;
+
+ if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+ !(gpte & PT_ACCESSED_MASK)) {
+ u64 nonpresent;
+
+ rmap_remove(vcpu->kvm, &sp->spt[i]);
+ if (is_present_pte(gpte))
+ nonpresent = shadow_trap_nonpresent_pte;
+ else
+ nonpresent = shadow_notrap_nonpresent_pte;
+ set_shadow_pte(&sp->spt[i], nonpresent);
+ continue;
+ }
+
+ nr_present++;
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+ is_dirty_pte(gpte), 0, gfn,
+ spte_to_pfn(sp->spt[i]), true, false);
+ }
+
+ return !nr_present;
+}
+
#undef pt_element_t
#undef guest_walker
+#undef shadow_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
-#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778c..9c4ce657d963 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
#include "kvm_svm.h"
#include "irq.h"
#include "mmu.h"
+#include "kvm_cache_regs.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
#define DR7_GD_MASK (1 << 13)
#define DR6_BD_MASK (1 << 13)
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
+#define SVM_FEATURE_SVML (1 << 2)
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
- if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
- __func__,
- svm->vmcb->save.rip,
- svm->next_rip);
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
- vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+ kvm_rip_write(vcpu, svm->next_rip);
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
+ (1ULL << INTERCEPT_INVLPG) |
(1ULL << INTERCEPT_INVLPGA) |
(1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
save->dr7 = 0x400;
save->rflags = 2;
save->rip = 0x0000fff0;
+ svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+ control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_INVLPG));
control->intercept_exceptions &= ~(1 << PF_VECTOR);
control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcb(svm);
if (vcpu->vcpu_id != 0) {
- svm->vmcb->save.rip = 0;
+ kvm_rip_write(vcpu, 0);
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
return 0;
}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
rdtscll(vcpu->arch.host_tsc);
}
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
if (npt_enabled)
svm_flush_tlb(&svm->vcpu);
- if (event_injection)
+ if (!npt_enabled && event_injection)
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 3;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
kvm_emulate_hypercall(&svm->vcpu);
return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
kvm_emulate_cpuid(&svm->vcpu);
return 1;
}
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+ return 1;
+}
+
static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
(u32)(data >> 32), handler);
- svm->vmcb->save.rax = data & 0xffffffff;
+ svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vmcb->save.rax & -1u)
+ u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
handler);
- svm->next_rip = svm->vmcb->save.rip + 2;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
kvm_inject_gp(&svm->vcpu, 0);
else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
[SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = emulate_on_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invalid_op_interception,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
+ ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
+
static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u16 gs_selector;
u16 ldt_selector;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
pre_svm_run(svm);
sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
local_irq_enable();
asm volatile (
+ "push %%"R"bp; \n\t"
+ "mov %c[rbx](%[svm]), %%"R"bx \n\t"
+ "mov %c[rcx](%[svm]), %%"R"cx \n\t"
+ "mov %c[rdx](%[svm]), %%"R"dx \n\t"
+ "mov %c[rsi](%[svm]), %%"R"si \n\t"
+ "mov %c[rdi](%[svm]), %%"R"di \n\t"
+ "mov %c[rbp](%[svm]), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
- "push %%rbp; \n\t"
-#else
- "push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
- "mov %c[rbx](%[svm]), %%rbx \n\t"
- "mov %c[rcx](%[svm]), %%rcx \n\t"
- "mov %c[rdx](%[svm]), %%rdx \n\t"
- "mov %c[rsi](%[svm]), %%rsi \n\t"
- "mov %c[rdi](%[svm]), %%rdi \n\t"
- "mov %c[rbp](%[svm]), %%rbp \n\t"
"mov %c[r8](%[svm]), %%r8 \n\t"
"mov %c[r9](%[svm]), %%r9 \n\t"
"mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%[svm]), %%r13 \n\t"
"mov %c[r14](%[svm]), %%r14 \n\t"
"mov %c[r15](%[svm]), %%r15 \n\t"
-#else
- "mov %c[rbx](%[svm]), %%ebx \n\t"
- "mov %c[rcx](%[svm]), %%ecx \n\t"
- "mov %c[rdx](%[svm]), %%edx \n\t"
- "mov %c[rsi](%[svm]), %%esi \n\t"
- "mov %c[rdi](%[svm]), %%edi \n\t"
- "mov %c[rbp](%[svm]), %%ebp \n\t"
#endif
-#ifdef CONFIG_X86_64
- /* Enter guest mode */
- "push %%rax \n\t"
- "mov %c[vmcb](%[svm]), %%rax \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
- "pop %%rax \n\t"
-#else
/* Enter guest mode */
- "push %%eax \n\t"
- "mov %c[vmcb](%[svm]), %%eax \n\t"
+ "push %%"R"ax \n\t"
+ "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
__ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t"
- "pop %%eax \n\t"
-#endif
+ "pop %%"R"ax \n\t"
/* Save guest registers, load host registers */
+ "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+ "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+ "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+ "mov %%"R"si, %c[rsi](%[svm]) \n\t"
+ "mov %%"R"di, %c[rdi](%[svm]) \n\t"
+ "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64
- "mov %%rbx, %c[rbx](%[svm]) \n\t"
- "mov %%rcx, %c[rcx](%[svm]) \n\t"
- "mov %%rdx, %c[rdx](%[svm]) \n\t"
- "mov %%rsi, %c[rsi](%[svm]) \n\t"
- "mov %%rdi, %c[rdi](%[svm]) \n\t"
- "mov %%rbp, %c[rbp](%[svm]) \n\t"
"mov %%r8, %c[r8](%[svm]) \n\t"
"mov %%r9, %c[r9](%[svm]) \n\t"
"mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
-
- "pop %%rbp; \n\t"
-#else
- "mov %%ebx, %c[rbx](%[svm]) \n\t"
- "mov %%ecx, %c[rcx](%[svm]) \n\t"
- "mov %%edx, %c[rdx](%[svm]) \n\t"
- "mov %%esi, %c[rsi](%[svm]) \n\t"
- "mov %%edi, %c[rdi](%[svm]) \n\t"
- "mov %%ebp, %c[rbp](%[svm]) \n\t"
-
- "pop %%ebp; \n\t"
#endif
+ "pop %%"R"bp"
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
: "cc", "memory"
+ , R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64
- , "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
- , "ebx", "ecx", "edx" , "esi", "edi"
#endif
);
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
load_db_regs(svm->host_db_regs);
vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
write_dr6(svm->host_dr6);
write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->next_rip = 0;
}
+#undef R
+
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
- .cache_regs = svm_cache_regs,
- .decache_regs = svm_decache_regs,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b562..a4018b01e1f9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
#include <asm/io.h>
#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
static int enable_ept = 1;
module_param(enable_ept, bool, 0);
+static int emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, 0);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
struct list_head local_vcpus_link;
+ unsigned long host_rsp;
int launched;
u8 fail;
u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
} irq;
} rmode;
int vpid;
+ bool emulation_required;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled)
- eb |= 1u << 1;
+ eb |= 1u << DB_VECTOR;
if (vcpu->arch.rmode.active)
eb = ~0;
if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
unsigned long rip;
u32 interruptibility;
- rip = vmcs_readl(GUEST_RIP);
+ rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs_writel(GUEST_RIP, rip);
+ kvm_rip_write(vcpu, rip);
/*
* We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (has_error_code)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = nr;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ if (nr == BP_VECTOR)
+ vmx->rmode.irq.rip++;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ nr | INTR_TYPE_SOFT_INTR
+ | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+ | INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ return;
+ }
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_EXCEPTION
| (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
- if (has_error_code)
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
}
static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ return false;
}
/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
return ret;
}
-/*
- * Sync the rsp and rip registers into the vcpu structure. This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
- vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs. Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ default:
+ break;
+ }
}
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
static int vmx_get_irq(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 idtv_info_field;
-
- idtv_info_field = vmx->idt_vectoring_info;
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- if (is_external_interrupt(idtv_info_field))
- return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
- else
- printk(KERN_DEBUG "pending exception: not handled yet\n");
- }
- return -1;
+ if (!vcpu->arch.interrupt.pending)
+ return -1;
+ return vcpu->arch.interrupt.nr;
}
static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- == MSR_IA32_FEATURE_CONTROL_LOCKED;
+ return (msr & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ == FEATURE_CONTROL_LOCKED;
/* locked but not enabled */
}
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
- if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
- != (MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+ if ((old & (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
+ != (FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED))
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
- MSR_IA32_FEATURE_CONTROL_LOCKED |
- MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+ FEATURE_CONTROL_LOCKED |
+ FEATURE_CONTROL_VMXON_ENABLED);
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING;
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INVLPG_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses don't need to cause VM Exits when EPT enabled */
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ enabled */
min &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->emulation_required = 1;
vcpu->arch.rmode.active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
+ if (emulate_invalid_guest_state)
+ return;
+
fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->emulation_required = 1;
vcpu->arch.rmode.active = 1;
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
+ if (emulate_invalid_guest_state)
+ goto continue_rmode;
+
vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
vmcs_write32(GUEST_SS_LIMIT, 0xffff);
vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
+continue_rmode:
kvm_mmu_reset_context(vcpu);
init_rmode(vcpu->kvm);
}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+ if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else if (cs.type & AR_TYPE_CODE_MASK) {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+ if ((ss.type != 3) || (ss.type != 7))
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SELECTOR_RPL_MASK;
+
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /* TODO: Add other members to kvm_segment_field to allow checking for other access
+ * rights flags
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SELECTOR_RPL_MASK) ==
+ (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ /* real mode guest state checks */
+ if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
if (r < 0)
goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(u16));
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
- vmcs_write32(sf->ar_bytes, 0x93);
+ vmcs_write32(sf->ar_bytes, 0xf3);
}
static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
if (r)
goto out;
- down_read(&current->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
- up_read(&current->mm->mmap_sem);
out:
up_write(&kvm->slots_lock);
return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
if (r)
goto out;
- down_read(&current->mm->mmap_sem);
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
out:
up_write(&kvm->slots_lock);
return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
}
if (!vm_need_ept())
exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING;
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr;
int ret;
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
down_read(&vcpu->kvm->slots_lock);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
fx_init(&vmx->vcpu);
+ seg_setup(VCPU_SREG_CS);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, 0x02);
if (vmx->vcpu.vcpu_id == 0)
- vmcs_writel(GUEST_RIP, 0xfff0);
+ kvm_rip_write(vcpu, 0xfff0);
else
- vmcs_writel(GUEST_RIP, 0);
- vmcs_writel(GUEST_RSP, 0);
+ kvm_rip_write(vcpu, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
ret = 0;
+ /* HACK: Don't enable emulation on guest boot/reset */
+ vmx->emulation_required = 0;
+
out:
up_read(&vcpu->kvm->slots_lock);
return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+ ++vcpu->stat.irq_injections;
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
- vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
- vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
- vcpu->arch.nmi_pending = 0;
}
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary);
- vmx_inject_irq(vcpu, irq);
+ kvm_queue_interrupt(vcpu, irq);
}
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
if (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary &&
- !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
- /*
- * If interrupts enabled, and not blocked by sti or mov ss. Good.
- */
+ vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
kvm_do_inject_irq(vcpu);
+ if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
- if (!vcpu->arch.rmode.active)
- return 0;
-
/*
* Instruction with address size override prefix opcode 0x67
* Cause the #SS fault with 0 error code in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
return 1;
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ switch (vec) {
+ case DE_VECTOR:
+ case DB_VECTOR:
+ case BP_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+ }
return 0;
}
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
error_code = 0;
- rip = vmcs_readl(GUEST_RIP);
+ rip = kvm_rip_read(vcpu);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler);
- if (vect_info & VECTORING_INFO_VALID_MASK)
+ if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
- (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
+ KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
+ (u32)kvm_register_read(vcpu, reg),
+ (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
+ handler);
switch (cr) {
case 0:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 3:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 4:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
return 1;
case 8:
- vcpu_load_rsp_rip(vcpu);
- kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
};
break;
case 2: /* clts */
- vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu);
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = vcpu->arch.cr3;
- vcpu_put_rsp_rip(vcpu);
+ kvm_register_write(vcpu, reg, vcpu->arch.cr3);
KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
- (u32)vcpu->arch.regs[reg],
- (u32)((u64)vcpu->arch.regs[reg] >> 32),
+ (u32)kvm_register_read(vcpu, reg),
+ (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
handler);
skip_emulated_instruction(vcpu);
return 1;
case 8:
- vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
- vcpu_put_rsp_rip(vcpu);
+ kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
- (u32)vcpu->arch.regs[reg], handler);
+ (u32)kvm_register_read(vcpu, reg), handler);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
dr = exit_qualification & 7;
reg = (exit_qualification >> 8) & 15;
- vcpu_load_rsp_rip(vcpu);
if (exit_qualification & 16) {
/* mov from dr */
switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
default:
val = 0;
}
- vcpu->arch.regs[reg] = val;
+ kvm_register_write(vcpu, reg, val);
KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
/* mov to dr */
}
- vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
+ struct kvm_run *kvm_run)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int err;
+
+ preempt_enable();
+ local_irq_enable();
+
+ while (!guest_state_valid(vcpu)) {
+ err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+ switch (err) {
+ case EMULATE_DONE:
+ break;
+ case EMULATE_DO_MMIO:
+ kvm_report_emulation_failure(vcpu, "mmio");
+ /* TODO: Handle MMIO */
+ return;
+ default:
+ kvm_report_emulation_failure(vcpu, "emulation failure");
+ return;
+ }
+
+ if (signal_pending(current))
+ break;
+ if (need_resched())
+ schedule();
+ }
+
+ local_irq_disable();
+ preempt_disable();
+
+ /* Guest state should be valid now, no more emulation should be needed */
+ vmx->emulation_required = 0;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
- KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
- (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+ KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
+ (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
@@ -2829,88 +3087,94 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
enable_irq_window(vcpu);
}
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 idtv_info_field, intr_info_field, exit_intr_info_field;
- int vector;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ bool unblock_nmi;
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+ u32 error;
- update_tpr_threshold(vcpu);
-
- intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
- exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
- idtv_info_field = vmx->idt_vectoring_info;
- if (intr_info_field & INTR_INFO_VALID_MASK) {
- if (idtv_info_field & INTR_INFO_VALID_MASK) {
- /* TODO: fault when IDT_Vectoring */
- if (printk_ratelimit())
- printk(KERN_ERR "Fault when IDT_Vectoring\n");
- }
- enable_intr_window(vcpu);
- return;
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ if (cpu_has_virtual_nmis()) {
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 25.7.1.2
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ */
+ if (unblock_nmi && vector != DF_VECTOR)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
}
- if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
- if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_EXT_INTR
- && vcpu->arch.rmode.active) {
- u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
- vmx_inject_irq(vcpu, vect);
- enable_intr_window(vcpu);
- return;
- }
-
- KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
+ idt_vectoring_info = vmx->idt_vectoring_info;
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+ if (vmx->vcpu.arch.nmi_injected) {
/*
* SDM 3: 25.7.1.2
* Clear bit "block by NMI" before VM entry if a NMI delivery
* faulted.
*/
- if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
- == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- ~GUEST_INTR_STATE_NMI);
-
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
- & ~INTR_INFO_RESVD_BITS_MASK);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
- if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs_read32(IDT_VECTORING_ERROR_CODE));
- enable_intr_window(vcpu);
- return;
+ if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->vcpu.arch.nmi_injected = false;
+ }
+ kvm_clear_exception_queue(&vmx->vcpu);
+ if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ kvm_queue_exception_e(&vmx->vcpu, vector, error);
+ } else
+ kvm_queue_exception(&vmx->vcpu, vector);
+ vmx->idt_vectoring_info = 0;
}
+ kvm_clear_interrupt_queue(&vmx->vcpu);
+ if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+ kvm_queue_interrupt(&vmx->vcpu, vector);
+ vmx->idt_vectoring_info = 0;
+ }
+}
+
+static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+{
+ update_tpr_threshold(vcpu);
+
if (cpu_has_virtual_nmis()) {
- /*
- * SDM 3: 25.7.1.2
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- */
- if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
- (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
- GUEST_INTR_STATE_NMI);
- else if (vcpu->arch.nmi_pending) {
- if (vmx_nmi_enabled(vcpu))
- vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vmx_nmi_enabled(vcpu)) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_intr_window(vcpu);
+ return;
+ }
+ }
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
enable_intr_window(vcpu);
return;
}
-
}
- if (!kvm_cpu_has_interrupt(vcpu))
- return;
- if (vmx_irq_enabled(vcpu)) {
- vector = kvm_cpu_get_interrupt(vcpu);
- vmx_inject_irq(vcpu, vector);
- kvm_timer_intr_post(vcpu, vector);
- } else
- enable_irq_window(vcpu);
+ if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
+ if (vmx_irq_enabled(vcpu))
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+ else
+ enable_irq_window(vcpu);
+ }
+ if (vcpu->arch.interrupt.pending) {
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+ }
}
/*
@@ -2922,9 +3186,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
vmx->rmode.irq.pending = 0;
- if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+ if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
return;
- vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+ kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3200,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info;
+ /* Handle invalid guest state instead of entering VMX */
+ if (vmx->emulation_required && emulate_invalid_guest_state) {
+ handle_invalid_guest_state(vcpu, kvm_run);
+ return;
+ }
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
/*
* Loading guest fpu may have cleared host cr0.ts
*/
@@ -2948,26 +3231,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm(
/* Store host registers */
-#ifdef CONFIG_X86_64
- "push %%rdx; push %%rbp;"
- "push %%rcx \n\t"
-#else
- "push %%edx; push %%ebp;"
- "push %%ecx \n\t"
-#endif
+ "push %%"R"dx; push %%"R"bp;"
+ "push %%"R"cx \n\t"
+ "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "je 1f \n\t"
+ "mov %%"R"sp, %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
+ "mov %c[cr2](%0), %%"R"ax \n\t"
+ "mov %%"R"ax, %%cr2 \n\t"
+ "mov %c[rax](%0), %%"R"ax \n\t"
+ "mov %c[rbx](%0), %%"R"bx \n\t"
+ "mov %c[rdx](%0), %%"R"dx \n\t"
+ "mov %c[rsi](%0), %%"R"si \n\t"
+ "mov %c[rdi](%0), %%"R"di \n\t"
+ "mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
- "mov %c[cr2](%0), %%rax \n\t"
- "mov %%rax, %%cr2 \n\t"
- "mov %c[rax](%0), %%rax \n\t"
- "mov %c[rbx](%0), %%rbx \n\t"
- "mov %c[rdx](%0), %%rdx \n\t"
- "mov %c[rsi](%0), %%rsi \n\t"
- "mov %c[rdi](%0), %%rdi \n\t"
- "mov %c[rbp](%0), %%rbp \n\t"
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3258,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
- "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
- "mov %c[cr2](%0), %%eax \n\t"
- "mov %%eax, %%cr2 \n\t"
- "mov %c[rax](%0), %%eax \n\t"
- "mov %c[rbx](%0), %%ebx \n\t"
- "mov %c[rdx](%0), %%edx \n\t"
- "mov %c[rsi](%0), %%esi \n\t"
- "mov %c[rdi](%0), %%edi \n\t"
- "mov %c[rbp](%0), %%ebp \n\t"
- "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif
+ "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
/* Enter guest mode */
"jne .Llaunched \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3268,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
+ "xchg %0, (%%"R"sp) \n\t"
+ "mov %%"R"ax, %c[rax](%0) \n\t"
+ "mov %%"R"bx, %c[rbx](%0) \n\t"
+ "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+ "mov %%"R"dx, %c[rdx](%0) \n\t"
+ "mov %%"R"si, %c[rsi](%0) \n\t"
+ "mov %%"R"di, %c[rdi](%0) \n\t"
+ "mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
- "xchg %0, (%%rsp) \n\t"
- "mov %%rax, %c[rax](%0) \n\t"
- "mov %%rbx, %c[rbx](%0) \n\t"
- "pushq (%%rsp); popq %c[rcx](%0) \n\t"
- "mov %%rdx, %c[rdx](%0) \n\t"
- "mov %%rsi, %c[rsi](%0) \n\t"
- "mov %%rdi, %c[rdi](%0) \n\t"
- "mov %%rbp, %c[rbp](%0) \n\t"
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3285,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
- "mov %%cr2, %%rax \n\t"
- "mov %%rax, %c[cr2](%0) \n\t"
-
- "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
-#else
- "xchg %0, (%%esp) \n\t"
- "mov %%eax, %c[rax](%0) \n\t"
- "mov %%ebx, %c[rbx](%0) \n\t"
- "pushl (%%esp); popl %c[rcx](%0) \n\t"
- "mov %%edx, %c[rdx](%0) \n\t"
- "mov %%esi, %c[rsi](%0) \n\t"
- "mov %%edi, %c[rdi](%0) \n\t"
- "mov %%ebp, %c[rbp](%0) \n\t"
- "mov %%cr2, %%eax \n\t"
- "mov %%eax, %c[cr2](%0) \n\t"
-
- "pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif
+ "mov %%cr2, %%"R"ax \n\t"
+ "mov %%"R"ax, %c[cr2](%0) \n\t"
+
+ "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3314,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory"
+ , R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
- , "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
- , "ebx", "edi", "rsi"
#endif
);
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+ vcpu->arch.regs_dirty = 0;
+
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3080,8 +3342,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");
}
+
+ vmx_complete_interrupts(vmx);
}
+#undef R
+#undef Q
+
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3491,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
- .cache_regs = vcpu_load_rsp_rip,
- .decache_regs = vcpu_put_rsp_rip,
+ .cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
@@ -3300,7 +3566,8 @@ static int __init vmx_init(void)
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK |
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+ VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
+ VMX_EPT_IGMT_BIT);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e25995b65b..ec5edc339da6 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
-
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
@@ -355,6 +352,7 @@ enum vmcs_field {
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
+#define VMX_EPT_IGMT_BIT (1ull << 6)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc6aeb3..f1f8ff2f1fa2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/clocksource.h>
+#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/intel-iommu.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
};
-
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return;
}
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+ __func__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
__func__, data);
break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
break;
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
- down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
if (is_error_page(vcpu->arch.time_page)) {
kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+8:
case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16:
+ case MSR_IA32_MC0_MISC+20:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
data = 0;
break;
case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PV_MMU:
r = !tdp_enabled;
break;
+ case KVM_CAP_IOMMU:
+ r = intel_iommu_found();
+ break;
default:
r = 0;
break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
+ struct kvm_lapic_state *lapic = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
+ lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &lapic, sizeof lapic))
+ if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
}
case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
-
+ lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
r = -EFAULT;
- if (copy_from_user(&lapic, argp, sizeof lapic))
+ if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r)
goto out;
r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
out:
+ if (lapic)
+ kfree(lapic);
return r;
}
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -EINVAL;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_memory_alias alias;
+ } u;
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
-
+ case KVM_SET_MEMORY_ALIAS:
r = -EFAULT;
- if (copy_from_user(&alias, argp, sizeof alias))
+ if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
if (r)
goto out;
break;
- }
case KVM_CREATE_IRQCHIP:
r = -ENOMEM;
kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.irq,
- irq_event.level);
- kvm_ioapic_set_irq(kvm->arch.vioapic,
- irq_event.irq,
- irq_event.level);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event.irq, irq_event.level);
mutex_unlock(&kvm->lock);
r = 0;
}
@@ -1713,65 +1751,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto get_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
- goto out;
+ goto get_irqchip_out;
r = -EFAULT;
- if (copy_to_user(argp, &chip, sizeof chip))
- goto out;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto set_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
- goto out;
+ goto set_irqchip_out;
r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_GET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_get_pit(kvm, &ps);
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &ps, sizeof ps))
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
goto out;
r = 0;
break;
}
case KVM_SET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_set_pit(kvm, &ps);
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
if (r)
goto out;
r = 0;
@@ -2018,9 +2068,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
val = *(u64 *)new;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2088,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
+ kvm_mmu_invlpg(vcpu, address);
return X86EMUL_CONTINUE;
}
@@ -2080,7 +2129,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
u8 opcodes[4];
- unsigned long rip = vcpu->arch.rip;
+ unsigned long rip = kvm_rip_read(vcpu);
unsigned long rip_linear;
if (!printk_ratelimit())
@@ -2102,6 +2151,14 @@ static struct x86_emulate_ops emulate_ops = {
.cmpxchg_emulated = emulator_cmpxchg_emulated,
};
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+ kvm_register_read(vcpu, VCPU_REGS_RAX);
+ kvm_register_read(vcpu, VCPU_REGS_RSP);
+ kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vcpu->arch.regs_dirty = ~0;
+}
+
int emulate_instruction(struct kvm_vcpu *vcpu,
struct kvm_run *run,
unsigned long cr2,
@@ -2111,8 +2168,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
int r;
struct decode_cache *c;
+ kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
- kvm_x86_ops->cache_regs(vcpu);
+ /*
+ * TODO: fix x86_emulate.c to use guest_read/write_register
+ * instead of direct ->regs accesses, can save hundred cycles
+ * on Intel for instructions that don't read/change RSP, for
+ * for example.
+ */
+ cache_all_regs(vcpu);
vcpu->mmio_is_write = 0;
vcpu->arch.pio.string = 0;
@@ -2172,7 +2236,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DO_MMIO;
}
- kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (vcpu->mmio_is_write) {
@@ -2225,20 +2288,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
struct kvm_pio_request *io = &vcpu->arch.pio;
long delta;
int r;
-
- kvm_x86_ops->cache_regs(vcpu);
+ unsigned long val;
if (!io->string) {
- if (io->in)
- memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
- io->size);
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(&val, vcpu->arch.pio_data, io->size);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ }
} else {
if (io->in) {
r = pio_copy_data(vcpu);
- if (r) {
- kvm_x86_ops->cache_regs(vcpu);
+ if (r)
return r;
- }
}
delta = 1;
@@ -2248,19 +2310,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
* The size of the register should really depend on
* current address size.
*/
- vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+ val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ val -= delta;
+ kvm_register_write(vcpu, VCPU_REGS_RCX, val);
}
if (io->down)
delta = -delta;
delta *= io->size;
- if (io->in)
- vcpu->arch.regs[VCPU_REGS_RDI] += delta;
- else
- vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+ if (io->in) {
+ val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+ } else {
+ val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ val += delta;
+ kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+ }
}
- kvm_x86_ops->decache_regs(vcpu);
-
io->count -= io->cur_count;
io->cur_count = 0;
@@ -2313,6 +2380,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port)
{
struct kvm_io_device *pio_dev;
+ unsigned long val;
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2401,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
handler);
- kvm_x86_ops->cache_regs(vcpu);
- memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+ val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ memcpy(vcpu->arch.pio_data, &val, 4);
kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2492,11 +2560,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
- up_read(&vcpu->kvm->slots_lock);
- kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- return -EINTR;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2582,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int r = 1;
- kvm_x86_ops->cache_regs(vcpu);
-
- nr = vcpu->arch.regs[VCPU_REGS_RAX];
- a0 = vcpu->arch.regs[VCPU_REGS_RBX];
- a1 = vcpu->arch.regs[VCPU_REGS_RCX];
- a2 = vcpu->arch.regs[VCPU_REGS_RDX];
- a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
@@ -2548,8 +2609,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
- vcpu->arch.regs[VCPU_REGS_RAX] = ret;
- kvm_x86_ops->decache_regs(vcpu);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
}
@@ -2559,6 +2619,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
char instruction[3];
int ret = 0;
+ unsigned long rip = kvm_rip_read(vcpu);
/*
@@ -2568,9 +2629,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
*/
kvm_mmu_zap_all(vcpu->kvm);
- kvm_x86_ops->cache_regs(vcpu);
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+ if (emulator_write_emulated(rip, instruction, 3, vcpu)
!= X86EMUL_CONTINUE)
ret = -EFAULT;
@@ -2700,13 +2760,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
u32 function, index;
struct kvm_cpuid_entry2 *e, *best;
- kvm_x86_ops->cache_regs(vcpu);
- function = vcpu->arch.regs[VCPU_REGS_RAX];
- index = vcpu->arch.regs[VCPU_REGS_RCX];
- vcpu->arch.regs[VCPU_REGS_RAX] = 0;
- vcpu->arch.regs[VCPU_REGS_RBX] = 0;
- vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+ function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2783,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
best = e;
}
if (best) {
- vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
- vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
- vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
- vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
}
- kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu);
KVMTRACE_5D(CPUID, vcpu, function,
- (u32)vcpu->arch.regs[VCPU_REGS_RAX],
- (u32)vcpu->arch.regs[VCPU_REGS_RBX],
- (u32)vcpu->arch.regs[VCPU_REGS_RCX],
- (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
+ (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -2776,9 +2834,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
if (!apic || !apic->vapic_addr)
return;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
vcpu->arch.apic->vapic_page = page;
}
@@ -2796,28 +2852,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
up_read(&vcpu->kvm->slots_lock);
}
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_x86_ops->vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
-
- down_read(&vcpu->kvm->slots_lock);
- vapic_enter(vcpu);
-
-preempted:
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu);
@@ -2829,6 +2867,8 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2894,15 @@ again:
local_irq_disable();
- if (vcpu->requests || need_resched()) {
+ if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (signal_pending(current)) {
- local_irq_enable();
- preempt_enable();
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- goto out;
- }
+ if (vcpu->guest_debug.enabled)
+ kvm_x86_ops->guest_debug_pre(vcpu);
vcpu->guest_mode = 1;
/*
@@ -2917,8 +2951,8 @@ again:
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
- kvm_x86_ops->cache_regs(vcpu);
- profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
}
if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2961,63 @@ again:
kvm_lapic_sync_from_vapic(vcpu);
r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+ return r;
+}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- goto out;
- }
- if (!need_resched())
- goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_x86_ops->vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
-out:
- up_read(&vcpu->kvm->slots_lock);
- if (r > 0) {
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- goto preempted;
+ down_read(&vcpu->kvm->slots_lock);
+ vapic_enter(vcpu);
+
+ r = 1;
+ while (r > 0) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ r = vcpu_enter_guest(vcpu, kvm_run);
+ else {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_vcpu_block(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+ r = -EINTR;
+ }
+
+ if (r > 0) {
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ }
+ }
}
+ up_read(&vcpu->kvm->slots_lock);
post_kvm_run_save(vcpu, kvm_run);
vapic_exit(vcpu);
@@ -2966,6 +3037,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN;
goto out;
}
@@ -2999,11 +3071,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
#endif
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
- kvm_x86_ops->cache_regs(vcpu);
- vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
- kvm_x86_ops->decache_regs(vcpu);
- }
+ if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+ kvm_register_write(vcpu, VCPU_REGS_RAX,
+ kvm_run->hypercall.ret);
r = __vcpu_run(vcpu, kvm_run);
@@ -3019,28 +3089,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
vcpu_load(vcpu);
- kvm_x86_ops->cache_regs(vcpu);
-
- regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
- regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
- regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
- regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
- regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
- regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
- regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
#ifdef CONFIG_X86_64
- regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
- regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
- regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
- regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
- regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
- regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
- regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
- regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
#endif
- regs->rip = vcpu->arch.rip;
+ regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_x86_ops->get_rflags(vcpu);
/*
@@ -3058,29 +3126,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
vcpu_load(vcpu);
- vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
- vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
- vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
- vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
- vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
- vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
- vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
- vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
#ifdef CONFIG_X86_64
- vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
- vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
- vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
- vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
- vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
- vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
- vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
- vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+
#endif
- vcpu->arch.rip = regs->rip;
+ kvm_rip_write(vcpu, regs->rip);
kvm_x86_ops->set_rflags(vcpu, regs->rflags);
- kvm_x86_ops->decache_regs(vcpu);
vcpu->arch.exception.pending = false;
@@ -3294,11 +3362,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return 0;
+}
+
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
{
struct kvm_segment kvm_seg;
+ if (!(vcpu->arch.cr0 & X86_CR0_PE))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1;
kvm_seg.type |= type_bits;
@@ -3316,17 +3406,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
struct tss_segment_32 *tss)
{
tss->cr3 = vcpu->arch.cr3;
- tss->eip = vcpu->arch.rip;
+ tss->eip = kvm_rip_read(vcpu);
tss->eflags = kvm_x86_ops->get_rflags(vcpu);
- tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
- tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
- tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
- tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
- tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
- tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
- tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
-
+ tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3431,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
{
kvm_set_cr3(vcpu, tss->cr3);
- vcpu->arch.rip = tss->eip;
+ kvm_rip_write(vcpu, tss->eip);
kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
- vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
- vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
- vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
- vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
- vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
- vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
- vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
- vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
return 1;
@@ -3380,16 +3469,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
static void save_state_to_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
- tss->ip = vcpu->arch.rip;
+ tss->ip = kvm_rip_read(vcpu);
tss->flag = kvm_x86_ops->get_rflags(vcpu);
- tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
- tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
- tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
- tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
- tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
- tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
- tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
- tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+ tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3491,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
static int load_state_from_tss16(struct kvm_vcpu *vcpu,
struct tss_segment_16 *tss)
{
- vcpu->arch.rip = tss->ip;
+ kvm_rip_write(vcpu, tss->ip);
kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
- vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
- vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
- vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
- vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
- vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
- vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
- vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
- vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
return 1;
@@ -3534,7 +3623,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
}
kvm_x86_ops->skip_emulated_instruction(vcpu);
- kvm_x86_ops->cache_regs(vcpu);
if (nseg_desc.type & 8)
ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3647,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
tr_seg.type = 11;
kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
out:
- kvm_x86_ops->decache_regs(vcpu);
return ret;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3709,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
pr_debug("Set back pending irq %d\n",
pending_vec);
}
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3722,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !(vcpu->arch.cr0 & X86_CR0_PE))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
vcpu_put(vcpu);
return 0;
@@ -3918,6 +4012,10 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+
+ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
return kvm;
}
@@ -3950,6 +4048,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_iommu_unmap_guest(kvm);
+ kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -3981,7 +4081,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(&current->mm->mmap_sem);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000000..6a4be78a7384
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+{
+ vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.pending = false;
+}
+
+#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b1..ea051173b0da 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
#define DPRINTF(_f, _a ...) printf(_f , ## _a)
#else
#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
#define DPRINTF(x...) do {} while (0)
#endif
#include <linux/module.h>
@@ -46,25 +47,26 @@
#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
-#define DstMask (3<<1)
+#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstMask (7<<1)
/* Source operand type. */
-#define SrcNone (0<<3) /* No source operand. */
-#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<3) /* Register operand. */
-#define SrcMem (2<<3) /* Memory operand. */
-#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
-#define SrcImm (5<<3) /* Immediate operand. */
-#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
-#define SrcMask (7<<3)
+#define SrcNone (0<<4) /* No source operand. */
+#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
+#define SrcReg (1<<4) /* Register operand. */
+#define SrcMem (2<<4) /* Memory operand. */
+#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
+#define SrcImm (5<<4) /* Immediate operand. */
+#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcMask (7<<4)
/* Generic ModRM decode. */
-#define ModRM (1<<6)
+#define ModRM (1<<7)
/* Destination is only written; never read. */
-#define Mov (1<<7)
-#define BitOp (1<<8)
-#define MemAbs (1<<9) /* Memory operand is absolute displacement */
-#define String (1<<10) /* String instruction (rep capable) */
-#define Stack (1<<11) /* Stack instruction (push/pop) */
+#define Mov (1<<8)
+#define BitOp (1<<9)
+#define MemAbs (1<<10) /* Memory operand is absolute displacement */
+#define String (1<<12) /* String instruction (rep capable) */
+#define Stack (1<<13) /* Stack instruction (push/pop) */
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- SrcImmByte, SrcImm, 0, 0,
+ DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
/* 0x38 - 0x3F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+ 0, 0,
/* 0x40 - 0x47 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | String, ImplicitOps | String,
- /* 0xB0 - 0xBF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+ /* 0xB0 - 0xB7 */
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ /* 0xB8 - 0xBF */
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+ DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
/* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
/* 0xD8 - 0xDF */
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
- 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xE8 - 0xEF */
ImplicitOps | Stack, SrcImm | ImplicitOps,
ImplicitOps, SrcImmByte | ImplicitOps,
- 0, 0, 0, 0,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
- 0, 0, Group | Group4, Group | Group5,
+ ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
};
static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group3*8] =
- DstMem | SrcImm | ModRM | SrcImm, 0,
- DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group4*8] =
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0, 0, 0,
[Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
- SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem, ModRM | SrcMem,
SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache));
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
@@ -1048,6 +1062,23 @@ done_prefixes:
}
c->dst.type = OP_MEM;
break;
+ case DstAcc:
+ c->dst.type = OP_REG;
+ c->dst.bytes = c->op_bytes;
+ c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->op_bytes) {
+ case 1:
+ c->dst.val = *(u8 *)c->dst.ptr;
+ break;
+ case 2:
+ c->dst.val = *(u16 *)c->dst.ptr;
+ break;
+ case 4:
+ c->dst.val = *(u32 *)c->dst.ptr;
+ break;
+ }
+ c->dst.orig_val = c->dst.val;
+ break;
}
if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
case 1: /* dec */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
+ case 2: /* call near abs */ {
+ long int old_eip;
+ old_eip = c->eip;
+ c->eip = c->src.val;
+ c->src.val = old_eip;
+ emulate_push(ctxt);
+ break;
+ }
case 4: /* jmp abs */
c->eip = c->src.val;
break;
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u64 msr_data;
unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
+ unsigned int port;
+ int io_dir_in;
int rc = 0;
/* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
if (c->regs[VCPU_REGS_RCX] == 0) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
/* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
(c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == 0)) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
if ((c->rep_prefix == REPNE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
}
c->regs[VCPU_REGS_RCX]--;
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
- case 0x20 ... 0x23:
+ case 0x20 ... 0x25:
and: /* and */
emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
break;
- case 0x24: /* and al imm8 */
- c->dst.type = OP_REG;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- c->dst.val = *(u8 *)c->dst.ptr;
- c->dst.bytes = 1;
- c->dst.orig_val = c->dst.val;
- goto and;
- case 0x25: /* and ax imm16, or eax imm32 */
- c->dst.type = OP_REG;
- c->dst.bytes = c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- if (c->op_bytes == 2)
- c->dst.val = *(u16 *)c->dst.ptr;
- else
- c->dst.val = *(u32 *)c->dst.ptr;
- c->dst.orig_val = c->dst.val;
- goto and;
case 0x28 ... 0x2d:
sub: /* sub */
emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
- case 0xb8: /* mov r, imm */
+ case 0xb0 ... 0xbf: /* mov r, imm */
goto mov;
case 0xc0 ... 0xc1:
emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt);
break;
+ case 0xe4: /* inb */
+ case 0xe5: /* in */
+ port = insn_fetch(u8, 1, c->eip);
+ io_dir_in = 1;
+ goto do_io;
+ case 0xe6: /* outb */
+ case 0xe7: /* out */
+ port = insn_fetch(u8, 1, c->eip);
+ io_dir_in = 0;
+ goto do_io;
case 0xe8: /* call (near) */ {
long int rel;
switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
jmp_rel(c, c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
+ case 0xec: /* in al,dx */
+ case 0xed: /* in (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 1;
+ goto do_io;
+ case 0xee: /* out al,dx */
+ case 0xef: /* out (e/r)ax,dx */
+ port = c->regs[VCPU_REGS_RDX];
+ io_dir_in = 0;
+ do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+ (c->d & ByteOp) ? 1 : c->op_bytes,
+ port) != 0) {
+ c->eip = saved_eip;
+ goto cannot_emulate;
+ }
+ return 0;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
break;
@@ -1754,6 +1804,14 @@ special_insn:
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= EFLG_DF;
+ c->dst.type = OP_NONE; /* Disable writeback. */
+ break;
case 0xfe ... 0xff: /* Grp4/Grp5 */
rc = emulate_grp45(ctxt, ops);
if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
/* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- ctxt->vcpu->arch.rip = c->eip;
+ kvm_rip_write(ctxt->vcpu, c->eip);
done:
if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
goto done;
/* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = ctxt->vcpu->arch.rip;
+ c->eip = kvm_rip_read(ctxt->vcpu);
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 65f0b8a47bed..50a779264bb1 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -367,10 +367,9 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
* lazily after a task switch, and Linux uses that gratefully, but wouldn't a
* name like "FPUTRAP bit" be a little less cryptic?
*
- * We store cr0 (and cr3) locally, because the Host never changes it. The
- * Guest sometimes wants to read it and we'd prefer not to bother the Host
- * unnecessarily. */
-static unsigned long current_cr0, current_cr3;
+ * We store cr0 locally because the Host never changes it. The Guest sometimes
+ * wants to read it and we'd prefer not to bother the Host unnecessarily. */
+static unsigned long current_cr0;
static void lguest_write_cr0(unsigned long val)
{
lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
@@ -399,17 +398,23 @@ static unsigned long lguest_read_cr2(void)
return lguest_data.cr2;
}
+/* See lguest_set_pte() below. */
+static bool cr3_changed = false;
+
/* cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. */
+ * cr0. Keep a local copy, and tell the Host when it changes. The only
+ * difference is that our local copy is in lguest_data because the Host needs
+ * to set it upon our initial hypercall. */
static void lguest_write_cr3(unsigned long cr3)
{
+ lguest_data.pgdir = cr3;
lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
- current_cr3 = cr3;
+ cr3_changed = true;
}
static unsigned long lguest_read_cr3(void)
{
- return current_cr3;
+ return lguest_data.pgdir;
}
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -498,13 +503,13 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* to forget all of them. Fortunately, this is very rare.
*
* ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow. So we don't even tell the Host
- * anything changed until we've done the first page table switch. */
+ * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
+ * the Host anything changed until we've done the first page table switch,
+ * which brings boot back to 0.25 seconds. */
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
*ptep = pteval;
- /* Don't bother with hypercall before initial setup. */
- if (current_cr3)
+ if (cr3_changed)
lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
}
@@ -521,7 +526,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+ lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
}
/* This is what happens after the Guest has removed a large number of entries.
@@ -581,8 +586,12 @@ static void __init lguest_init_IRQ(void)
for (i = 0; i < LGUEST_IRQS; i++) {
int vector = FIRST_EXTERNAL_VECTOR + i;
+ /* Some systems map "vectors" to interrupts weirdly. Lguest has
+ * a straightforward 1 to 1 mapping, so force that here. */
+ __get_cpu_var(vector_irq)[vector] = i;
if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[i]);
+ set_intr_gate(vector,
+ interrupt[vector-FIRST_EXTERNAL_VECTOR]);
set_irq_chip_and_handler_name(i, &lguest_irq_controller,
handle_level_irq,
"level");
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e7..10b9bd35a8ff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
movl $lguest_data - __PAGE_OFFSET, %edx
int $LGUEST_TRAP_ENTRY
- /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
- * instruction uses %esi implicitly as the source for the copy we're
- * about to do. */
- movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
-
- /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
- * This means the first 128M of kernel memory will be mapped at
- * PAGE_OFFSET where the kernel expects to run. This will get it far
- * enough through boot to switch to its own pagetables. */
- movl $32, %ecx
- movl %esi, %edi
- addl $((__PAGE_OFFSET >> 22) * 4), %edi
- rep
- movsl
-
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
#define __do_strncpy_from_user(dst, src, count, res) \
do { \
int __d0, __d1, __d2; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
" testl %1,%1\n" \
" jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
#define __do_clear_user(addr,size) \
do { \
int __d0; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
"0: rep; stosl\n" \
" movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
unsigned long
clear_user(void __user *to, unsigned long n)
{
- might_sleep();
+ might_fault();
if (access_ok(VERIFY_WRITE, to, n))
__do_clear_user(to, n);
return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
unsigned long mask = -__addr_ok(s);
unsigned long res, tmp;
- might_sleep();
+ might_fault();
__asm__ __volatile__(
" testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
#define __do_strncpy_from_user(dst,src,count,res) \
do { \
long __d0, __d1, __d2; \
- might_sleep(); \
+ might_fault(); \
__asm__ __volatile__( \
" testq %1,%1\n" \
" jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
unsigned long __clear_user(void __user *addr, unsigned long size)
{
long __d0;
- might_sleep();
+ might_fault();
/* no memory constraint because it doesn't change any memory gcc knows
about */
asm volatile(
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index df37fc9d6a26..3624a364b7f3 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -17,6 +17,7 @@
#include <asm/bigsmp/apic.h>
#include <asm/bigsmp/ipi.h>
#include <asm/mach-default/mach_mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -41,6 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
{ }
};
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ return cpumask_of_cpu(cpu);
+}
static int probe_bigsmp(void)
{
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
index 9e835a11a13a..e63a4a76d8cd 100644
--- a/arch/x86/mach-generic/default.c
+++ b/arch/x86/mach-generic/default.c
@@ -16,6 +16,7 @@
#include <asm/mach-default/mach_apic.h>
#include <asm/mach-default/mach_ipi.h>
#include <asm/mach-default/mach_mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
/* should be called last. */
static int probe_default(void)
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 6513d41ea21e..7b4e6d0d1690 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -16,7 +16,19 @@
#include <asm/es7000/apic.h>
#include <asm/es7000/ipi.h>
#include <asm/es7000/mpparse.h>
-#include <asm/es7000/wakecpu.h>
+#include <asm/mach-default/mach_wakecpu.h>
+
+void __init es7000_update_genapic_to_cluster(void)
+{
+ genapic->target_cpus = target_cpus_cluster;
+ genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
+ genapic->int_dest_mode = INT_DEST_MODE_CLUSTER;
+ genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER;
+
+ genapic->init_apic_ldr = init_apic_ldr_cluster;
+
+ genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster;
+}
static int probe_es7000(void)
{
@@ -75,4 +87,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}
#endif
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8cf58394975e..71a309b122e6 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index 5a7e4619e1c4..c346d9d0226f 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -15,6 +15,7 @@
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
extern struct genapic apic_numaq;
extern struct genapic apic_summit;
@@ -57,6 +58,9 @@ static int __init parse_apic(char *arg)
}
}
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
+
/* Parsed again by __setup for debug/verbose */
return 0;
}
@@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void)
* - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
*/
- if (!cmdline_apic && genapic == &apic_default)
+ if (!cmdline_apic && genapic == &apic_default) {
if (apic_bigsmp.probe()) {
genapic = &apic_bigsmp;
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
printk(KERN_INFO "Overriding APIC driver with %s\n",
genapic->name);
}
+ }
#endif
}
@@ -94,6 +101,9 @@ void __init generic_apic_probe(void)
/* Not visible without early console */
if (!apic_probe[i])
panic("Didn't find an APIC driver");
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
}
printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
}
@@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
if (!cmdline_apic) {
genapic = apic_probe[i];
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
genapic->name);
}
@@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
if (!cmdline_apic) {
genapic = apic_probe[i];
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
genapic->name);
}
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6ad6b67a723d..2c6d234e0009 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -16,6 +16,7 @@
#include <asm/summit/apic.h>
#include <asm/summit/ipi.h>
#include <asm/summit/mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
static int probe_summit(void)
{
@@ -23,4 +24,18 @@ static int probe_summit(void)
return 0;
}
+static cpumask_t vector_allocation_domain(int cpu)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt desitination.
+ */
+ cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+ return domain;
+}
+
struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index 6bbdd633864c..a580b9562e76 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -27,7 +27,7 @@ static struct irqaction irq2 = {
void __init intr_init_hook(void)
{
#ifdef CONFIG_SMP
- smp_intr_init();
+ voyager_smp_intr_init();
#endif
setup_irq(2, &irq2);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 199a5f4a873c..52145007bd7e 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -7,6 +7,7 @@
* This file provides all the same external entries as smp.c but uses
* the voyager hal to provide the functionality
*/
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
@@ -90,6 +91,7 @@ static void ack_vic_irq(unsigned int irq);
static void vic_enable_cpi(void);
static void do_boot_cpu(__u8 cpuid);
static void do_quad_bootstrap(void);
+static void initialize_secondary(void);
int hard_smp_processor_id(void);
int safe_smp_processor_id(void);
@@ -344,6 +346,12 @@ static void do_quad_bootstrap(void)
}
}
+void prefill_possible_map(void)
+{
+ /* This is empty on voyager because we need a much
+ * earlier detection which is done in find_smp_config */
+}
+
/* Set up all the basic stuff: read the SMP config and make all the
* SMP information reflect only the boot cpu. All others will be
* brought on-line later. */
@@ -413,6 +421,7 @@ void __init smp_store_cpu_info(int id)
struct cpuinfo_x86 *c = &cpu_data(id);
*c = boot_cpu_data;
+ c->cpu_index = id;
identify_secondary_cpu(c);
}
@@ -650,6 +659,8 @@ void __init smp_boot_cpus(void)
smp_tune_scheduling();
*/
smp_store_cpu_info(boot_cpu_id);
+ /* setup the jump vector */
+ initial_code = (unsigned long)initialize_secondary;
printk("CPU%d: ", boot_cpu_id);
print_cpu_info(&cpu_data(boot_cpu_id));
@@ -702,7 +713,7 @@ void __init smp_boot_cpus(void)
/* Reload the secondary CPUs task structure (this function does not
* return ) */
-void __init initialize_secondary(void)
+static void __init initialize_secondary(void)
{
#if 0
// AC kernels only
@@ -1248,7 +1259,7 @@ static void handle_vic_irq(unsigned int irq, struct irq_desc *desc)
#define QIC_SET_GATE(cpi, vector) \
set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
-void __init smp_intr_init(void)
+void __init voyager_smp_intr_init(void)
{
int i;
@@ -1483,7 +1494,7 @@ static void disable_local_vic_irq(unsigned int irq)
* the interrupt off to another CPU */
static void before_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
__u8 cpu = smp_processor_id();
_raw_spin_lock(&vic_irq_lock);
@@ -1518,7 +1529,7 @@ static void before_handle_vic_irq(unsigned int irq)
/* Finish the VIC interrupt: basically mask */
static void after_handle_vic_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ irq_desc_t *desc = irq_to_desc(irq);
_raw_spin_lock(&vic_irq_lock);
{
@@ -1780,6 +1791,17 @@ void __init smp_setup_processor_id(void)
x86_write_percpu(cpu_number, hard_smp_processor_id());
}
+static void voyager_send_call_func(cpumask_t callmask)
+{
+ __u32 mask = cpus_addr(callmask)[0] & ~(1 << smp_processor_id());
+ send_CPI(mask, VIC_CALL_FUNCTION_CPI);
+}
+
+static void voyager_send_call_func_single(int cpu)
+{
+ send_CPI(1 << cpu, VIC_CALL_FUNCTION_SINGLE_CPI);
+}
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu,
.smp_prepare_cpus = voyager_smp_prepare_cpus,
@@ -1789,6 +1811,6 @@ struct smp_ops smp_ops = {
.smp_send_stop = voyager_smp_send_stop,
.smp_send_reschedule = voyager_smp_send_reschedule,
- .send_call_func_ipi = native_send_call_func_ipi,
- .send_call_func_single_ipi = native_send_call_func_single_ipi,
+ .send_call_func_ipi = voyager_send_call_func,
+ .send_call_func_single_ipi = voyager_send_call_func_single,
};
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 59f89b434b45..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,16 +1,15 @@
obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o gup.o
-obj-$(CONFIG_X86_32) += pgtable_32.o
+obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
-mmiotrace-y := pf_in.o mmio-mod.o
+mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d18ea136d8a6..4c056b5d6a95 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -54,7 +54,7 @@
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
@@ -394,7 +394,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(KERN_CRIT "kernel tried to execute "
"NX-protected page - exploit attempt? "
- "(uid: %d)\n", current->uid);
+ "(uid: %d)\n", current_uid());
}
#endif
@@ -414,6 +414,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
{
unsigned long flags = oops_begin();
+ int sig = SIGKILL;
struct task_struct *tsk;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -424,8 +425,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
tsk->thread.trap_no = 14;
tsk->thread.error_code = error_code;
if (__die("Bad pagetable", regs, error_code))
- regs = NULL;
- oops_end(flags, regs, SIGKILL);
+ sig = 0;
+ oops_end(flags, regs, sig);
}
#endif
@@ -593,6 +594,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_64
unsigned long flags;
+ int sig;
#endif
tsk = current;
@@ -643,24 +645,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
-#ifdef CONFIG_X86_32
- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
- local_irq_enable();
-
/*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
*/
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
-#else /* CONFIG_X86_64 */
- if (likely(regs->flags & X86_EFLAGS_IF))
+ if (user_mode_vm(regs)) {
+ local_irq_enable();
+ error_code |= PF_USER;
+ } else if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
+#ifdef CONFIG_X86_64
if (unlikely(error_code & PF_RSVD))
pgtable_bad(address, regs, error_code);
+#endif
/*
* If we're in an interrupt, have no user context or are running in an
@@ -669,15 +670,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(in_atomic() || !mm))
goto bad_area_nosemaphore;
- /*
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
- */
- if (user_mode_vm(regs))
- error_code |= PF_USER;
again:
-#endif
- /* When running in the kernel we expect faults to occur only to
+ /*
+ * When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
* erroneous fault occurring in a code path which already holds mmap_sem
@@ -740,9 +735,6 @@ good_area:
goto bad_area;
}
-#ifdef CONFIG_X86_32
-survive:
-#endif
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@@ -866,11 +858,12 @@ no_context:
bust_spinlocks(0);
do_exit(SIGKILL);
#else
+ sig = SIGKILL;
if (__die("Oops", regs, error_code))
- regs = NULL;
+ sig = 0;
/* Executive summary in case the body of the oops scrolled away */
printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags, regs, SIGKILL);
+ oops_end(flags, regs, sig);
#endif
/*
@@ -881,12 +874,11 @@ out_of_memory:
up_read(&mm->mmap_sem);
if (is_global_init(tsk)) {
yield();
-#ifdef CONFIG_X86_32
- down_read(&mm->mmap_sem);
- goto survive;
-#else
+ /*
+ * Re-lookup the vma - in theory the vma tree might
+ * have changed:
+ */
goto again;
-#endif
}
printk("VM: killing process %s\n", tsk->comm);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 4ba373c5b8c8..be54176e9eb2 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -233,7 +233,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- start, len)))
+ (void __user *)start, len)))
goto slow_irqon;
/*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9af..bcc079c282dd 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
return (void*) vaddr;
}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
struct page *kmap_atomic_to_page(void *ptr)
{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8396868e82c5..8655b5bb0963 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
@@ -67,7 +68,7 @@ static unsigned long __meminitdata table_top;
static int __initdata after_init_bootmem;
-static __init void *alloc_low_page(unsigned long *phys)
+static __init void *alloc_low_page(void)
{
unsigned long pfn = table_end++;
void *adr;
@@ -77,7 +78,6 @@ static __init void *alloc_low_page(unsigned long *phys)
adr = __va(pfn * PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
- *phys = pfn * PAGE_SIZE;
return adr;
}
@@ -92,16 +92,17 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
- unsigned long phys;
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_init_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
- pmd_table = (pmd_t *)alloc_low_page(&phys);
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
@@ -126,10 +127,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!page_table)
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- } else {
- unsigned long phys;
- page_table = (pte_t *)alloc_low_page(&phys);
- }
+ } else
+ page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -334,7 +333,6 @@ int devmem_is_allowed(unsigned long pagenr)
return 0;
}
-#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
@@ -357,6 +355,7 @@ static void __init kmap_init(void)
kmap_prot = PAGE_KERNEL;
}
+#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
@@ -436,7 +435,6 @@ static void __init set_highmem_pages_init(void)
#endif /* !CONFIG_NUMA */
#else
-# define kmap_init() do { } while (0)
# define permanent_kmaps_init(pgd_base) do { } while (0)
# define set_highmem_pages_init() do { } while (0)
#endif /* CONFIG_HIGHMEM */
@@ -970,7 +968,7 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
- start_periodic_check_for_corruption();
+ pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
@@ -1041,11 +1039,25 @@ void __init mem_init(void)
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
if (boot_cpu_data.wp_works_ok < 0)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b8e461d49412..9f7a0d24d42a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -350,8 +350,10 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
* pagetable pages as RO. So assume someone who pre-setup
* these mappings are more intelligent.
*/
- if (pte_val(*pte))
+ if (pte_val(*pte)) {
+ pages++;
continue;
+ }
if (0)
printk(" pte=%p addr=%lx pte=%016lx\n",
@@ -418,8 +420,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
* not differ with respect to page frame and
* attributes.
*/
- if (page_size_mask & (1 << PG_LEVEL_2M))
+ if (page_size_mask & (1 << PG_LEVEL_2M)) {
+ pages++;
continue;
+ }
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
@@ -499,8 +503,10 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
* not differ with respect to page frame and
* attributes.
*/
- if (page_size_mask & (1 << PG_LEVEL_1G))
+ if (page_size_mask & (1 << PG_LEVEL_1G)) {
+ pages++;
continue;
+ }
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
@@ -665,12 +671,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long last_map_addr = 0;
unsigned long page_size_mask = 0;
unsigned long start_pfn, end_pfn;
+ unsigned long pos;
struct map_range mr[NR_RANGE_MR];
int nr_range, i;
int use_pse, use_gbpages;
- printk(KERN_INFO "init_memory_mapping\n");
+ printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
/*
* Find space for the kernel direct mapping tables.
@@ -704,35 +711,50 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
/* head if not big page alignment ?*/
start_pfn = start >> PAGE_SHIFT;
- end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ pos = start_pfn << PAGE_SHIFT;
+ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pos = end_pfn << PAGE_SHIFT;
+ }
/* big page (2M) range*/
- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
<< (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
<< (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
+ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
/* big page (1G) range */
- start_pfn = end_pfn;
- end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+ << (PUD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pos = end_pfn << PAGE_SHIFT;
+ }
/* tail is not big page (1G) alignment */
- start_pfn = end_pfn;
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
- page_size_mask & (1<<PG_LEVEL_2M));
+ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+ << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pos = end_pfn << PAGE_SHIFT;
+ }
/* tail is not big page (2M) alignment */
- start_pfn = end_pfn;
+ start_pfn = pos>>PAGE_SHIFT;
end_pfn = end>>PAGE_SHIFT;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
@@ -831,12 +853,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size-1);
+ last_mapped_pfn = init_memory_mapping(start, start + size);
if (last_mapped_pfn > max_pfn_mapped)
max_pfn_mapped = last_mapped_pfn;
ret = __add_pages(zone, start_pfn, nr_pages);
- WARN_ON(1);
+ WARN_ON_ONCE(ret);
return ret;
}
@@ -878,8 +900,7 @@ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
-
- start_periodic_check_for_corruption();
+ unsigned long absent_pages;
pci_iommu_alloc();
@@ -893,8 +914,9 @@ void __init mem_init(void)
#else
totalram_pages = free_all_bootmem();
#endif
- reservedpages = max_pfn - totalram_pages -
- absent_pages_in_range(0, max_pfn);
+
+ absent_pages = absent_pages_in_range(0, max_pfn);
+ reservedpages = max_pfn - totalram_pages - absent_pages;
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -911,10 +933,11 @@ void __init mem_init(void)
VSYSCALL_END - VSYSCALL_START);
printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
- "%ldk reserved, %ldk data, %ldk init)\n",
+ "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
max_pfn << (PAGE_SHIFT-10),
codesize >> 10,
+ absent_pages << (PAGE_SHIFT-10),
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
new file mode 100644
index 000000000000..d0151d8ce452
--- /dev/null
+++ b/arch/x86/mm/iomap_32.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/iomap.h>
+#include <linux/module.h>
+
+/* Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
+ enum fixed_addresses idx;
+ unsigned long vaddr;
+
+ pagefault_disable();
+
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void*) vaddr;
+}
+EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
+
+void
+iounmap_atomic(void *kvaddr, enum km_type type)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+ enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+ /*
+ * Force other mappings to Oops if they'll try to access this pte
+ * without first remap it. Keeping stale mappings around is a bad idea
+ * also, in case the page changes cacheability attributes or becomes
+ * a protected page in a hypervisor.
+ */
+ if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+
+ arch_flush_lazy_mmu_mode();
+ pagefault_enable();
+}
+EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index e4c43ec71b29..bd85d42819e1 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -220,6 +220,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
+ /*
* Don't allow anybody to remap normal RAM that we're using..
*/
for (pfn = phys_addr >> PAGE_SHIFT;
@@ -381,7 +388,7 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
unsigned long size)
{
unsigned long flags;
- void *ret;
+ void __iomem *ret;
int err;
/*
@@ -393,11 +400,11 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
if (err < 0)
return NULL;
- ret = (void *) __ioremap_caller(phys_addr, size, flags,
- __builtin_return_address(0));
+ ret = __ioremap_caller(phys_addr, size, flags,
+ __builtin_return_address(0));
free_memtype(phys_addr, phys_addr + size);
- return (void __iomem *)ret;
+ return ret;
}
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
@@ -616,7 +623,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
__early_set_fixmap(idx, 0, __pgprot(0));
}
-static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
static int __init check_early_ioremap_leak(void)
{
@@ -639,7 +646,7 @@ static int __init check_early_ioremap_leak(void)
}
late_initcall(check_early_ioremap_leak);
-static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset, last_addr;
unsigned int nrpages;
@@ -707,23 +714,23 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size,
if (early_ioremap_debug)
printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
- prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
+ prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
return prev_map[slot];
}
/* Remap an IO device */
-void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
}
/* Remap memory */
-void __init *early_memremap(unsigned long phys_addr, unsigned long size)
+void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL);
}
-void __init early_iounmap(void *addr, unsigned long size)
+void __init early_iounmap(void __iomem *addr, unsigned long size)
{
unsigned long virt_addr;
unsigned long offset;
@@ -773,7 +780,7 @@ void __init early_iounmap(void *addr, unsigned long size)
--idx;
--nrpages;
}
- prev_map[slot] = 0;
+ prev_map[slot] = NULL;
}
void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 672e17f8262a..9cab18b0b857 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
last_bad += incr;
} else {
if (start_bad) {
- printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
+ printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ reserve_early(start_bad, last_bad + incr, "BAD RAM");
}
start_bad = last_bad = start_phys_aligned;
}
@@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
if (start_bad) {
printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
val, start_bad, last_bad + incr);
- reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ reserve_early(start_bad, last_bad + incr, "BAD RAM");
}
-
}
/* default is disabled */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e85581..2c4baa88f2cb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
-#if 0 /* XXX: no way gather this info anymore */
-/* Access to this is not per-cpu. */
-static DEFINE_PER_CPU(atomic_t, dropped);
-#endif
-
-static struct dentry *marker_file;
-
static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/
@@ -97,44 +90,6 @@ static bool is_enabled(void)
return atomic_read(&mmiotrace_enabled);
}
-#if 0 /* XXX: needs rewrite */
-/*
- * Write callback for the debugfs entry:
- * Read a marker and write it to the mmio trace log
- */
-static ssize_t write_marker(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- char *event = NULL;
- struct mm_io_header *headp;
- ssize_t len = (count > 65535) ? 65535 : count;
-
- event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- headp = (struct mm_io_header *)event;
- headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
- headp->data_len = len;
-
- if (copy_from_user(event + sizeof(*headp), buffer, len)) {
- kfree(event);
- return -EFAULT;
- }
-
- spin_lock_irq(&trace_lock);
-#if 0 /* XXX: convert this to use tracing */
- if (is_enabled())
- relay_write(chan, event, sizeof(*headp) + len);
- else
-#endif
- len = -EINVAL;
- spin_unlock_irq(&trace_lock);
- kfree(event);
- return len;
-}
-#endif
-
static void print_pte(unsigned long address)
{
unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
map.map_id = trace->id;
spin_lock_irq(&trace_lock);
- if (!is_enabled())
+ if (!is_enabled()) {
+ kfree(trace);
goto not_enabled;
+ }
mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
iounmap_trace_core(addr);
}
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
static void clear_trace_list(void)
{
struct remap_trace *trace;
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void)
}
#endif
-#if 0 /* XXX: out of order */
-static struct file_operations fops_marker = {
- .owner = THIS_MODULE,
- .write = write_marker
-};
-#endif
-
void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;
-#if 0 /* XXX: tracing does not support text entries */
- marker_file = debugfs_create_file("marker", 0660, dir, NULL,
- &fops_marker);
- if (!marker_file)
- pr_err(NAME "marker file creation failed.\n");
-#endif
-
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
enter_uniprocessor();
@@ -506,11 +466,6 @@ void disable_mmiotrace(void)
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
- if (marker_file) {
- debugfs_remove(marker_file);
- marker_file = NULL;
- }
-
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 847c164725f4..8518c678d83f 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -222,6 +222,41 @@ static void __init remap_numa_kva(void)
}
}
+#ifdef CONFIG_HIBERNATION
+/**
+ * resume_map_numa_kva - add KVA mapping to the temporary page tables created
+ * during resume from hibernation
+ * @pgd_base - temporary resume page directory
+ */
+void resume_map_numa_kva(pgd_t *pgd_base)
+{
+ int node;
+
+ for_each_online_node(node) {
+ unsigned long start_va, start_pfn, size, pfn;
+
+ start_va = (unsigned long)node_remap_start_vaddr[node];
+ start_pfn = node_remap_start_pfn[node];
+ size = node_remap_size[node];
+
+ printk(KERN_DEBUG "%s: node %d\n", __FUNCTION__, node);
+
+ for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+ unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
+ pgd_t *pgd = pgd_base + pgd_index(vaddr);
+ pud_t *pud = pud_offset(pgd, vaddr);
+ pmd_t *pmd = pmd_offset(pud, vaddr);
+
+ set_pmd(pmd, pfn_pmd(start_pfn + pfn,
+ PAGE_KERNEL_LARGE_EXEC));
+
+ printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
+ __FUNCTION__, vaddr, start_pfn + pfn);
+ }
+ }
+}
+#endif
+
static unsigned long calculate_numa_remap_pages(void)
{
int nid;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a9ec89c3fbca..e89d24815f26 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -65,23 +65,22 @@ static void split_page_count(int level)
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
-int arch_report_meminfo(char *page)
+void arch_report_meminfo(struct seq_file *m)
{
- int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
- n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
direct_pages_count[PG_LEVEL_2M] << 11);
#else
- n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+ seq_printf(m, "DirectMap4M: %8lu kB\n",
direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
if (direct_gbpages)
- n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
direct_pages_count[PG_LEVEL_1G] << 20);
#endif
- return n;
}
#else
static inline void split_page_count(int level) { }
@@ -792,6 +791,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
+ vm_unmap_aliases();
+
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 738fd0f24958..85cbd3cd3723 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -481,12 +481,16 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
return 1;
}
#else
+/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
u64 from = ((u64)pfn) << PAGE_SHIFT;
u64 to = from + size;
u64 cursor = from;
+ if (!pat_enabled)
+ return 1;
+
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
printk(KERN_INFO
@@ -592,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
free_memtype(addr, addr + size);
}
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserved non RAM regions only and after successful reserve_memtype,
+ * this func also keeps identity mapping (if any) in sync with this new prot.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)
+{
+ int is_ram = 0;
+ int id_sz, ret;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ is_ram = pagerange_is_ram(paddr, paddr + size);
+
+ if (is_ram != 0) {
+ /*
+ * For mapping RAM pages, drivers need to call
+ * set_memory_[uc|wc|wb] directly, for reserve and free, before
+ * setting up the PTE.
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+ if (ret)
+ return ret;
+
+ if (flags != want_flags) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_ERR
+ "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ return -EINVAL;
+ }
+
+ /* Need to keep identity mapping in sync */
+ if (paddr >= __pa(high_memory))
+ return 0;
+
+ id_sz = (__pa(high_memory) < paddr + size) ?
+ __pa(high_memory) - paddr :
+ size;
+
+ if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_ERR
+ "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
+ "for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ * Otherwise, we reserve the entire vma range, my ging through the PTEs page
+ * by page to get physical address and protection.
+ */
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /*
+ * reserve the whole chunk covered by vma. We need the
+ * starting address and protection from pte.
+ */
+ if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ return reserve_pfn_range(paddr, vma_size, __pgprot(prot));
+ }
+
+ /* reserve entire vma page by page, using pfn and prot from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+ continue;
+
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot));
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
+ continue;
+
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ *
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ * Otherwise, we look t the pfn and size and reserve only the specified range
+ * page by page.
+ *
+ * Note that this function can be called with caller trying to map only a
+ * subrange/page inside the vma.
+ */
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+ unsigned long pfn, unsigned long size)
+{
+ int retval = 0;
+ unsigned long i, j;
+ resource_size_t base_paddr;
+ resource_size_t paddr;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* reserve the whole chunk starting from vm_pgoff */
+ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return reserve_pfn_range(paddr, vma_size, prot);
+ }
+
+ /* reserve page by page using pfn and size */
+ base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = base_paddr + i;
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ paddr = base_paddr + j;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+/*
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ */
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long i;
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* free the whole chunk starting from vm_pgoff */
+ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ free_pfn_range(paddr, vma_size);
+ return;
+ }
+
+ if (size != 0 && size != vma_size) {
+ /* free page by page, using pfn and size */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = paddr + i;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+ } else {
+ /* free entire vma, page by page, using the pfn from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
+ continue;
+
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+ }
+}
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ if (pat_enabled)
+ return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+ else
+ return pgprot_noncached(prot);
+}
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
/* get Nth element of the linked list */
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20ca..df3d5c861cda 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
static unsigned int mw64[] = { 0x89, 0x8B };
#endif /* not __i386__ */
-static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
- int *rexr)
+struct prefix_bits {
+ unsigned shorted:1;
+ unsigned enlarged:1;
+ unsigned rexr:1;
+ unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
{
int i;
unsigned char *p = addr;
- *shorted = 0;
- *enlarged = 0;
- *rexr = 0;
+ prf->shorted = 0;
+ prf->enlarged = 0;
+ prf->rexr = 0;
+ prf->rex = 0;
restart:
for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
if (*p == prefix_codes[i]) {
if (*p == 0x66)
- *shorted = 1;
+ prf->shorted = 1;
#ifdef __amd64__
if ((*p & 0xf8) == 0x48)
- *enlarged = 1;
+ prf->enlarged = 1;
if ((*p & 0xf4) == 0x44)
- *rexr = 1;
+ prf->rexr = 1;
+ if ((*p & 0xf0) == 0x40)
+ prf->rex = 1;
#endif
p++;
goto restart;
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int shorted, enlarged, rexr;
+ struct prefix_bits prf;
int i;
enum reason_type rv = OTHERS;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
for (i = 0; i < ARRAY_SIZE(rw32); i++)
if (rw32[i] == opcode)
- return (shorted ? 2 : (enlarged ? 8 : 4));
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
for (i = 0; i < ARRAY_SIZE(mw32); i++)
if (mw32[i] == opcode)
- return shorted ? 2 : 4;
+ return prf.shorted ? 2 : 4;
for (i = 0; i < ARRAY_SIZE(mw64); i++)
if (mw64[i] == opcode)
- return shorted ? 2 : (enlarged ? 8 : 4);
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
@@ -238,7 +249,7 @@ enum {
#endif
};
-static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
{
unsigned char *rv = NULL;
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
case arg_DL:
rv = (unsigned char *)&regs->dx;
break;
- case arg_AH:
- rv = 1 + (unsigned char *)&regs->ax;
- break;
- case arg_BH:
- rv = 1 + (unsigned char *)&regs->bx;
- break;
- case arg_CH:
- rv = 1 + (unsigned char *)&regs->cx;
- break;
- case arg_DH:
- rv = 1 + (unsigned char *)&regs->dx;
- break;
#ifdef __amd64__
case arg_R8:
rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
break;
#endif
default:
- printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
break;
}
+
+ if (rv)
+ return rv;
+
+ if (rex) {
+ /*
+ * If REX prefix exists, access low bytes of SI etc.
+ * instead of AH etc.
+ */
+ switch (no) {
+ case arg_SI:
+ rv = (unsigned char *)&regs->si;
+ break;
+ case arg_DI:
+ rv = (unsigned char *)&regs->di;
+ break;
+ case arg_BP:
+ rv = (unsigned char *)&regs->bp;
+ break;
+ case arg_SP:
+ rv = (unsigned char *)&regs->sp;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (no) {
+ case arg_AH:
+ rv = 1 + (unsigned char *)&regs->ax;
+ break;
+ case arg_BH:
+ rv = 1 + (unsigned char *)&regs->bx;
+ break;
+ case arg_CH:
+ rv = 1 + (unsigned char *)&regs->cx;
+ break;
+ case arg_DH:
+ rv = 1 + (unsigned char *)&regs->dx;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!rv)
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
return rv;
}
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
unsigned char mod_rm;
int reg;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
do_work:
mod_rm = *p;
- reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
switch (get_ins_reg_width(ins_addr)) {
case 1:
- return *get_reg_w8(reg, regs);
+ return *get_reg_w8(reg, prf.rex, regs);
case 2:
return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
unsigned char mod_rm;
unsigned char mod;
unsigned char *p;
- int i, shorted, enlarged, rexr;
+ struct prefix_bits prf;
+ int i;
unsigned long rv;
p = (unsigned char *)ins_addr;
- p += skip_prefix(p, &shorted, &enlarged, &rexr);
+ p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
if (imm_wop[i] == opcode) {
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423ef..ab50a8d7402c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
*/
#include <linux/module.h>
#include <linux/io.h>
+#include <linux/mmiotrace.h>
#define MODULE_NAME "testmmiotrace"
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
static void do_write_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
static void do_read_test(void __iomem *p)
{
unsigned int i;
+ mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
ioread8(p + i);
for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
return;
}
+ mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
iounmap(p);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index e2095cba409f..04df67f8a7ba 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -52,8 +52,7 @@ struct frame_head {
unsigned long ret;
} __attribute__((packed));
-static struct frame_head *
-dump_user_backtrace(struct frame_head * head)
+static struct frame_head *dump_user_backtrace(struct frame_head *head)
{
struct frame_head bufhead[2];
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 57f6c9088081..202864ad49a7 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -28,85 +28,9 @@ static struct op_x86_model_spec const *model;
static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-static int nmi_start(void);
-static void nmi_stop(void);
-static void nmi_cpu_start(void *dummy);
-static void nmi_cpu_stop(void *dummy);
-
/* 0 == registered but off, 1 == registered and on */
static int nmi_enabled = 0;
-#ifdef CONFIG_SMP
-static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
- void *data)
-{
- int cpu = (unsigned long)data;
- switch (action) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block oprofile_cpu_nb = {
- .notifier_call = oprofile_cpu_notifier
-};
-#endif
-
-#ifdef CONFIG_PM
-
-static int nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* Only one CPU left, just stop that one */
- if (nmi_enabled == 1)
- nmi_cpu_stop(NULL);
- return 0;
-}
-
-static int nmi_resume(struct sys_device *dev)
-{
- if (nmi_enabled == 1)
- nmi_cpu_start(NULL);
- return 0;
-}
-
-static struct sysdev_class oprofile_sysclass = {
- .name = "oprofile",
- .resume = nmi_resume,
- .suspend = nmi_suspend,
-};
-
-static struct sys_device device_oprofile = {
- .id = 0,
- .cls = &oprofile_sysclass,
-};
-
-static int __init init_sysfs(void)
-{
- int error;
-
- error = sysdev_class_register(&oprofile_sysclass);
- if (!error)
- error = sysdev_register(&device_oprofile);
- return error;
-}
-
-static void exit_sysfs(void)
-{
- sysdev_unregister(&device_oprofile);
- sysdev_class_unregister(&oprofile_sysclass);
-}
-
-#else
-#define init_sysfs() do { } while (0)
-#define exit_sysfs() do { } while (0)
-#endif /* CONFIG_PM */
-
static int profile_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -361,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
return 0;
}
+#ifdef CONFIG_SMP
+static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
+ void *data)
+{
+ int cpu = (unsigned long)data;
+ switch (action) {
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+ break;
+ case CPU_DOWN_PREPARE:
+ smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block oprofile_cpu_nb = {
+ .notifier_call = oprofile_cpu_notifier
+};
+#endif
+
+#ifdef CONFIG_PM
+
+static int nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+ /* Only one CPU left, just stop that one */
+ if (nmi_enabled == 1)
+ nmi_cpu_stop(NULL);
+ return 0;
+}
+
+static int nmi_resume(struct sys_device *dev)
+{
+ if (nmi_enabled == 1)
+ nmi_cpu_start(NULL);
+ return 0;
+}
+
+static struct sysdev_class oprofile_sysclass = {
+ .name = "oprofile",
+ .resume = nmi_resume,
+ .suspend = nmi_suspend,
+};
+
+static struct sys_device device_oprofile = {
+ .id = 0,
+ .cls = &oprofile_sysclass,
+};
+
+static int __init init_sysfs(void)
+{
+ int error;
+
+ error = sysdev_class_register(&oprofile_sysclass);
+ if (!error)
+ error = sysdev_register(&device_oprofile);
+ return error;
+}
+
+static void exit_sysfs(void)
+{
+ sysdev_unregister(&device_oprofile);
+ sysdev_class_unregister(&oprofile_sysclass);
+}
+
+#else
+#define init_sysfs() do { } while (0)
+#define exit_sysfs() do { } while (0)
+#endif /* CONFIG_PM */
+
static int p4force;
module_param(p4force, int, 0);
@@ -406,23 +401,19 @@ static int __init ppro_init(char **cpu_type)
*cpu_type = "i386/pii";
break;
case 6 ... 8:
+ case 10 ... 11:
*cpu_type = "i386/piii";
break;
case 9:
+ case 13:
*cpu_type = "i386/p6_mobile";
break;
- case 10 ... 13:
- *cpu_type = "i386/p6";
- break;
case 14:
*cpu_type = "i386/core";
break;
case 15: case 23:
*cpu_type = "i386/core_2";
break;
- case 26:
- *cpu_type = "i386/core_2";
- break;
default:
/* Unknown */
return 0;
@@ -432,6 +423,16 @@ static int __init ppro_init(char **cpu_type)
return 1;
}
+static int __init arch_perfmon_init(char **cpu_type)
+{
+ if (!cpu_has_arch_perfmon)
+ return 0;
+ *cpu_type = "i386/arch_perfmon";
+ model = &op_arch_perfmon_spec;
+ arch_perfmon_setup_counters();
+ return 1;
+}
+
/* in order to get sysfs right */
static int using_nmi;
@@ -439,7 +440,7 @@ int __init op_nmi_init(struct oprofile_operations *ops)
{
__u8 vendor = boot_cpu_data.x86_vendor;
__u8 family = boot_cpu_data.x86;
- char *cpu_type;
+ char *cpu_type = NULL;
int ret = 0;
if (!cpu_has_apic)
@@ -477,19 +478,20 @@ int __init op_nmi_init(struct oprofile_operations *ops)
switch (family) {
/* Pentium IV */
case 0xf:
- if (!p4_init(&cpu_type))
- return -ENODEV;
+ p4_init(&cpu_type);
break;
/* A P6-class processor */
case 6:
- if (!ppro_init(&cpu_type))
- return -ENODEV;
+ ppro_init(&cpu_type);
break;
default:
- return -ENODEV;
+ break;
}
+
+ if (!cpu_type && !arch_perfmon_init(&cpu_type))
+ return -ENODEV;
break;
default:
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 2880b15c4675..91b6a116165e 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -6,22 +6,22 @@
*
* @author John Levon
*/
-
+
#ifndef OP_COUNTER_H
#define OP_COUNTER_H
-
+
#define OP_MAX_COUNTER 8
-
+
/* Per-perfctr configuration as set via
* oprofilefs.
*/
struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
+ unsigned long count;
+ unsigned long enabled;
+ unsigned long event;
+ unsigned long kernel;
+ unsigned long user;
+ unsigned long unit_mask;
};
extern struct op_counter_config counter_config[];
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index d9faf607b3a6..98658f25f542 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -65,10 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS];
#define IBS_FETCH_BEGIN 3
#define IBS_OP_BEGIN 4
-/* The function interface needs to be fixed, something like add
- data. Should then be added to linux/oprofile.h. */
-extern void oprofile_add_ibs_sample(struct pt_regs *const regs,
- unsigned int * const ibs_sample, u8 code);
+/*
+ * The function interface needs to be fixed, something like add
+ * data. Should then be added to linux/oprofile.h.
+ */
+extern void
+oprofile_add_ibs_sample(struct pt_regs * const regs,
+ unsigned int * const ibs_sample, int ibs_code);
struct ibs_fetch_sample {
/* MSRC001_1031 IBS Fetch Linear Address Register */
@@ -103,11 +106,6 @@ struct ibs_op_sample {
unsigned int ibs_dc_phys_high;
};
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
-*/
-static void clear_ibs_nmi(void);
-
static int ibs_allowed; /* AMD Family10h and later */
struct op_ibs_config {
@@ -222,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
(unsigned int *)&ibs_fetch,
IBS_FETCH_BEGIN);
- /*reenable the IRQ */
+ /* reenable the IRQ */
rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
high &= ~IBS_FETCH_HIGH_VALID_BIT;
high |= IBS_FETCH_HIGH_ENABLE;
@@ -309,12 +307,15 @@ static void op_amd_start(struct op_msrs const * const msrs)
#ifdef CONFIG_OPROFILE_IBS
if (ibs_allowed && ibs_config.fetch_enabled) {
low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
- high = IBS_FETCH_HIGH_ENABLE;
+ high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
+ + IBS_FETCH_HIGH_ENABLE;
wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
}
if (ibs_allowed && ibs_config.op_enabled) {
- low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE;
+ low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
+ + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
+ + IBS_OP_LOW_ENABLE;
high = 0;
wrmsr(MSR_AMD64_IBSOPCTL, low, high);
}
@@ -327,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- /* Subtle: stop on all counters to avoid race with
- * setting our pm callback */
+ /*
+ * Subtle: stop on all counters to avoid race with setting our
+ * pm callback
+ */
for (i = 0 ; i < NUM_COUNTERS ; ++i) {
if (!reset_value[i])
continue;
@@ -339,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs)
#ifdef CONFIG_OPROFILE_IBS
if (ibs_allowed && ibs_config.fetch_enabled) {
- low = 0; /* clear max count and enable */
+ /* clear max count and enable */
+ low = 0;
high = 0;
wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
}
if (ibs_allowed && ibs_config.op_enabled) {
- low = 0; /* clear max count and enable */
+ /* clear max count and enable */
+ low = 0;
high = 0;
wrmsr(MSR_AMD64_IBSOPCTL, low, high);
}
@@ -366,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
}
}
-#ifndef CONFIG_OPROFILE_IBS
-
-/* no IBS support */
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- return 0;
-}
-
-static void op_amd_exit(void) {}
-
-#else
+#ifdef CONFIG_OPROFILE_IBS
static u8 ibs_eilvt_off;
@@ -391,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
}
-static int pfm_amd64_setup_eilvt(void)
+static int init_ibs_nmi(void)
{
#define IBSCTL_LVTOFFSETVAL (1 << 8)
#define IBSCTL 0x1cc
@@ -439,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void)
return 0;
}
-/*
- * initialize the APIC for the IBS interrupts
- * if available (AMD Family10h rev B0 and later)
- */
-static void setup_ibs(void)
+/* uninitialize the APIC for the IBS interrupts if needed */
+static void clear_ibs_nmi(void)
+{
+ if (ibs_allowed)
+ on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+}
+
+/* initialize the APIC for the IBS interrupts if available */
+static void ibs_init(void)
{
ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
if (!ibs_allowed)
return;
- if (pfm_amd64_setup_eilvt()) {
+ if (init_ibs_nmi()) {
ibs_allowed = 0;
return;
}
@@ -458,21 +456,18 @@ static void setup_ibs(void)
printk(KERN_INFO "oprofile: AMD IBS detected\n");
}
-
-/*
- * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
- * rev B0 and later */
-static void clear_ibs_nmi(void)
+static void ibs_exit(void)
{
- if (ibs_allowed)
- on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+ if (!ibs_allowed)
+ return;
+
+ clear_ibs_nmi();
}
-static int (*create_arch_files)(struct super_block * sb, struct dentry * root);
+static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
-static int setup_ibs_files(struct super_block * sb, struct dentry * root)
+static int setup_ibs_files(struct super_block *sb, struct dentry *root)
{
- char buf[12];
struct dentry *dir;
int ret = 0;
@@ -494,29 +489,29 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root)
ibs_config.max_cnt_op = 250000;
ibs_config.op_enabled = 0;
ibs_config.dispatched_ops = 1;
- snprintf(buf, sizeof(buf), "ibs_fetch");
- dir = oprofilefs_mkdir(sb, root, buf);
- oprofilefs_create_ulong(sb, dir, "rand_enable",
- &ibs_config.rand_en);
+
+ dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
oprofilefs_create_ulong(sb, dir, "enable",
- &ibs_config.fetch_enabled);
+ &ibs_config.fetch_enabled);
oprofilefs_create_ulong(sb, dir, "max_count",
- &ibs_config.max_cnt_fetch);
- snprintf(buf, sizeof(buf), "ibs_uops");
- dir = oprofilefs_mkdir(sb, root, buf);
+ &ibs_config.max_cnt_fetch);
+ oprofilefs_create_ulong(sb, dir, "rand_enable",
+ &ibs_config.rand_en);
+
+ dir = oprofilefs_mkdir(sb, root, "ibs_op");
oprofilefs_create_ulong(sb, dir, "enable",
- &ibs_config.op_enabled);
+ &ibs_config.op_enabled);
oprofilefs_create_ulong(sb, dir, "max_count",
- &ibs_config.max_cnt_op);
+ &ibs_config.max_cnt_op);
oprofilefs_create_ulong(sb, dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
+ &ibs_config.dispatched_ops);
return 0;
}
static int op_amd_init(struct oprofile_operations *ops)
{
- setup_ibs();
+ ibs_init();
create_arch_files = ops->create_files;
ops->create_files = setup_ibs_files;
return 0;
@@ -524,20 +519,31 @@ static int op_amd_init(struct oprofile_operations *ops)
static void op_amd_exit(void)
{
- clear_ibs_nmi();
+ ibs_exit();
}
-#endif
+#else
+
+/* no IBS support */
+
+static int op_amd_init(struct oprofile_operations *ops)
+{
+ return 0;
+}
+
+static void op_amd_exit(void) {}
+
+#endif /* CONFIG_OPROFILE_IBS */
struct op_x86_model_spec const op_amd_spec = {
- .init = op_amd_init,
- .exit = op_amd_exit,
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_CONTROLS,
- .fill_in_addresses = &op_amd_fill_in_addresses,
- .setup_ctrs = &op_amd_setup_ctrs,
- .check_ctrs = &op_amd_check_ctrs,
- .start = &op_amd_start,
- .stop = &op_amd_stop,
- .shutdown = &op_amd_shutdown
+ .init = op_amd_init,
+ .exit = op_amd_exit,
+ .num_counters = NUM_COUNTERS,
+ .num_controls = NUM_CONTROLS,
+ .fill_in_addresses = &op_amd_fill_in_addresses,
+ .setup_ctrs = &op_amd_setup_ctrs,
+ .check_ctrs = &op_amd_check_ctrs,
+ .start = &op_amd_start,
+ .stop = &op_amd_stop,
+ .shutdown = &op_amd_shutdown
};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 43ac5af338d8..4c4a51c90bc2 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -698,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs)
#ifdef CONFIG_SMP
struct op_x86_model_spec const op_p4_ht2_spec = {
- .num_counters = NUM_COUNTERS_HT2,
- .num_controls = NUM_CONTROLS_HT2,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
+ .num_counters = NUM_COUNTERS_HT2,
+ .num_controls = NUM_CONTROLS_HT2,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
};
#endif
struct op_x86_model_spec const op_p4_spec = {
- .num_counters = NUM_COUNTERS_NON_HT,
- .num_controls = NUM_CONTROLS_NON_HT,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
+ .num_counters = NUM_COUNTERS_NON_HT,
+ .num_controls = NUM_CONTROLS_NON_HT,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index eff431f6c57b..e9f80c744cf3 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -1,32 +1,33 @@
/*
* @file op_model_ppro.h
- * pentium pro / P6 model-specific MSR operations
+ * Family 6 perfmon and architectural perfmon MSR operations
*
* @remark Copyright 2002 OProfile authors
+ * @remark Copyright 2008 Intel Corporation
* @remark Read the file COPYING
*
* @author John Levon
* @author Philippe Elie
* @author Graydon Hoare
+ * @author Andi Kleen
*/
#include <linux/oprofile.h>
+#include <linux/slab.h>
#include <asm/ptrace.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/intel_arch_perfmon.h>
#include "op_x86_model.h"
#include "op_counter.h"
-#define NUM_COUNTERS 2
-#define NUM_CONTROLS 2
+static int num_counters = 2;
+static int counter_width = 32;
#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
-#define CTR_32BIT_WRITE(l, msrs, c) \
- do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
-#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
+#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1))))
#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
@@ -40,20 +41,20 @@
#define CTRL_SET_UM(val, m) (val |= (m << 8))
#define CTRL_SET_EVENT(val, e) (val |= e)
-static unsigned long reset_value[NUM_COUNTERS];
+static u64 *reset_value;
static void ppro_fill_in_addresses(struct op_msrs * const msrs)
{
int i;
- for (i = 0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
else
msrs->counters[i].addr = 0;
}
- for (i = 0; i < NUM_CONTROLS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
else
@@ -67,8 +68,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
+ if (!reset_value) {
+ reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,
+ GFP_ATOMIC);
+ if (!reset_value)
+ return;
+ }
+
+ if (cpu_has_arch_perfmon) {
+ union cpuid10_eax eax;
+ eax.full = cpuid_eax(0xa);
+ if (counter_width < eax.split.bit_width)
+ counter_width = eax.split.bit_width;
+ }
+
/* clear all counters */
- for (i = 0 ; i < NUM_CONTROLS; ++i) {
+ for (i = 0 ; i < num_counters; ++i) {
if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
CTRL_READ(low, high, msrs, i);
@@ -77,18 +92,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
}
/* avoid a false detection of ctr overflows in NMI handler */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (unlikely(!CTR_IS_RESERVED(msrs, i)))
continue;
- CTR_32BIT_WRITE(1, msrs, i);
+ wrmsrl(msrs->counters[i].addr, -1LL);
}
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
- CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
CTRL_READ(low, high, msrs, i);
CTRL_CLEAR(low);
@@ -108,16 +123,16 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
static int ppro_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
- unsigned int low, high;
+ u64 val;
int i;
- for (i = 0 ; i < NUM_COUNTERS; ++i) {
+ for (i = 0 ; i < num_counters; ++i) {
if (!reset_value[i])
continue;
- CTR_READ(low, high, msrs, i);
- if (CTR_OVERFLOWED(low)) {
+ rdmsrl(msrs->counters[i].addr, val);
+ if (CTR_OVERFLOWED(val)) {
oprofile_add_sample(regs, i);
- CTR_32BIT_WRITE(reset_value[i], msrs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
}
}
@@ -141,7 +156,9 @@ static void ppro_start(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!reset_value)
+ return;
+ for (i = 0; i < num_counters; ++i) {
if (reset_value[i]) {
CTRL_READ(low, high, msrs, i);
CTRL_SET_ACTIVE(low);
@@ -156,7 +173,9 @@ static void ppro_stop(struct op_msrs const * const msrs)
unsigned int low, high;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ if (!reset_value)
+ return;
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[i])
continue;
CTRL_READ(low, high, msrs, i);
@@ -169,24 +188,70 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0 ; i < NUM_COUNTERS ; ++i) {
+ for (i = 0 ; i < num_counters ; ++i) {
if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
}
- for (i = 0 ; i < NUM_CONTROLS ; ++i) {
+ for (i = 0 ; i < num_counters ; ++i) {
if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
}
+ if (reset_value) {
+ kfree(reset_value);
+ reset_value = NULL;
+ }
}
-struct op_x86_model_spec const op_ppro_spec = {
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_CONTROLS,
- .fill_in_addresses = &ppro_fill_in_addresses,
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
+struct op_x86_model_spec op_ppro_spec = {
+ .num_counters = 2, /* can be overriden */
+ .num_controls = 2, /* dito */
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
+};
+
+/*
+ * Architectural performance monitoring.
+ *
+ * Newer Intel CPUs (Core1+) have support for architectural
+ * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
+ * The advantage of this is that it can be done without knowing about
+ * the specific CPU.
+ */
+
+void arch_perfmon_setup_counters(void)
+{
+ union cpuid10_eax eax;
+
+ eax.full = cpuid_eax(0xa);
+
+ /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
+ if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
+ current_cpu_data.x86_model == 15) {
+ eax.split.version_id = 2;
+ eax.split.num_counters = 2;
+ eax.split.bit_width = 40;
+ }
+
+ num_counters = eax.split.num_counters;
+
+ op_arch_perfmon_spec.num_counters = num_counters;
+ op_arch_perfmon_spec.num_controls = num_counters;
+ op_ppro_spec.num_counters = num_counters;
+ op_ppro_spec.num_controls = num_counters;
+}
+
+struct op_x86_model_spec op_arch_perfmon_spec = {
+ /* num_counters/num_controls filled in at runtime */
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ /* user space does the cpuid check for available events */
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 05a0261ba0c3..825e79064d64 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -22,8 +22,8 @@ struct op_msr {
};
struct op_msrs {
- struct op_msr * counters;
- struct op_msr * controls;
+ struct op_msr *counters;
+ struct op_msr *controls;
};
struct pt_regs;
@@ -34,8 +34,8 @@ struct pt_regs;
struct op_x86_model_spec {
int (*init)(struct oprofile_operations *ops);
void (*exit)(void);
- unsigned int const num_counters;
- unsigned int const num_controls;
+ unsigned int num_counters;
+ unsigned int num_controls;
void (*fill_in_addresses)(struct op_msrs * const msrs);
void (*setup_ctrs)(struct op_msrs const * const msrs);
int (*check_ctrs)(struct pt_regs * const regs,
@@ -45,9 +45,12 @@ struct op_x86_model_spec {
void (*shutdown)(struct op_msrs const * const msrs);
};
-extern struct op_x86_model_spec const op_ppro_spec;
+extern struct op_x86_model_spec op_ppro_spec;
extern struct op_x86_model_spec const op_p4_spec;
extern struct op_x86_model_spec const op_p4_ht2_spec;
extern struct op_x86_model_spec const op_amd_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
+
+extern void arch_perfmon_setup_counters(void);
#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index b67732bbb85a..bb1a01f089e2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -23,6 +23,12 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
int pci_routeirq;
+int noioapicquirk;
+#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+int noioapicreroute = 0;
+#else
+int noioapicreroute = 1;
+#endif
int pcibios_last_bus = -1;
unsigned long pirq_table_addr;
struct pci_bus *pci_root_bus;
@@ -519,6 +525,17 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "skip_isa_align")) {
pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
return NULL;
+ } else if (!strcmp(str, "noioapicquirk")) {
+ noioapicquirk = 1;
+ return NULL;
+ } else if (!strcmp(str, "ioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 0;
+ return NULL;
+ } else if (!strcmp(str, "noioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 1;
+ return NULL;
}
return str;
}
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9915293500fb..9a5af6c8fbe9 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
#undef PCI_CONF2_ADDRESS
-static struct pci_raw_ops pci_direct_conf2 = {
+struct pci_raw_ops pci_direct_conf2 = {
.read = pci_conf2_read,
.write = pci_conf2_write,
};
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
if (pci_check_type1()) {
raw_pci_ops = &pci_direct_conf1;
+ port_cf9_safe = true;
return 1;
}
release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
if (pci_check_type2()) {
raw_pci_ops = &pci_direct_conf2;
+ port_cf9_safe = true;
return 2;
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 3c27a809393b..2051dc96b8e9 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -496,21 +496,24 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
pci_siemens_interrupt_controller);
/*
- * Regular PCI devices have 256 bytes, but AMD Family 10h Opteron ext config
- * have 4096 bytes. Even if the device is capable, that doesn't mean we can
- * access it. Maybe we don't have a way to generate extended config space
- * accesses. So check it
+ * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have
+ * 4096 bytes configuration space for each function of their processor
+ * configuration space.
*/
-static void fam10h_pci_cfg_space_size(struct pci_dev *dev)
+static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev)
{
dev->cfg_size = pci_cfg_space_size_ext(dev);
}
-
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size);
/*
* SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 006599db0dc7..bf69dbe08bff 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
if (pirq <= 4)
irq = read_config_nybble(router, 0x56, pirq - 1);
dev_info(&dev->dev,
- "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
+ "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
dev->vendor, dev->device, pirq, irq);
return irq;
}
@@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
{
dev_info(&dev->dev,
- "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
+ "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
dev->vendor, dev->device, pirq, irq);
if (pirq <= 4)
write_config_nybble(router, 0x56, pirq - 1, irq);
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
- case PCI_DEVICE_ID_INTEL_PCH_0:
- case PCI_DEVICE_ID_INTEL_PCH_1:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
return 1;
}
+
+ if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
+ (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
+ }
+
return 0;
}
@@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r)
r->get = NULL;
r->set = NULL;
- DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
rt->rtr_vendor, rt->rtr_device);
pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
@@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r)
h->probe(r, pirq_router_dev, pirq_router_dev->device))
break;
}
- dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
pirq_router.name,
pirq_router_dev->vendor, pirq_router_dev->device);
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 15b9cf6be729..1959018aac02 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops;
extern struct pci_raw_ops *raw_pci_ext_ops;
extern struct pci_raw_ops pci_direct_conf1;
+extern bool port_cf9_safe;
/* arch_initcall level */
extern int pci_direct_probe(void);
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index f2b6e3f11bfc..81197c62d5b3 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -12,6 +12,7 @@
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/mmzone.h>
/* Defined in hibernate_asm_32.S */
extern int restore_image(void);
@@ -127,6 +128,9 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
}
}
}
+
+ resume_map_numa_kva(pgd_base);
+
return 0;
}
diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols
new file mode 100644
index 000000000000..a2f1ccb827c7
--- /dev/null
+++ b/arch/x86/scripts/strip-symbols
@@ -0,0 +1 @@
+__cpu_vendor_dev_X86_VENDOR_*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
* Also alternative() doesn't work.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/kernel.h>
#include <linux/posix-timers.h>
#include <linux/time.h>
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 513f330c5832..1241f118ab56 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -310,7 +310,7 @@ int __init sysenter_setup(void)
}
/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 257ba4a10abf..9c98cc6ba978 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
/* Setup a VMA at program startup for the vsyscall page.
Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 313947940a1a..6dcefba7836f 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
-ifdef CONFIG_FTRACE
+ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
CFLAGS_REMOVE_time.o = -pg
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0013a729b41d..bea215230b20 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
#include <linux/console.h>
#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/features.h>
@@ -793,7 +794,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
ret = 0;
- switch(msr) {
+ switch (msr) {
#ifdef CONFIG_X86_64
unsigned which;
u64 base;
@@ -863,14 +864,16 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
+ vm_unmap_aliases();
if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- } else
+ } else {
/* make sure there are no stray mappings of
this page */
kmap_flush_unused();
+ }
}
}
@@ -1451,7 +1454,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
ident_pte = 0;
pfn = 0;
- for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
pte_t *pte_page;
/* Reuse or allocate a page of ptes */
@@ -1469,7 +1472,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
}
/* Install mappings */
- for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
pte_t pte;
if (pfn > max_pfn_mapped)
@@ -1483,7 +1486,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
}
}
- for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
set_page_prot(pmd, PAGE_KERNEL_RO);
@@ -1497,7 +1500,7 @@ static void convert_pfn_mfn(void *v)
/* All levels are converted the same way, so just treat them
as ptes. */
- for(i = 0; i < PTRS_PER_PTE; i++)
+ for (i = 0; i < PTRS_PER_PTE; i++)
pte[i] = xen_make_pte(pte[i].pte);
}
@@ -1512,7 +1515,8 @@ static void convert_pfn_mfn(void *v)
* of the physical mapping once some sort of allocator has been set
* up.
*/
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
{
pud_t *l3;
pmd_t *l2;
@@ -1575,7 +1579,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
#else /* !CONFIG_X86_64 */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
{
pmd_t *kernel_pmd;
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 28b85ab8422e..bb042608c602 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void)
static void __init __xen_init_IRQ(void)
{
-#ifdef CONFIG_X86_64
int i;
/* Create identity vector->irq map */
@@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void)
for_each_possible_cpu(cpu)
per_cpu(vector_irq, cpu)[i] = i;
}
-#endif /* CONFIG_X86_64 */
xen_init_IRQ();
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ae173f6edd8b..773d68d3e912 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void)
{
unsigned pfn, idx;
- for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
}
- for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+ for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
}
@@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned pfn;
- for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top[topidx] = &mfn_list[pfn];
@@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
BUG_ON(p == NULL);
- for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+ for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
p[i] = INVALID_P2M_ENTRY;
if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@ -246,11 +246,21 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
unsigned long address = (unsigned long)vaddr;
unsigned int level;
- pte_t *pte = lookup_address(address, &level);
- unsigned offset = address & ~PAGE_MASK;
+ pte_t *pte;
+ unsigned offset;
- BUG_ON(pte == NULL);
+ /*
+ * if the PFN is in the linear mapped vaddr range, we can just use
+ * the (quick) virt_to_machine() p2m lookup
+ */
+ if (virt_addr_valid(vaddr))
+ return virt_to_machine(vaddr);
+ /* otherwise we have to do a (slower) full page-table walk */
+
+ pte = lookup_address(address, &level);
+ BUG_ON(pte == NULL);
+ offset = address & ~PAGE_MASK;
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
@@ -397,7 +407,8 @@ out:
preempt_enable();
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
return *ptep;
@@ -410,7 +421,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
xen_mc_batch();
- u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
@@ -651,12 +662,11 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
* For 64-bit, we must skip the Xen hole in the middle of the address
* space, just after the big x86-64 virtual hole.
*/
-static int xen_pgd_walk(struct mm_struct *mm,
- int (*func)(struct mm_struct *mm, struct page *,
- enum pt_level),
- unsigned long limit)
+static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
{
- pgd_t *pgd = mm->pgd;
int flush = 0;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -743,6 +753,14 @@ out:
return flush;
}
+static int xen_pgd_walk(struct mm_struct *mm,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
+ return __xen_pgd_walk(mm, mm->pgd, func, limit);
+}
+
/* If we're using split pte locks, then take the page's lock and
return a pointer to it. Otherwise return NULL. */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
@@ -840,12 +858,16 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
+ vm_unmap_aliases();
+
xen_mc_batch();
- if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
- /* re-enable interrupts for kmap_flush_unused */
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
+ /* re-enable interrupts for flushing */
xen_mc_issue(0);
+
kmap_flush_unused();
+
xen_mc_batch();
}
@@ -857,13 +879,14 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
if (user_pgd) {
xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
- xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is pinnable */
- xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+ xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
PT_PMD);
#endif
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
@@ -972,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd) {
- xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
}
}
@@ -980,11 +1004,11 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is unpinned */
- xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+ xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
PT_PMD);
#endif
- xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
+ __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8ea8a0d0b0de..c738644b5435 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -154,7 +154,7 @@ void xen_mc_flush(void)
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
- printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d67901083888..15c6c68db6a2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -28,6 +28,9 @@
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+extern void xen_sysenter_target(void);
+extern void xen_syscall_target(void);
+extern void xen_syscall32_target(void);
/**
@@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func)
void __cpuinit xen_enable_sysenter(void)
{
- extern void xen_sysenter_target(void);
int ret;
unsigned sysenter_feature;
@@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
- extern void xen_syscall_target(void);
- extern void xen_syscall32_target(void);
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
@@ -160,7 +160,8 @@ void __init xen_arch_setup(void)
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d77da613b1d2..acd9b6705e02 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -362,7 +362,7 @@ static void xen_cpu_die(unsigned int cpu)
alternatives_smp_switch(0);
}
-static void xen_play_dead(void)
+static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index dd71e3a021cd..5601506f2dd9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
- kstat_this_cpu.irqs[irq]++;
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
raw_local_irq_restore(flags);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 004ba86326ae..c9f7cda48ed7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
- u64 xen_khz = 1000000ULL << 32;
- const struct pvclock_vcpu_time_info *info =
+ struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
- do_div(xen_khz, info->tsc_to_system_mul);
- if (info->tsc_shift < 0)
- xen_khz <<= -info->tsc_shift;
- else
- xen_khz >>= info->tsc_shift;
-
- return xen_khz;
+ return pvclock_tsc_khz(info);
}
cycle_t xen_clocksource_read(void)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index d7422dc2a55c..9e1afae8461f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -49,7 +49,7 @@ bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
-void __init xen_setup_vcpu_info_placement(void);
+void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);