Update from upstream with manual merge of Yasunori Goto's

changes to swiotlb.c made in commit 281dd25cdc0d6903929b79183816d151ea626341 since this file has been moved from arch/ia64/lib/swiotlb.c to lib/swiotlb.c Signed-off-by: Tony Luck <tony.luck@intel.com>
author: Tony Luck 2005-10-20 19:41:44 +0200
committer: Tony Luck 2005-10-20 19:41:44 +0200
commit: 9cec58dc138d6fcad9f447a19c8ff69f6540e667 (patch)
tree: 4fe1cca94fdba8b705c87615bee06d3346f687ce /arch/x86_64
parent: [PATCH] Removed remaining PCI specific references from swiotlb.c (diff)
parent: [PATCH] Fix handling spurious page fault for hugetlb region (diff)
download: kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.tar.gz
kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.tar.xz
kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.zip
17 files changed, 204 insertions, 106 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index e63323e03ea9..21afa69a086d 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -277,11 +277,6 @@ source "mm/Kconfig"
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
 
-config HAVE_DEC_LOCK
-	bool
-	depends on SMP
-	default y
-
 config NR_CPUS
 	int "Maximum number of CPUs (2-256)"
 	range 2 256
@@ -313,7 +308,7 @@ config HPET_TIMER
 	  present.  The HPET provides a stable time base on SMP
 	  systems, unlike the TSC, but it is more expensive to access,
 	  as it is off-chip.  You can find the HPET spec at
-	  <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
+	  <http://www.intel.com/hardwaredesign/hpetspec.htm>.
 
 config X86_PM_TIMER
 	bool "PM timer"
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 66e2821533db..0903cc1faef2 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -425,7 +425,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
 		rsp = (unsigned long) ka->sa.sa_restorer;
 	}
 
-	return (void __user *)((rsp - frame_size) & -8UL);
+	rsp -= frame_size;
+	/* Align the stack pointer according to the i386 ABI,
+	 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+	rsp = ((rsp + 4) & -16ul) - 4;
+	return (void __user *) rsp;
 }
 
 int ia32_setup_frame(int sig, struct k_sigaction *ka,
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 4e34b0f9d613..ab3f87aaff70 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -17,6 +17,8 @@
 #include <linux/ioport.h>
 #include <linux/string.h>
 #include <linux/kexec.h>
+#include <linux/module.h>
+
 #include <asm/page.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 4592bf21fcaf..b92e5f45ed46 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -270,26 +270,26 @@ ENTRY(level3_kernel_pgt)
 .org 0x4000
 ENTRY(level2_ident_pgt)
 	/* 40MB for bootup. 	*/
-	.quad	0x0000000000000183
-	.quad	0x0000000000200183
-	.quad	0x0000000000400183
-	.quad	0x0000000000600183
-	.quad	0x0000000000800183
-	.quad	0x0000000000A00183
-	.quad	0x0000000000C00183
-	.quad	0x0000000000E00183
-	.quad	0x0000000001000183
-	.quad	0x0000000001200183
-	.quad	0x0000000001400183
-	.quad	0x0000000001600183
-	.quad	0x0000000001800183
-	.quad	0x0000000001A00183
-	.quad	0x0000000001C00183
-	.quad	0x0000000001E00183
-	.quad	0x0000000002000183
-	.quad	0x0000000002200183
-	.quad	0x0000000002400183
-	.quad	0x0000000002600183
+	.quad	0x0000000000000083
+	.quad	0x0000000000200083
+	.quad	0x0000000000400083
+	.quad	0x0000000000600083
+	.quad	0x0000000000800083
+	.quad	0x0000000000A00083
+	.quad	0x0000000000C00083
+	.quad	0x0000000000E00083
+	.quad	0x0000000001000083
+	.quad	0x0000000001200083
+	.quad	0x0000000001400083
+	.quad	0x0000000001600083
+	.quad	0x0000000001800083
+	.quad	0x0000000001A00083
+	.quad	0x0000000001C00083
+	.quad	0x0000000001E00083
+	.quad	0x0000000002000083
+	.quad	0x0000000002200083
+	.quad	0x0000000002400083
+	.quad	0x0000000002600083
 	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
 	.globl temp_boot_pmds
 temp_boot_pmds:
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index df08c43276a0..76a28b007be9 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -77,9 +77,9 @@ static inline int is_IF_modifier(kprobe_opcode_t *insn)
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	/* insn: must be on special executable page on x86_64. */
-	up(&kprobe_mutex);
-	p->ainsn.insn = get_insn_slot();
 	down(&kprobe_mutex);
+	p->ainsn.insn = get_insn_slot();
+	up(&kprobe_mutex);
 	if (!p->ainsn.insn) {
 		return -ENOMEM;
 	}
@@ -231,9 +231,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	up(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
 	down(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	up(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(void)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 08203b07f4bd..69541db5ff2c 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -54,9 +54,12 @@ void mce_log(struct mce *mce)
 {
 	unsigned next, entry;
 	mce->finished = 0;
-	smp_wmb();
+	wmb();
 	for (;;) {
 		entry = rcu_dereference(mcelog.next);
+		/* The rmb forces the compiler to reload next in each
+		    iteration */
+		rmb();
 		for (;;) {
 			/* When the buffer fills up discard new entries. Assume
 			   that the earlier errors are the more interesting. */
@@ -69,6 +72,7 @@ void mce_log(struct mce *mce)
 				entry++;
 				continue;
 			}
+			break;
 		}
 		smp_rmb();
 		next = entry + 1;
@@ -76,9 +80,9 @@ void mce_log(struct mce *mce)
 			break;
 	}
 	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-	smp_wmb();
+	wmb();
 	mcelog.entry[entry].finished = 1;
-	smp_wmb();
+	wmb();
 
 	if (!test_and_set_bit(0, &console_logged))
 		notify_user = 1;
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 351d8d64c2fb..cb28df14ff6f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -836,6 +836,23 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	int r;
 	int level;
 
+#ifdef CONFIG_SMP
+	unsigned long value;
+
+	/*
+	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
+	 * bit 6 of msr C001_0015
+ 	 *
+	 * Errata 63 for SH-B3 steppings
+	 * Errata 122 for all steppings (F+ have it disabled by default)
+	 */
+	if (c->x86 == 15) {
+		rdmsrl(MSR_K8_HWCR, value);
+		value |= 1 << 6;
+		wrmsrl(MSR_K8_HWCR, value);
+	}
+#endif
+
 	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
 	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
 	clear_bit(0*32+31, &c->x86_capability);
@@ -950,13 +967,12 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
 static void srat_detect_node(void)
 {
 #ifdef CONFIG_NUMA
-	unsigned apicid, node;
+	unsigned node;
 	int cpu = smp_processor_id();
 
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
-	apicid = phys_proc_id[cpu];
-	node = apicid_to_node[apicid];
+	node = apicid_to_node[hard_smp_processor_id()];
 	if (node == NUMA_NO_NODE)
 		node = 0;
 	cpu_to_node[cpu] = node;
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index bd33be24a386..79190891fbc5 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -87,6 +87,10 @@ void __init setup_per_cpu_areas(void)
 	int i;
 	unsigned long size;
 
+#ifdef CONFIG_HOTPLUG_CPU
+	prefill_possible_map();
+#endif
+
 	/* Copy section for each CPU (we discard the original) */
 	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
 #ifdef CONFIG_MODULES
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index e12d7baeb33e..658a81b33f3b 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -892,7 +892,7 @@ static __init void disable_smp(void)
  * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
  * - Ashok Raj
  */
-static void prefill_possible_map(void)
+__init void prefill_possible_map(void)
 {
 	int i;
 	for (i = 0; i < NR_CPUS; i++)
@@ -967,10 +967,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	current_cpu_data = boot_cpu_data;
 	current_thread_info()->cpu = 0;  /* needed? */
 
-#ifdef CONFIG_HOTPLUG_CPU
-	prefill_possible_map();
-#endif
-
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
 		disable_smp();
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebb9abf3ce6d..f066c6ab3618 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -11,6 +11,8 @@
 #include <linux/smp.h>
 #include <linux/suspend.h>
 #include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
 
 struct saved_context saved_context;
 
@@ -140,4 +142,129 @@ void fix_processor_context(void)
 
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
 
+pgd_t *temp_level4_pgt;
+
+static void **pages;
+
+static inline void *__add_page(void)
+{
+	void **c;
+
+	c = (void **)get_usable_page(GFP_ATOMIC);
+	if (c) {
+		*c = pages;
+		pages = c;
+	}
+	return c;
+}
+
+static inline void *__next_page(void)
+{
+	void **c;
+
+	c = pages;
+	if (c) {
+		pages = *c;
+		*c = NULL;
+	}
+	return c;
+}
+
+/*
+ * Try to allocate as many usable pages as needed and daisy chain them.
+ * If one allocation fails, free the pages allocated so far
+ */
+static int alloc_usable_pages(unsigned long n)
+{
+	void *p;
+
+	pages = NULL;
+	do
+		if (!__add_page())
+			break;
+	while (--n);
+	if (n) {
+		p = __next_page();
+		while (p) {
+			free_page((unsigned long)p);
+			p = __next_page();
+		}
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+	long i, j;
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		unsigned long paddr;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end)
+			break;
+
+		pmd = (pmd_t *)__next_page();
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end)
+				break;
+			pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+	}
+}
+
+static void set_up_temporary_mappings(void)
+{
+	unsigned long start, end, next;
+
+	temp_level4_pgt = (pgd_t *)__next_page();
+
+	/* It is safe to reuse the original kernel mapping */
+	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+	/* Set up the direct mapping from scratch */
+	start = (unsigned long)pfn_to_kaddr(0);
+	end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+	for (; start < end; start = next) {
+		pud_t *pud = (pud_t *)__next_page();
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+		res_phys_pud_init(pud, __pa(start), __pa(next));
+		set_pgd(temp_level4_pgt + pgd_index(start),
+			mk_kernel_pgd(__pa(pud)));
+	}
+}
+
+int swsusp_arch_resume(void)
+{
+	unsigned long n;
+
+	n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT;
+	n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1;
+	pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n);
+	if (alloc_usable_pages(n)) {
+		free_eaten_memory();
+		return -ENOMEM;
+	}
+	/* We have got enough memory and from now on we cannot recover */
+	set_up_temporary_mappings();
+	restore_image();
+	return 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 4d659e97df10..320b6fb00cca 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend)
 	call swsusp_save
 	ret
 
-ENTRY(swsusp_arch_resume)
-	/* set up cr3 */	
-	leaq	init_level4_pgt(%rip),%rax
-	subq	$__START_KERNEL_map,%rax
-	movq	%rax,%cr3
-
+ENTRY(restore_image)
+	/* switch to temporary page tables */
+	movq	$__PAGE_OFFSET, %rdx
+	movq	temp_level4_pgt(%rip), %rax
+	subq	%rdx, %rax
+	movq	%rax, %cr3
+	/* Flush TLB */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
 	andq	$~(1<<7), %rdx	# PGE
@@ -69,6 +70,10 @@ loop:
 	movq	pbe_next(%rdx), %rdx
 	jmp	loop
 done:
+	/* go back to the original page tables */
+	leaq	init_level4_pgt(%rip), %rax
+	subq	$__START_KERNEL_map, %rax
+	movq	%rax, %cr3
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2373cb8b8625..703acde2a1a5 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -959,9 +959,6 @@ static __init int unsynchronized_tsc(void)
  	   are handled in the OEM check above. */
  	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
  		return 0;
- 	/* All in a single socket - should be synchronized */
- 	if (cpus_weight(cpu_core_map[0]) == num_online_cpus())
- 		return 0;
 #endif
  	/* Assume multi socket systems are not synchronized */
  	return num_online_cpus() > 1;
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 68ec03070e5a..fd99ddd009bc 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -178,10 +178,6 @@ EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
 
 EXPORT_SYMBOL(empty_zero_page);
 
-#ifdef CONFIG_HAVE_DEC_LOCK
-EXPORT_SYMBOL(_atomic_dec_and_lock);
-#endif
-
 EXPORT_SYMBOL(die_chain);
 EXPORT_SYMBOL(register_die_notifier);
 
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index 6b26a1c1e9ff..bba5db6cebd6 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -10,5 +10,3 @@ lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o
-
-lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
deleted file mode 100644
index ab43394dc775..000000000000
--- a/arch/x86_64/lib/dec_and_lock.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * x86 version of "atomic_dec_and_lock()" using
- * the atomic "cmpxchg" instruction.
- *
- * (For CPU's lacking cmpxchg, we use the slow
- * generic version, and this one never even gets
- * compiled).
- */
-
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
-{
-	int counter;
-	int newcount;
-
-repeat:
-	counter = atomic_read(atomic);
-	newcount = counter-1;
-
-	if (!newcount)
-		goto slow_path;
-
-	asm volatile("lock; cmpxchgl %1,%2"
-		:"=a" (newcount)
-		:"r" (newcount), "m" (atomic->counter), "0" (counter));
-
-	/* If the above failed, "eax" will have changed */
-	if (newcount != counter)
-		goto repeat;
-	return 0;
-
-slow_path:
-	spin_lock(lock);
-	if (atomic_dec_and_test(atomic))
-		return 1;
-	spin_unlock(lock);
-	return 0;
-}
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 80a49d9bd8a7..214803821001 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -167,18 +167,16 @@ void __init numa_init_array(void)
 	   mapping. To avoid this fill in the mapping for all possible
 	   CPUs, as the number of CPUs is not known yet. 
 	   We round robin the existing nodes. */
-	rr = 0;
+	rr = first_node(node_online_map);
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
+		cpu_to_node[i] = rr;
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
-		cpu_to_node[i] = rr;
-		rr++; 
 	}
 
-	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
 }
 
 #ifdef CONFIG_NUMA_EMU
@@ -266,9 +264,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 
 __cpuinit void numa_add_cpu(int cpu)
 {
-	/* BP is initialized elsewhere */
-	if (cpu) 
-		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 } 
 
 unsigned long __init numa_free_all_bootmem(void) 
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 94862e1ec032..b90e8fe9eeb0 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -220,8 +220,6 @@ void global_flush_tlb(void)
 	down_read(&init_mm.mmap_sem);
 	df = xchg(&df_list, NULL);
 	up_read(&init_mm.mmap_sem);
-	if (!df)
-		return;
 	flush_map((df && !df->next) ? df->address : 0);
 	for (; df; df = next_df) { 
 		next_df = df->next;
author	Tony Luck	2005-10-20 19:41:44 +0200
committer	Tony Luck	2005-10-20 19:41:44 +0200
commit	9cec58dc138d6fcad9f447a19c8ff69f6540e667 (patch)
tree	4fe1cca94fdba8b705c87615bee06d3346f687ce /arch/x86_64
parent	[PATCH] Removed remaining PCI specific references from swiotlb.c (diff)
parent	[PATCH] Fix handling spurious page fault for hugetlb region (diff)
download	kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.tar.gz kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.tar.xz kernel-qcow2-linux-9cec58dc138d6fcad9f447a19c8ff69f6540e667.zip