From 9674f35b1ec17577163897f052f405c1e9e5893d Mon Sep 17 00:00:00 2001
From: Cliff Wickman
Date: Fri, 3 Apr 2009 08:34:05 -0500
Subject: x86: UV BAU and nodes with no memory

This patch fixes BAU initialization for systems containing
nodes with no memory and for systems with non-consecutive
node numbers.

Fixes and clarifies situations where pnode should be used instead
of node id.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LpjX3-00007N-12@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 108 ++++++++++++++++++++++++++---------------------
 1 file changed, 61 insertions(+), 47 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 79c073247284..b833bc634d17 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -31,6 +31,34 @@ static unsigned long		uv_mmask __read_mostly;
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
 
+/*
+ * Determine the first node on a blade.
+ */
+static int __init blade_to_first_node(int blade)
+{
+	int node, b;
+
+	for_each_online_node(node) {
+		b = uv_node_to_blade_id(node);
+		if (blade == b)
+			return node;
+	}
+	BUG();
+}
+
+/*
+ * Determine the apicid of the first cpu on a blade.
+ */
+static int __init blade_to_first_apicid(int blade)
+{
+	int cpu;
+
+	for_each_present_cpu(cpu)
+		if (blade == uv_cpu_to_blade_id(cpu))
+			return per_cpu(x86_cpu_to_apicid, cpu);
+	return -1;
+}
+
 /*
  * Free a software acknowledge hardware resource by clearing its Pending
  * bit. This will return a reply to the sender.
@@ -67,7 +95,7 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
 	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
 	cpu = uv_blade_processor_id();
 	msg->number_of_cpus =
-	    uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
+		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
 	this_cpu_mask = 1UL << cpu;
 	if (msp->seen_by.bits & this_cpu_mask)
 		return;
@@ -215,14 +243,14 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
  * Returns @flush_mask if some remote flushing remains to be done. The
  * mask will have some bits still set.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
+const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 					     struct bau_desc *bau_desc,
 					     struct cpumask *flush_mask)
 {
 	int completion_status = 0;
 	int right_shift;
 	int tries = 0;
-	int blade;
+	int pnode;
 	int bit;
 	unsigned long mmr_offset;
 	unsigned long index;
@@ -265,8 +293,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		blade = uv_cpu_to_blade_id(bit);
-		if (blade == this_blade)
+		pnode = uv_cpu_to_pnode(bit);
+		if (pnode == this_pnode)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
@@ -308,16 +336,16 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
 	int i;
 	int bit;
-	int blade;
+	int pnode;
 	int uv_cpu;
-	int this_blade;
+	int this_pnode;
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
-	this_blade = uv_numa_blade_id();
+	this_pnode = uv_hub_info->pnode;
 	bau_desc = __get_cpu_var(bau_control).descriptor_base;
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
 
@@ -325,13 +353,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	i = 0;
 	for_each_cpu(bit, flush_mask) {
-		blade = uv_cpu_to_blade_id(bit);
-		BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
-		if (blade == this_blade) {
+		pnode = uv_cpu_to_pnode(bit);
+		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
+		if (pnode == this_pnode) {
 			locals++;
 			continue;
 		}
-		bau_node_set(blade, &bau_desc->distribution);
+		bau_node_set(pnode, &bau_desc->distribution);
 		i++;
 	}
 	if (i == 0) {
@@ -349,7 +377,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
+	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
 }
 
 /*
@@ -481,8 +509,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 			   stat->requestee, stat->onetlb, stat->alltlb,
 			   stat->s_retry, stat->d_retry, stat->ptc_i);
 		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
-			   uv_read_global_mmr64(uv_blade_to_pnode
-					(uv_cpu_to_blade_id(cpu)),
+			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
 			   stat->sflush, stat->dflush,
 			   stat->retriesok, stat->nomsg,
@@ -616,16 +643,18 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
  * finish the initialization of the per-blade control structures
  */
 static void __init
-uv_table_bases_finish(int blade, int node, int cur_cpu,
+uv_table_bases_finish(int blade,
 		      struct bau_control *bau_tablesp,
 		      struct bau_desc *adp)
 {
 	struct bau_control *bcp;
-	int i;
+	int cpu;
 
-	for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) {
-		bcp = (struct bau_control *)&per_cpu(bau_control, i);
+	for_each_present_cpu(cpu) {
+		if (blade != uv_cpu_to_blade_id(cpu))
+			continue;
 
+		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
 		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
 		bcp->va_queue_first	= bau_tablesp->va_queue_first;
 		bcp->va_queue_last	= bau_tablesp->va_queue_last;
@@ -648,8 +677,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	struct bau_desc *adp;
 	struct bau_desc *ad2;
 
-	adp = (struct bau_desc *)
-	    kmalloc_node(16384, GFP_KERNEL, node);
+	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
 	pa = __pa((unsigned long)adp);
@@ -666,8 +694,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
-		ad2->header.base_dest_nodeid =
-		    uv_blade_to_pnode(uv_cpu_to_blade_id(0));
+		ad2->header.base_dest_nodeid = uv_cpu_to_pnode(0);
 		ad2->header.command = UV_NET_ENDPOINT_INTD;
 		ad2->header.int_both = 1;
 		/*
@@ -714,8 +741,9 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 /*
  * Initialization of each UV blade's structures
  */
-static int __init uv_init_blade(int blade, int node, int cur_cpu)
+static int __init uv_init_blade(int blade)
 {
+	int node;
 	int pnode;
 	unsigned long pa;
 	unsigned long apicid;
@@ -723,16 +751,17 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
 	struct bau_payload_queue_entry *pqp;
 	struct bau_control *bau_tablesp;
 
+	node = blade_to_first_node(blade);
 	bau_tablesp = uv_table_bases_init(blade, node);
 	pnode = uv_blade_to_pnode(blade);
 	adp = uv_activation_descriptor_init(node, pnode);
 	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp);
+	uv_table_bases_finish(blade, bau_tablesp, adp);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+	apicid = blade_to_first_apicid(blade);
 	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
 	if ((pa & 0xff) != UV_BAU_MESSAGE) {
 		uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
@@ -747,9 +776,7 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu)
 static int __init uv_bau_init(void)
 {
 	int blade;
-	int node;
 	int nblades;
-	int last_blade;
 	int cur_cpu;
 
 	if (!is_uv_system())
@@ -758,29 +785,16 @@ static int __init uv_bau_init(void)
 	uv_bau_retry_limit = 1;
 	uv_nshift = uv_hub_info->n_val;
 	uv_mmask = (1UL << uv_hub_info->n_val) - 1;
-	nblades = 0;
-	last_blade = -1;
-	cur_cpu = 0;
-	for_each_online_node(node) {
-		blade = uv_node_to_blade_id(node);
-		if (blade == last_blade)
-			continue;
-		last_blade = blade;
-		nblades++;
-	}
+	nblades = uv_num_possible_blades();
+
 	uv_bau_table_bases = (struct bau_control **)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	BUG_ON(!uv_bau_table_bases);
 
-	last_blade = -1;
-	for_each_online_node(node) {
-		blade = uv_node_to_blade_id(node);
-		if (blade == last_blade)
-			continue;
-		last_blade = blade;
-		uv_init_blade(blade, node, cur_cpu);
-		cur_cpu += uv_blade_nr_possible_cpus(blade);
-	}
+	for (blade = 0; blade < nblades; blade++)
+		if (uv_blade_nr_possible_cpus(blade))
+			uv_init_blade(blade);
+
 	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
 	uv_enable_timeouts();
 
-- 
cgit v1.2.3-55-g7522


From c4c4688f72e638708e5f6b5c259699de82a36fec Mon Sep 17 00:00:00 2001
From: Cliff Wickman
Date: Fri, 3 Apr 2009 08:34:32 -0500
Subject: x86: UV BAU messaging timeouts

This patch replaces a 'nop' uv_enable_timeouts() in the
UV TLB shootdown code. (somehow, long ago that function got
eviscerated)

If any cpu in the destination node does not get interrupted by the
message and post completion in a reasonable time the hardware
should respond to the sender with an error.  This function
enables such timeouts.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LpjXU-00007e-Qh@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_mmrs.h |  5 ++++
 arch/x86/kernel/tlb_uv.c          | 56 +++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index db68ac8a5ac2..2cae46c7c8a2 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -17,6 +17,11 @@
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
 #define UVH_BAU_DATA_CONFIG_32 0x0438
 
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index b833bc634d17..fced96e94e22 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -445,24 +445,58 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+/*
+ * uv_enable_timeouts
+ *
+ * Each target blade (i.e. blades that have cpu's) needs to have
+ * shootdown message timeouts enabled.  The timeout does not cause
+ * an interrupt, but causes an error message to be returned to
+ * the sender.
+ */
 static void uv_enable_timeouts(void)
 {
-	int i;
 	int blade;
-	int last_blade;
+	int nblades;
 	int pnode;
-	int cur_cpu = 0;
-	unsigned long apicid;
+	unsigned long mmr_image;
+
+	nblades = uv_num_possible_blades();
 
-	last_blade = -1;
-	for_each_online_node(i) {
-		blade = uv_node_to_blade_id(i);
-		if (blade == last_blade)
+	for (blade = 0; blade < nblades; blade++) {
+		if (!uv_blade_nr_possible_cpus(blade))
 			continue;
-		last_blade = blade;
-		apicid = per_cpu(x86_cpu_to_apicid, cur_cpu);
+
 		pnode = uv_blade_to_pnode(blade);
-		cur_cpu += uv_blade_nr_possible_cpus(i);
+		mmr_image =
+		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+		/*
+		 * Set the timeout period and then lock it in, in three
+		 * steps; captures and locks in the period.
+		 *
+		 * To program the period, the SOFT_ACK_MODE must be off.
+		 */
+		mmr_image &= ~((unsigned long)1 <<
+			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		/*
+		 * Set the 4-bit period.
+		 */
+		mmr_image &= ~((unsigned long)0xf <<
+			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
+			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+		/*
+		 * Subsequent reversals of the timebase bit (3) cause an
+		 * immediate timeout of one or all INTD resources as
+		 * indicated in bits 2:0 (7 causes all of them to timeout).
+		 */
+		mmr_image |= ((unsigned long)1 <<
+			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		uv_write_global_mmr64
+		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From 1a544e659cbfce178395e9a090a47d1907d0cfa8 Mon Sep 17 00:00:00 2001
From: Russ Anderson
Date: Mon, 30 Mar 2009 17:52:40 -0500
Subject: x86, UV: system table in bios accessed after unmap

Use the copy of UV system table in kernel memory, not the one in
bios after unmapping.

Signed-off-by: Russ Anderson <rja@sgi.com>
LKML-Reference: <20090330225240.GA22776@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/bios_uv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f63882728d91..63a88e1f987d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -182,7 +182,8 @@ void uv_bios_init(void)
 	memcpy(&uv_systab, tab, sizeof(struct uv_systab));
 	iounmap(tab);
 
-	printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+	printk(KERN_INFO "EFI UV System Table Revision %d\n",
+					uv_systab.revision);
 }
 #else	/* !CONFIG_EFI */
 
-- 
cgit v1.2.3-55-g7522


From 6a891a24e4d0056c365a90ff2d71c38fd366b0d0 Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Mon, 30 Mar 2009 09:01:11 -0500
Subject: x86, UV: Fix for nodes with memory and no cpus

Fix initialization of UV blade information for systems that have
nodes with memory but no cpus.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090330140111.GA18461@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..de1a50af807b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -549,7 +549,8 @@ void __init uv_system_init(void)
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
 	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
 	int max_pnode = 0;
-	unsigned long mmr_base, present;
+	unsigned long mmr_base, present, paddr;
+	unsigned short pnode_mask;
 
 	map_low_mmrs();
 
@@ -592,6 +593,7 @@ void __init uv_system_init(void)
 		}
 	}
 
+	pnode_mask = (1 << n_val) - 1;
 	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
 	gnode_upper = (((unsigned long)node_id.s.node_id) &
 		       ~((1 << n_val) - 1)) << m_val;
@@ -615,7 +617,7 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
 		uv_cpu_hub_info(cpu)->pnode = pnode;
-		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
@@ -631,6 +633,16 @@ void __init uv_system_init(void)
 			lcpu, blade);
 	}
 
+	/* Add blade/pnode info for nodes without cpus */
+	for_each_online_node(nid) {
+		if (uv_node_to_blade[nid] >= 0)
+			continue;
+		paddr = node_start_pfn(nid) << PAGE_SHIFT;
+		pnode = (paddr >> m_val) & pnode_mask;
+		blade = boot_pnode_to_blade(pnode);
+		uv_node_to_blade[nid] = blade;
+	}
+
 	map_gru_high(max_pnode);
 	map_mmr_high(max_pnode);
 	map_config_high(max_pnode);
-- 
cgit v1.2.3-55-g7522


From d22616942804798105e61428afa41a9132421bb9 Mon Sep 17 00:00:00 2001
From: Huang Weiyi
Date: Mon, 6 Apr 2009 17:16:46 +0800
Subject: ACPI: cpufreq: remove dupilcated #include

Remove dupilicated #include in arch/x86/kernel/cpu/cpufreq/longhaul.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/longhaul.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 0bd48e65a0ca..ce2ed3e4aad9 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -33,7 +33,6 @@
 #include <linux/timex.h>
 #include <linux/io.h>
 #include <linux/acpi.h>
-#include <linux/kernel.h>
 
 #include <asm/msr.h>
 #include <acpi/processor.h>
-- 
cgit v1.2.3-55-g7522


From e4f6937222dbb61b8b8e62caca3d32e648b3b14b Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Mon, 6 Apr 2009 11:26:07 -0700
Subject: ACPI x86: Cleanup acpi_cpufreq structures related to aperf/mperf

Change structure name to make the code cleaner and simpler. No
functionality change in this patch.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 42 ++++++++++++++----------------
 1 file changed, 20 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 19f6b9d27e83..340bdbebba07 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -241,23 +241,23 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-struct perf_cur {
+struct perf_pair {
 	union {
 		struct {
 			u32 lo;
 			u32 hi;
 		} split;
 		u64 whole;
-	} aperf_cur, mperf_cur;
+	} aperf, mperf;
 };
 
 
 static long read_measured_perf_ctrs(void *_cur)
 {
-	struct perf_cur *cur = _cur;
+	struct perf_pair *cur = _cur;
 
-	rdmsr(MSR_IA32_APERF, cur->aperf_cur.split.lo, cur->aperf_cur.split.hi);
-	rdmsr(MSR_IA32_MPERF, cur->mperf_cur.split.lo, cur->mperf_cur.split.hi);
+	rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
+	rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
 
 	wrmsr(MSR_IA32_APERF, 0, 0);
 	wrmsr(MSR_IA32_MPERF, 0, 0);
@@ -281,7 +281,7 @@ static long read_measured_perf_ctrs(void *_cur)
 static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 				      unsigned int cpu)
 {
-	struct perf_cur cur;
+	struct perf_pair cur;
 	unsigned int perf_percent;
 	unsigned int retval;
 
@@ -294,39 +294,37 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 	 * Get an approximate value. Return failure in case we cannot get
 	 * an approximate value.
 	 */
-	if (unlikely(cur.aperf_cur.split.hi || cur.mperf_cur.split.hi)) {
+	if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
 		int shift_count;
 		u32 h;
 
-		h = max_t(u32, cur.aperf_cur.split.hi, cur.mperf_cur.split.hi);
+		h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
 		shift_count = fls(h);
 
-		cur.aperf_cur.whole >>= shift_count;
-		cur.mperf_cur.whole >>= shift_count;
+		cur.aperf.whole >>= shift_count;
+		cur.mperf.whole >>= shift_count;
 	}
 
-	if (((unsigned long)(-1) / 100) < cur.aperf_cur.split.lo) {
+	if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
 		int shift_count = 7;
-		cur.aperf_cur.split.lo >>= shift_count;
-		cur.mperf_cur.split.lo >>= shift_count;
+		cur.aperf.split.lo >>= shift_count;
+		cur.mperf.split.lo >>= shift_count;
 	}
 
-	if (cur.aperf_cur.split.lo && cur.mperf_cur.split.lo)
-		perf_percent = (cur.aperf_cur.split.lo * 100) /
-				cur.mperf_cur.split.lo;
+	if (cur.aperf.split.lo && cur.mperf.split.lo)
+		perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
 	else
 		perf_percent = 0;
 
 #else
-	if (unlikely(((unsigned long)(-1) / 100) < cur.aperf_cur.whole)) {
+	if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
 		int shift_count = 7;
-		cur.aperf_cur.whole >>= shift_count;
-		cur.mperf_cur.whole >>= shift_count;
+		cur.aperf.whole >>= shift_count;
+		cur.mperf.whole >>= shift_count;
 	}
 
-	if (cur.aperf_cur.whole && cur.mperf_cur.whole)
-		perf_percent = (cur.aperf_cur.whole * 100) /
-				cur.mperf_cur.whole;
+	if (cur.aperf.whole && cur.mperf.whole)
+		perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
 	else
 		perf_percent = 0;
 
-- 
cgit v1.2.3-55-g7522


From 18b2646fe3babeb40b34a0c1751e0bf5adfdc64c Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Mon, 6 Apr 2009 11:26:08 -0700
Subject: ACPI x86: Make aperf/mperf MSR access in acpi_cpufreq read_only

Do not write zeroes to APERF and MPERF by ondemand governor. With this
change, other users can share these MSRs for reads.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 340bdbebba07..9d3af380c6bd 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,6 +68,7 @@ struct acpi_cpufreq_data {
 	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
+	u64 saved_aperf, saved_mperf;
 };
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
@@ -259,9 +260,6 @@ static long read_measured_perf_ctrs(void *_cur)
 	rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
 	rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
 
-	wrmsr(MSR_IA32_APERF, 0, 0);
-	wrmsr(MSR_IA32_MPERF, 0, 0);
-
 	return 0;
 }
 
@@ -281,13 +279,20 @@ static long read_measured_perf_ctrs(void *_cur)
 static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 				      unsigned int cpu)
 {
-	struct perf_pair cur;
+	struct perf_pair readin, cur;
 	unsigned int perf_percent;
 	unsigned int retval;
 
-	if (!work_on_cpu(cpu, read_measured_perf_ctrs, &cur))
+	if (!work_on_cpu(cpu, read_measured_perf_ctrs, &readin))
 		return 0;
 
+	cur.aperf.whole = readin.aperf.whole -
+				per_cpu(drv_data, cpu)->saved_aperf;
+	cur.mperf.whole = readin.mperf.whole -
+				per_cpu(drv_data, cpu)->saved_mperf;
+	per_cpu(drv_data, cpu)->saved_aperf = readin.aperf.whole;
+	per_cpu(drv_data, cpu)->saved_mperf = readin.mperf.whole;
+
 #ifdef __i386__
 	/*
 	 * We dont want to do 64 bit divide with 32 bit kernel
-- 
cgit v1.2.3-55-g7522


From db954b5898dd3ef3ef93f4144158ea8f97deb058 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Mon, 6 Apr 2009 18:51:29 -0700
Subject: x86 ACPI: Add support for Always Running APIC timer

Add support for Always Running APIC timer, CPUID_0x6_EAX_Bit2.
This bit means the APIC timer continues to run even when CPU is
in deep C-states.

The advantage is that we can use LAPIC timer on these CPUs
always, and there is no need for "slow to read and program"
external timers (HPET/PIT) and the timer broadcast logic
and related code in C-state entry and exit.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/cpufeature.h          | 1 +
 arch/x86/kernel/apic/apic.c                | 6 ++++++
 arch/x86/kernel/cpu/addon_cpuid_features.c | 1 +
 drivers/acpi/processor_idle.c              | 3 +++
 4 files changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0beba0d1468d..bb83b1c397aa 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -154,6 +154,7 @@
  * CPUID levels like 0x6, 0xA etc
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
 
 /* Virtualization flags: Linux defined */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 098ec84b8c00..f2870920f246 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -431,6 +431,12 @@ static void __cpuinit setup_APIC_timer(void)
 {
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
+	if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) {
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
+		/* Make LAPIC timer preferrable over percpu HPET */
+		lapic_clockevent.rating = 150;
+	}
+
 	memcpy(levt, &lapic_clockevent, sizeof(*levt));
 	levt->cpumask = cpumask_of(smp_processor_id());
 
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 8220ae69849d..c965e5212714 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -31,6 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
 		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
 		{ 0, 0, 0, 0 }
 	};
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 4e6e758bd397..6fe121434ffb 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -145,6 +145,9 @@ static void acpi_timer_check_state(int state, struct acpi_processor *pr,
 	struct acpi_processor_power *pwr = &pr->power;
 	u8 type = local_apic_timer_c2_ok ? ACPI_STATE_C3 : ACPI_STATE_C2;
 
+	if (cpu_has(&cpu_data(pr->id), X86_FEATURE_ARAT))
+		return;
+
 	/*
 	 * Check, if one of the previous states already marked the lapic
 	 * unstable
-- 
cgit v1.2.3-55-g7522


From 06aa05b307e8efbc278f201198e7cdf3877bc5c2 Mon Sep 17 00:00:00 2001
From: Russ Anderson
Date: Fri, 3 Apr 2009 17:24:23 -0500
Subject: x86: prevent /sys/firmware/sgi_uv from being created on non-uv
 systems

/sys/firmware/sgi_uv should only be created on uv systems.

Signed-off-by: Russ Anderson <rja@sgi.com>
LKML-Reference: <20090403222423.GA28546@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_sysfs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 67f9b9dbf800..36afb98675a4 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -21,6 +21,7 @@
 
 #include <linux/sysdev.h>
 #include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
 
 struct kobject *sgi_uv_kobj;
 
@@ -47,6 +48,9 @@ static int __init sgi_uv_sysfs_init(void)
 {
 	unsigned long ret;
 
+	if (!is_uv_system())
+		return -ENODEV;
+
 	if (!sgi_uv_kobj)
 		sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
 	if (!sgi_uv_kobj) {
-- 
cgit v1.2.3-55-g7522


From 47788c58e66c050982241d9a05eb690daceb05a9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 8 Apr 2009 20:40:59 +0200
Subject: tracing/syscalls: use a dedicated file header

Impact: fix build warnings and possibe compat misbehavior on IA64

Building a kernel on ia64 might trigger these ugly build warnings:

CC      arch/ia64/ia32/sys_ia32.o
In file included from arch/ia64/ia32/sys_ia32.c:55:
arch/ia64/ia32/ia32priv.h:290:1: warning: "elf_check_arch" redefined
In file included from include/linux/elf.h:7,
                 from include/linux/module.h:14,
                 from include/linux/ftrace.h:8,
                 from include/linux/syscalls.h:68,
                 from arch/ia64/ia32/sys_ia32.c:18:
arch/ia64/include/asm/elf.h:19:1: warning: this is the location of the previous definition
[...]

sys_ia32.c includes linux/syscalls.h which in turn includes linux/ftrace.h
to import the syscalls tracing prototypes.

But including ftrace.h can pull too much things for a low level file,
especially on ia64 where the ia32 private headers conflict with higher
level headers.

Now we isolate the syscall tracing headers in their own lightweight file.

Reported-by: Tony Luck <tony.luck@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jason Baron <jbaron@redhat.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Jiaying Zhang <jiayingz@google.com>
Cc: Michael Rubin <mrubin@google.com>
Cc: Martin Bligh <mbligh@google.com>
Cc: Michael Davidson <md@google.com>
LKML-Reference: <20090408184058.GB6017@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c      |  2 ++
 arch/x86/kernel/ptrace.c      |  3 ++-
 include/linux/ftrace.h        | 29 -----------------------------
 include/linux/syscalls.h      |  2 +-
 include/trace/syscall.h       | 35 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_syscalls.c |  2 +-
 6 files changed, 41 insertions(+), 32 deletions(-)
 create mode 100644 include/trace/syscall.h

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 70a10ca100f6..18dfa30795c9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,6 +18,8 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <trace/syscall.h>
+
 #include <asm/cacheflush.h>
 #include <asm/ftrace.h>
 #include <asm/nops.h>
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fe9345c967de..23b7c8f017e2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,7 +21,6 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
-#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -35,6 +34,8 @@
 #include <asm/proto.h>
 #include <asm/ds.h>
 
+#include <trace/syscall.h>
+
 #include "tls.h"
 
 enum x86_regset {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ff112a872d75..8a0c2f221e6b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -511,33 +511,4 @@ static inline void trace_hw_branch_oops(void) {}
 
 #endif /* CONFIG_HW_BRANCH_TRACER */
 
-/*
- * A syscall entry in the ftrace syscalls array.
- *
- * @name: name of the syscall
- * @nb_args: number of parameters it takes
- * @types: list of types as strings
- * @args: list of args as strings (args[i] matches types[i])
- */
-struct syscall_metadata {
-	const char	*name;
-	int		nb_args;
-	const char	**types;
-	const char	**args;
-};
-
-#ifdef CONFIG_FTRACE_SYSCALLS
-extern void arch_init_ftrace_syscalls(void);
-extern struct syscall_metadata *syscall_nr_to_meta(int nr);
-extern void start_ftrace_syscalls(void);
-extern void stop_ftrace_syscalls(void);
-extern void ftrace_syscall_enter(struct pt_regs *regs);
-extern void ftrace_syscall_exit(struct pt_regs *regs);
-#else
-static inline void start_ftrace_syscalls(void) { }
-static inline void stop_ftrace_syscalls(void) { }
-static inline void ftrace_syscall_enter(struct pt_regs *regs) { }
-static inline void ftrace_syscall_exit(struct pt_regs *regs) { }
-#endif
-
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 6470f74074af..dabe4ad89141 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -65,7 +65,7 @@ struct old_linux_dirent;
 #include <asm/signal.h>
 #include <linux/quota.h>
 #include <linux/key.h>
-#include <linux/ftrace.h>
+#include <trace/syscall.h>
 
 #define __SC_DECL1(t1, a1)	t1 a1
 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
new file mode 100644
index 000000000000..8cfe515cbc47
--- /dev/null
+++ b/include/trace/syscall.h
@@ -0,0 +1,35 @@
+#ifndef _TRACE_SYSCALL_H
+#define _TRACE_SYSCALL_H
+
+#include <asm/ptrace.h>
+
+/*
+ * A syscall entry in the ftrace syscalls array.
+ *
+ * @name: name of the syscall
+ * @nb_args: number of parameters it takes
+ * @types: list of types as strings
+ * @args: list of args as strings (args[i] matches types[i])
+ */
+struct syscall_metadata {
+	const char	*name;
+	int		nb_args;
+	const char	**types;
+	const char	**args;
+};
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+extern void arch_init_ftrace_syscalls(void);
+extern struct syscall_metadata *syscall_nr_to_meta(int nr);
+extern void start_ftrace_syscalls(void);
+extern void stop_ftrace_syscalls(void);
+extern void ftrace_syscall_enter(struct pt_regs *regs);
+extern void ftrace_syscall_exit(struct pt_regs *regs);
+#else
+static inline void start_ftrace_syscalls(void)			{ }
+static inline void stop_ftrace_syscalls(void)			{ }
+static inline void ftrace_syscall_enter(struct pt_regs *regs)	{ }
+static inline void ftrace_syscall_exit(struct pt_regs *regs)	{ }
+#endif
+
+#endif /* _TRACE_SYSCALL_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af29c943..5e579645ac86 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
+#include <trace/syscall.h>
 #include <linux/kernel.h>
-#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
-- 
cgit v1.2.3-55-g7522


From 575922248c0df490843ddfbcf3bc65b54c4adb08 Mon Sep 17 00:00:00 2001
From: Rakib Mullick
Date: Sat, 11 Apr 2009 09:04:59 +0600
Subject: x86: Fix section mismatches in mpparse

Impact: fix section mismatch

In arch/x86/kernel/mpparse.c, smp_reserve_bootmem() has been called
and also refers to a function which is in .init section. Thus causes
the first warning. And check_irq_src() also requires an __init,
because it refers to an .init section.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <b9df5fa10904102004g51265d9axc8d07278bfdb6ba0@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index dce99dca6cf8..70fd7e414c15 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -679,7 +679,7 @@ void __init get_smp_config(void)
 	__get_smp_config(0);
 }
 
-static void smp_reserve_bootmem(struct mpf_intel *mpf)
+static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
 {
 	unsigned long size = get_mpc_size(mpf->physptr);
 #ifdef CONFIG_X86_32
@@ -838,7 +838,7 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
 
 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
 
-static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
 {
 	int i;
 
@@ -866,7 +866,8 @@ static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
 	}
 }
 #else /* CONFIG_X86_IO_APIC */
-static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
 #endif /* CONFIG_X86_IO_APIC */
 
 static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
-- 
cgit v1.2.3-55-g7522


From a30469e7921a6dd2067e9e836d7787cfa0105627 Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Fri, 10 Apr 2009 15:21:24 -0700
Subject: x86: add linux kernel support for YMM state

Impact: save/restore Intel-AVX state properly between tasks

Intel Advanced Vector Extensions (AVX) introduce 256-bit vector processing
capability. More about AVX at http://software.intel.com/sites/avx

Add OS support for YMM state management using xsave/xrstor infrastructure
to support AVX.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1239402084.27006.8057.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h  | 6 ++++++
 arch/x86/include/asm/sigcontext.h | 6 ++++++
 arch/x86/include/asm/xsave.h      | 3 ++-
 arch/x86/kernel/xsave.c           | 2 +-
 4 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 34c52370f2fe..fcf4d92e7e04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -352,6 +352,11 @@ struct i387_soft_struct {
 	u32			entry_eip;
 };
 
+struct ymmh_struct {
+	/* 16 * 16 bytes for each YMMH-reg = 256 bytes */
+	u32 ymmh_space[64];
+};
+
 struct xsave_hdr_struct {
 	u64 xstate_bv;
 	u64 reserved1[2];
@@ -361,6 +366,7 @@ struct xsave_hdr_struct {
 struct xsave_struct {
 	struct i387_fxsave_struct i387;
 	struct xsave_hdr_struct xsave_hdr;
+	struct ymmh_struct ymmh;
 	/* new processor state extensions will go here */
 } __attribute__ ((packed, aligned (64)));
 
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index ec666491aaa4..72e5a4491661 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -269,6 +269,11 @@ struct _xsave_hdr {
 	__u64 reserved2[5];
 };
 
+struct _ymmh_state {
+	/* 16 * 16 bytes for each YMMH-reg */
+	__u32 ymmh_space[64];
+};
+
 /*
  * Extended state pointed by the fpstate pointer in the sigcontext.
  * In addition to the fpstate, information encoded in the xstate_hdr
@@ -278,6 +283,7 @@ struct _xsave_hdr {
 struct _xstate {
 	struct _fpstate fpstate;
 	struct _xsave_hdr xstate_hdr;
+	struct _ymmh_state ymmh;
 	/* new processor state extensions go here */
 };
 
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 08e9a1ac07a9..727acc152344 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -7,6 +7,7 @@
 
 #define XSTATE_FP	0x1
 #define XSTATE_SSE	0x2
+#define XSTATE_YMM	0x4
 
 #define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
 
@@ -15,7 +16,7 @@
 /*
  * These are the features that the OS can handle currently.
  */
-#define XCNTXT_MASK	(XSTATE_FP | XSTATE_SSE)
+#define XCNTXT_MASK	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
 
 #ifdef CONFIG_X86_64
 #define REX_PREFIX	"0x48, "
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 2b54fe002e94..0a5b04aa98f1 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -324,7 +324,7 @@ void __ref xsave_cntxt_init(void)
 	}
 
 	/*
-	 * for now OS knows only about FP/SSE
+	 * Support only the state known to OS.
 	 */
 	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
 	xsave_init();
-- 
cgit v1.2.3-55-g7522


From 01599fca6758d2cd133e78f87426fc851c9ea725 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 13 Apr 2009 10:27:49 -0700
Subject: cpufreq: use smp_call_function_[single|many]() in acpi-cpufreq.c

Atttempting to rid us of the problematic work_on_cpu().  Just use
smp_call_fuction_single() here.

This repairs a 10% sysbench(oltp)+mysql regression which Mike reported,
due to

  commit 6b44003e5ca66a3fffeb5bc90f40ada2c4340896
  Author: Andrew Morton <akpm@linux-foundation.org>
  Date:   Thu Apr 9 09:50:37 2009 -0600

      work_on_cpu(): rewrite it to create a kernel thread on demand

It seems that the kernel calls these acpi-cpufreq functions at a quite
high frequency.

Valdis Kletnieks also reports that this causes 70-90 forks per second on
his hardware.

Cc: Valdis.Kletnieks@vt.edu
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Zhao Yakui <yakui.zhao@intel.com>
Acked-by: Dave Jones <davej@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Mike Galbraith <efault@gmx.de>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
[ Made it use smp_call_function_many() instead of looping over cpu's
  with smp_call_function_single()    - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 9d3af380c6bd..3e3cd3db7a0c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -153,7 +153,8 @@ struct drv_cmd {
 	u32 val;
 };
 
-static long do_drv_read(void *_cmd)
+/* Called via smp_call_function_single(), on the target CPU */
+static void do_drv_read(void *_cmd)
 {
 	struct drv_cmd *cmd = _cmd;
 	u32 h;
@@ -170,10 +171,10 @@ static long do_drv_read(void *_cmd)
 	default:
 		break;
 	}
-	return 0;
 }
 
-static long do_drv_write(void *_cmd)
+/* Called via smp_call_function_many(), on the target CPUs */
+static void do_drv_write(void *_cmd)
 {
 	struct drv_cmd *cmd = _cmd;
 	u32 lo, hi;
@@ -192,23 +193,18 @@ static long do_drv_write(void *_cmd)
 	default:
 		break;
 	}
-	return 0;
 }
 
 static void drv_read(struct drv_cmd *cmd)
 {
 	cmd->val = 0;
 
-	work_on_cpu(cpumask_any(cmd->mask), do_drv_read, cmd);
+	smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1);
 }
 
 static void drv_write(struct drv_cmd *cmd)
 {
-	unsigned int i;
-
-	for_each_cpu(i, cmd->mask) {
-		work_on_cpu(i, do_drv_write, cmd);
-	}
+	smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
 }
 
 static u32 get_cur_val(const struct cpumask *mask)
@@ -252,15 +248,13 @@ struct perf_pair {
 	} aperf, mperf;
 };
 
-
-static long read_measured_perf_ctrs(void *_cur)
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
 {
 	struct perf_pair *cur = _cur;
 
 	rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
 	rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
-
-	return 0;
 }
 
 /*
@@ -283,7 +277,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 	unsigned int perf_percent;
 	unsigned int retval;
 
-	if (!work_on_cpu(cpu, read_measured_perf_ctrs, &readin))
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &cur, 1))
 		return 0;
 
 	cur.aperf.whole = readin.aperf.whole -
-- 
cgit v1.2.3-55-g7522


From ff6c6fed3a8ab9b0a7b02574e095e905e89421d9 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput
Date: Sun, 12 Apr 2009 23:24:21 +0530
Subject: x86: pci-swiotlb.c swiotlb_dma_ops should be static

Impact: reduce kernel size a bit, address sparse warning

Addresses the problem pointed out by this sparse warning:

  arch/x86/kernel/pci-swiotlb.c:53:20: warning: symbol 'swiotlb_dma_ops' was not declared. Should it be static?

For x86: swiotlb_dma_ops can be static, because it's not used outside
of pci-swiotlb.c

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
LKML-Reference: <1239558861.3938.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9996ed..221a3853e268 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -50,7 +50,7 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
 }
 
-struct dma_map_ops swiotlb_dma_ops = {
+static struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = x86_swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
-- 
cgit v1.2.3-55-g7522


From 1c98aa7424ff163637d8321674ec58dee28152d4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 13 Apr 2009 18:09:20 -0700
Subject: Fix quilt merge error in acpi-cpufreq.c

We ended up incorrectly using '&cur' instead of '&readin' in the
work_on_cpu() -> smp_call_function_single() transformation in commit
01599fca6758d2cd133e78f87426fc851c9ea725 ("cpufreq: use
smp_call_function_[single|many]() in acpi-cpufreq.c").

Andrew explains:
 "OK, the acpi tree went and had conflicting changes merged into it after
  I'd written the patch and it appears that I incorrectly reverted part
  of 18b2646fe3babeb40b34a0c1751e0bf5adfdc64c while fixing the resulting
  rejects.

  Switching it to `readin' looks correct."

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 3e3cd3db7a0c..837c2c4cc203 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -277,7 +277,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 	unsigned int perf_percent;
 	unsigned int retval;
 
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &cur, 1))
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
 		return 0;
 
 	cur.aperf.whole = readin.aperf.whole -
-- 
cgit v1.2.3-55-g7522


From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001
From: Pallipadi, Venkatesh
Date: Mon, 13 Apr 2009 15:20:58 -0700
Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move

As discussed in the thread here:

  http://marc.info/?l=linux-kernel&m=123964468521142&w=2

Eric W. Biederman observed:

> It looks like some additional bugs have slipped in since last I looked.
>
> set_irq_affinity does this:
> ifdef CONFIG_GENERIC_PENDING_IRQ
>        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
>                cpumask_copy(desc->affinity, cpumask);
>                desc->chip->set_affinity(irq, cpumask);
>        } else {
>                desc->status |= IRQ_MOVE_PENDING;
>                cpumask_copy(desc->pending_mask, cpumask);
>        }
> #else
>
> That IRQ_DISABLED case is a software state and as such it has nothing to
> do with how safe it is to move an irq in process context.

[...]

>
> The only reason we migrate MSIs in interrupt context today is that there
> wasn't infrastructure for support migration both in interrupt context
> and outside of it.

Yes. The idea here was to force the MSI migration to happen in process
context. One of the patches in the series did

        disable_irq(dev->irq);
        irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);

with the above patch adding irq/manage code check for interrupt disabled
and moving the interrupt in process context.

IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET
code and we ended up having this ugly hack. IRQ_MOVE_PCNTXT was there
when we eventually submitted the patch upstream. But, looks like I did a
blind rebasing instead of using IRQ_MOVE_PCNTXT in hpet MSI code.

Below patch fixes this. i.e., revert commit 932775a4ab622e3c99bd59f14cc
and add PCNTXT to HPET MSI setup. Also removes copying of desc->affinity
in generic code as set_affinity routines are doing it internally.

Reported-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Li Shaohua" <shaohua.li@intel.com>
Cc: Gary Hade <garyhade@us.ibm.com>
Cc: "lcm@us.ibm.com" <lcm@us.ibm.com>
Cc: suresh.b.siddha@intel.com
LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 2 ++
 kernel/irq/manage.c            | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a2789e42e162..30da617d18e4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3670,12 +3670,14 @@ int arch_setup_hpet_msi(unsigned int irq)
 {
 	int ret;
 	struct msi_msg msg;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	ret = msi_compose_msg(NULL, irq, &msg);
 	if (ret < 0)
 		return ret;
 
 	hpet_msi_write(irq, &msg);
+	desc->status |= IRQ_MOVE_PCNTXT;
 	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
 		"edge");
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e2e7dd4cd2f..2734eca59243 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -109,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(desc->affinity, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT)
 		desc->chip->set_affinity(irq, cpumask);
-	} else {
+	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
-- 
cgit v1.2.3-55-g7522


From 94ca8e4852807fc42d2f64fcaf248aafc4f2e6a7 Mon Sep 17 00:00:00 2001
From: Cliff Wickman
Date: Tue, 14 Apr 2009 10:56:48 -0500
Subject: x86: UV: BAU partition-relative distribution map

This patch enables each partition's BAU distribution bit map
to be partition-relative.

The distribution bitmap had been constructed assuming 0 as the base
node number.  That construct would not have allowed a total system of
greater than 256 nodes.
It also corrects an error that occurred when the first blade's nasid
was not zero.  That nasid was stored as the base node.
The base node number gets added by hardware to the node numbers implied
in the distribution bitmap, resulting in invalid target nasids.

Tested on the UV hardware simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1Ltl0C-0004Ob-37@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index fced96e94e22..98307f953492 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -25,6 +25,8 @@ static int			uv_bau_retry_limit __read_mostly;
 
 /* position of pnode (which is nasid>>1): */
 static int			uv_nshift __read_mostly;
+/* base pnode in this partition */
+static int			uv_partition_base_pnode __read_mostly;
 
 static unsigned long		uv_mmask __read_mostly;
 
@@ -43,7 +45,7 @@ static int __init blade_to_first_node(int blade)
 		if (blade == b)
 			return node;
 	}
-	BUG();
+	return -1; /* shouldn't happen */
 }
 
 /*
@@ -359,7 +361,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode, &bau_desc->distribution);
+		bau_node_set(pnode - uv_partition_base_pnode,
+				&bau_desc->distribution);
 		i++;
 	}
 	if (i == 0) {
@@ -728,7 +731,12 @@ uv_activation_descriptor_init(int node, int pnode)
 	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
-		ad2->header.base_dest_nodeid = uv_cpu_to_pnode(0);
+		/*
+		 * base_dest_nodeid is the first node in the partition, so
+		 * the bit map will indicate partition-relative node numbers.
+		 * note that base_dest_nodeid is actually a nasid.
+		 */
+		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
 		ad2->header.command = UV_NET_ENDPOINT_INTD;
 		ad2->header.int_both = 1;
 		/*
@@ -825,6 +833,11 @@ static int __init uv_bau_init(void)
 	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
 	BUG_ON(!uv_bau_table_bases);
 
+	uv_partition_base_pnode = 0x7fffffff;
+	for (blade = 0; blade < nblades; blade++)
+		if (uv_blade_nr_possible_cpus(blade) &&
+			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(blade);
 	for (blade = 0; blade < nblades; blade++)
 		if (uv_blade_nr_possible_cpus(blade))
 			uv_init_blade(blade);
-- 
cgit v1.2.3-55-g7522


From 6f66cbc63081fd70e3191b4dbb796746780e5ae1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 14 Apr 2009 19:25:42 +0100
Subject: x86 microcode: revert some work_on_cpu

Revert part of af5c820a3169e81af869c113e18ec7588836cd50 ("x86: cpumask:
use work_on_cpu in arch/x86/kernel/microcode_core.c")

That change is causing only one Intel CPU's microcode to be updated e.g.
microcode: CPU3 updated from revision 0x9 to 0x17, date = 2005-04-22
where before it announced that also for CPU0 and CPU1 and CPU2.

We cannot use work_on_cpu() in the CONFIG_MICROCODE_OLD_INTERFACE code,
because Intel's request_microcode_user() involves a copy_from_user() from
/sbin/microcode_ctl, which therefore needs to be on that CPU at the time.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/microcode_core.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index a0f3851ef310..2e0eb4140951 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -108,40 +108,29 @@ struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
-struct update_for_cpu {
-	const void __user	*buf;
-	size_t			size;
-};
-
-static long update_for_cpu(void *_ufc)
-{
-	struct update_for_cpu *ufc = _ufc;
-	int error;
-
-	error = microcode_ops->request_microcode_user(smp_processor_id(),
-						      ufc->buf, ufc->size);
-	if (error < 0)
-		return error;
-	if (!error)
-		microcode_ops->apply_microcode(smp_processor_id());
-	return error;
-}
-
 static int do_microcode_update(const void __user *buf, size_t size)
 {
+	cpumask_t old;
 	int error = 0;
 	int cpu;
-	struct update_for_cpu ufc = { .buf = buf, .size = size };
+
+	old = current->cpus_allowed;
 
 	for_each_online_cpu(cpu) {
 		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 		if (!uci->valid)
 			continue;
-		error = work_on_cpu(cpu, update_for_cpu, &ufc);
+
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		error = microcode_ops->request_microcode_user(cpu, buf, size);
 		if (error < 0)
-			break;
+			goto out;
+		if (!error)
+			microcode_ops->apply_microcode(cpu);
 	}
+out:
+	set_cpus_allowed_ptr(current, &old);
 	return error;
 }
 
-- 
cgit v1.2.3-55-g7522


From ea34f43a074af85823e49b9bf62f47d8d3f0e81a Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 15 Apr 2009 08:05:13 -0700
Subject: acpi-cpufreq: fix 'smp_call_function_many()' confusion

It turns out that 'smp_call_function_many()' doesn't work at all like
'smp_call_function_single()', and my change to Andrew's patch to use it
rather than a loop over all CPU's acpi-cpufreq doesn't work.

My bad.

'smp_call_function_many()' has two "features" (aka "documented bugs"):

 (a) it needs to be called with preemption disabled, because it uses
     smp_processor_id() without guarding the CPU lookup with 'get_cpu()'
     and 'put_cpu()' like the 'single' variant does.

 (b) even if the current CPU is part of the CPU mask, it won't do the
     call on that CPU.

Still, we're better off trying to use 'smp_call_function_many()' than
looping over CPU's, since it at least in theory allows us to use a
broadcast IPI and do it all in parallel.  So let's just work around the
silly semantic bugs in that function.

Reported-and-tested-by: Ali Gholami Rudi <ali@rudi.ir>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>,
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 837c2c4cc203..ecdb682ab516 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -204,7 +204,13 @@ static void drv_read(struct drv_cmd *cmd)
 
 static void drv_write(struct drv_cmd *cmd)
 {
+	int this_cpu;
+
+	this_cpu = get_cpu();
+	if (cpumask_test_cpu(this_cpu, cmd->mask))
+		do_drv_write(cmd);
 	smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
+	put_cpu();
 }
 
 static u32 get_cur_val(const struct cpumask *mask)
-- 
cgit v1.2.3-55-g7522


From 4ea3c51d5bd3bb4eea7d7d3a1f80d1a48c2a6f92 Mon Sep 17 00:00:00 2001
From: Cliff Wickman
Date: Thu, 16 Apr 2009 07:53:09 -0500
Subject: x86: UV BAU distribution and payload MMRs

This patch correctly sets BAU memory mapped registers to point
to the sending activation descriptor table and target payload table.

The "Broadcast Assist Unit" is used for TLB shootdown in UV.

The memory mapped registers that point to sending and receiving
memory structures contain node numbers.

In one case the __pa() function did not provide the node id of
memory on blade zero in configurations where that id is nonzero.
In another case, it was assumed that memory was allocated on
the local node.  That assumption is not true in a configuration
in which the node has no memory.

Tested on the UV hardware simulator.

[ Impact: fix possible runtime crash due to incorrect TLB logic ]

Signed-off-by: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <E1LuR5Z-0007An-B8@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 98307f953492..78422336ddea 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -717,7 +717,7 @@ uv_activation_descriptor_init(int node, int pnode)
 	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
-	pa = __pa((unsigned long)adp);
+	pa = uv_gpa(adp); /* need the real nasid*/
 	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
@@ -754,6 +754,8 @@ static struct bau_payload_queue_entry * __init
 uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 {
 	struct bau_payload_queue_entry *pqp;
+	unsigned long pa;
+	int pn;
 	char *cp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
@@ -764,10 +766,14 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
 	bau_tablesp->va_queue_first = pqp;
+	/*
+	 * need the pnode of where the memory was really allocated
+	 */
+	pa = uv_gpa(pqp);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
-			      ((unsigned long)pnode <<
-			       UV_PAYLOADQ_PNODE_SHIFT) |
+			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-- 
cgit v1.2.3-55-g7522


From 0917798d82212f884fff650e7e520de3b438f947 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Wed, 15 Apr 2009 16:51:48 +0200
Subject: x86: fix microcode driver newly spewing warnings

Jeff Garzik reported this WARN_ON() noise:

> Kernel: 2.6.30-rc1-00306-g8371f87
> Hardware: ICH10 x86-64
>
> This is a regression from 2.6.29.  Microcode spews the following WARNING
> multiple times during boot:
>
> ------------[ cut here ]------------
> WARNING: at fs/sysfs/group.c:138 sysfs_remove_group+0xeb/0xf0()
> Hardware name:         sysfs group ffffffffa0209700 not found for
>  kobject 'cpu0'

Keep sysfs files around for cpus even when we failed to locate
microcode for them at the moment of module loading. The appropriate
microcode firmware can become available later on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index a0f3851ef310..4d420de9ac61 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -391,8 +391,6 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
 		return err;
 
 	err = microcode_init_cpu(cpu);
-	if (err)
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
 
 	return err;
 }
-- 
cgit v1.2.3-55-g7522


From 27229ca63269c1be0a710f074fa9de4024f283f1 Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Fri, 17 Apr 2009 09:24:47 -0500
Subject: x86/uv: fix init of cpu-less nodes

Fix an endcase in the UV initialization code for the "UV large system mode"
of apicids.  If node zero contains no cpus, cpus on another node will be the
boot cpu.  The percpu data that contains the extra apicid bits was not
being initialized early enough.

[ Impact: fix potential boot crash on cpu-less UV nodes ]

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090417142447.GA23759@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1248318436e8..49d2f5c739b7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -19,6 +19,7 @@
 #include <linux/timer.h>
 #include <linux/cpu.h>
 #include <linux/init.h>
+#include <linux/io.h>
 
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
@@ -34,6 +35,17 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 static enum uv_system_type uv_system_type;
 
+static int early_get_nodeid(void)
+{
+	union uvh_node_id_u node_id;
+	unsigned long *mmr;
+
+	mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
+	node_id.v = *mmr;
+	early_iounmap(mmr, sizeof(*mmr));
+	return node_id.s.node_id;
+}
+
 static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
@@ -42,6 +54,8 @@ static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		else if (!strcmp(oem_table_id, "UVX"))
 			uv_system_type = UV_X2APIC;
 		else if (!strcmp(oem_table_id, "UVH")) {
+			__get_cpu_var(x2apic_extra_bits) =
+				early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1);
 			uv_system_type = UV_NON_UNIQUE_APIC;
 			return 1;
 		}
-- 
cgit v1.2.3-55-g7522


From 0300e7f1a525ae4e4ac05344624adf0e5f13ea52 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 17 Apr 2009 08:33:52 -0400
Subject: lockdep, x86: account for irqs enabled in paranoid_exit

I hit the check_flags error of lockdep:

 WARNING: at kernel/lockdep.c:2893 check_flags+0x1a7/0x1d0()
 [...]
 hardirqs last  enabled at (12567): [<ffffffff8026206a>] local_bh_enable+0xaa/0x110
 hardirqs last disabled at (12569): [<ffffffff80610c76>] int3+0x16/0x40
 softirqs last  enabled at (12566): [<ffffffff80514d2b>] lock_sock_nested+0xfb/0x110
 softirqs last disabled at (12568): [<ffffffff8058454e>] tcp_prequeue_process+0x2e/0xa0

The check_flags warning of lockdep tells me that lockdep thought interrupts
were disabled, but they were really enabled.

The numbers in the above parenthesis show the order of events:

 12566: softirqs last enabled:  lock_sock_nested
 12567: hardirqs last enabled:  local_bh_enable
 12568: softirqs last disabled: tcp_prequeue_process
 12566: hardirqs last disabled: int3

int3 is a breakpoint!

Examining this further, I have CONFIG_NET_TCPPROBE enabled which adds
break points into the kernel.

The paranoid_exit of the return of int3 does not account for enabling
interrupts on return to kernel. This code is a bit tricky since it
is also used by the nmi handler (when lockdep is off), and we must be
careful about the swapgs. We can not call kernel code after the swapgs
has been performed.

[ Impact: fix lockdep check_flags warning + self-turn-off ]

Acked-by: Peter Zijlsta <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec38af9e..38946c6e8433 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1410,7 +1410,10 @@ ENTRY(paranoid_exit)
 paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
+	RESTORE_ALL 8
+	jmp irq_return
 paranoid_restore:
+	TRACE_IRQS_IRETQ 0
 	RESTORE_ALL 8
 	jmp irq_return
 paranoid_userspace:
-- 
cgit v1.2.3-55-g7522


From 093f13e23137b9e5f7629dd5932ceea1419e2b61 Mon Sep 17 00:00:00 2001
From: Pallipadi, Venkatesh
Date: Wed, 15 Apr 2009 10:37:33 -0700
Subject: x86, acpi_cpufreq: Fix the NULL pointer dereference in
 get_measured_perf

Fix for a regression that was introduced by earlier commit
18b2646fe3babeb40b34a0c1751e0bf5adfdc64c on Mon Apr 6 11:26:08 2009

Regression resulted in the below error happened on systems with
software coordination where per_cpu acpi data will not be initiated for
secondary CPUs in a P-state domain.

On Tue, 2009-04-14 at 23:01 -0700, Zhang, Yanmin wrote:
 My machine hanged with kernel 2.6.30-rc2 when script read
> /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor.
>
> opps happens in get_measured_perf:
>
>         cur.aperf.whole = readin.aperf.whole -
>                                 per_cpu(drv_data, cpu)->saved_aperf;
>
> Because per_cpu(drv_data, cpu)=NULL.
>
> So function get_measured_perf should check if (per_cpu(drv_data,
> cpu)==NULL)
> and return 0 if it's NULL.

--------------sys log------------------

BUG: unable to handle kernel NULL pointer dereference at
0000000000000020
IP: [<ffffffff8021af75>] get_measured_perf+0x4a/0xf9
PGD a7dd88067 PUD a7ccf5067 PMD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
CPU 0
Modules linked in: video output
Pid: 2091, comm: kondemand/0 Not tainted 2.6.30-rc2 #1 MP Server
RIP: 0010:[<ffffffff8021af75>]  [<ffffffff8021af75>]
get_measured_perf+0x4a/0xf9
RSP: 0018:ffff880a7d56de20  EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000046241a42b6 RCX: ffff88004d219000
RDX: 000000000000b660 RSI: 0000000000000020 RDI: 0000000000000001
RBP: ffff880a7f052000 R08: 00000046241a42b6 R09: ffffffff807639f0
R10: 00000000ffffffea R11: ffffffff802207f4 R12: ffff880a7f052000
R13: ffff88004d20e460 R14: 0000000000ddd5a6 R15: 0000000000000001
FS:  0000000000000000(0000) GS:ffff88004d200000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000020 CR3: 0000000a7f1bf000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process kondemand/0 (pid: 2091, threadinfo ffff880a7d56c000, task
ffff880a7d4d18c0)
Stack:
 ffff880a7f052078 ffffffff803efd54 00000046241a42b6 000000462ffa9e95
 0000000000000001 0000000000000001 00000000ffffffea ffffffff8064f41a
 0000000000000012 0000000000000012 ffff880a7f052000 ffffffff80650547
Call Trace:
 [<ffffffff803efd54>] ? kobject_get+0x12/0x17
 [<ffffffff8064f41a>] ? __cpufreq_driver_getavg+0x42/0x57
 [<ffffffff80650547>] ? do_dbs_timer+0x147/0x272
 [<ffffffff80650400>] ? do_dbs_timer+0x0/0x272
 [<ffffffff802474ca>] ? worker_thread+0x15b/0x1f5
 [<ffffffff8024a02c>] ? autoremove_wake_function+0x0/0x2e
 [<ffffffff8024736f>] ? worker_thread+0x0/0x1f5
 [<ffffffff80249f0d>] ? kthread+0x54/0x83
 [<ffffffff8020c87a>] ? child_rip+0xa/0x20
 [<ffffffff80249eb9>] ? kthread+0x0/0x83
 [<ffffffff8020c870>] ? child_rip+0x0/0x20
Code: 99 a6 03 00 31 c9 85 c0 0f 85 c3 00 00 00 89 df 4c 8b 44 24 10 48
c7 c2 60 b6 00 00 48 8b 0c fd e0 30 a5 80 4c 89 c3 48 8b 04 0a <48> 2b
58 20 48 8b 44 24 18 48 89 1c 24 48 8b 34 0a 48 2b 46 28
RIP  [<ffffffff8021af75>] get_measured_perf+0x4a/0xf9
 RSP <ffff880a7d56de20>
CR2: 0000000000000020
---[ end trace 2b8fac9a49e19ad4 ]---

Tested-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ecdb682ab516..dd0bd76d14c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -68,11 +68,16 @@ struct acpi_cpufreq_data {
 	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
-	u64 saved_aperf, saved_mperf;
 };
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+struct acpi_msr_data {
+	u64 saved_aperf, saved_mperf;
+};
+
+static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
+
 DEFINE_TRACE(power_mark);
 
 /* acpi_perf_data is a pointer to percpu data. */
@@ -287,11 +292,11 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 		return 0;
 
 	cur.aperf.whole = readin.aperf.whole -
-				per_cpu(drv_data, cpu)->saved_aperf;
+				per_cpu(msr_data, cpu).saved_aperf;
 	cur.mperf.whole = readin.mperf.whole -
-				per_cpu(drv_data, cpu)->saved_mperf;
-	per_cpu(drv_data, cpu)->saved_aperf = readin.aperf.whole;
-	per_cpu(drv_data, cpu)->saved_mperf = readin.mperf.whole;
+				per_cpu(msr_data, cpu).saved_mperf;
+	per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
+	per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
 
 #ifdef __i386__
 	/*
-- 
cgit v1.2.3-55-g7522


From e0e8c4e512e92bc25c19bd8d4926de17d2f8fbf2 Mon Sep 17 00:00:00 2001
From: Thomas Renninger
Date: Fri, 17 Apr 2009 16:22:06 +0200
Subject: acpi-cpufreq: Cleanup: Use printk_once

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd0bd76d14c0..4eab747a8966 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -693,13 +693,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	/* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
 	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
 	    policy->cpuinfo.transition_latency > 20 * 1000) {
-		static int print_once;
 		policy->cpuinfo.transition_latency = 20 * 1000;
-		if (!print_once) {
-			print_once = 1;
-			printk(KERN_INFO "Capping off P-state tranision latency"
-				" at 20 uS\n");
-		}
+			printk_once(KERN_INFO "Capping off P-state tranision"
+				    " latency at 20 uS\n");
 	}
 
 	data->max_freq = perf->states[0].core_frequency * 1000;
-- 
cgit v1.2.3-55-g7522


From d91758f5ddb80e91176fa2cf80c88c1633950b3d Mon Sep 17 00:00:00 2001
From: Thomas Renninger
Date: Fri, 17 Apr 2009 16:22:07 +0200
Subject: acpi-cpufreq: style-only: add parens to math expression

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4eab747a8966..aec3161abede 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -340,7 +340,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 
 #endif
 
-	retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
+	retval = (per_cpu(drv_data, policy->cpu)->max_freq * perf_percent) / 100;
 
 	return retval;
 }
-- 
cgit v1.2.3-55-g7522


From d876dfbbf5c8728102fb4f683450fa9ae3259cda Mon Sep 17 00:00:00 2001
From: Thomas Renninger
Date: Fri, 17 Apr 2009 16:22:08 +0200
Subject: acpi-cpufreq: Do not let get_measured perf depend on internal
 variable

Take already available policy->cpuinfo.max_freq and get rid of acpi-cpufreq
specific max_freq variable.

This implies that P0 is always the highest frequency which should always
be true as ACPI spec says:
As a result, the zeroth entry describes the highest performance state

Signed-off-by: Thomas Renninger <trenn@suse.de>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index aec3161abede..208ecf6643df 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -65,7 +65,6 @@ enum {
 struct acpi_cpufreq_data {
 	struct acpi_processor_performance *acpi_data;
 	struct cpufreq_frequency_table *freq_table;
-	unsigned int max_freq;
 	unsigned int resume;
 	unsigned int cpu_feature;
 };
@@ -340,7 +339,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 
 #endif
 
-	retval = (per_cpu(drv_data, policy->cpu)->max_freq * perf_percent) / 100;
+	retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
 
 	return retval;
 }
@@ -698,7 +697,6 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 				    " latency at 20 uS\n");
 	}
 
-	data->max_freq = perf->states[0].core_frequency * 1000;
 	/* table init */
 	for (i = 0; i < perf->state_count; i++) {
 		if (i > 0 && perf->states[i].core_frequency >=
@@ -717,6 +715,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (result)
 		goto err_freqfree;
 
+	if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
+		printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
+
 	switch (perf->control_register.space_id) {
 	case ACPI_ADR_SPACE_SYSTEM_IO:
 		/* Current speed is unknown and not detectable by IO port */
-- 
cgit v1.2.3-55-g7522


From fc61e6636d13eb3a23eb29b4327eeee9de0ef3bc Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Mon, 20 Apr 2009 08:25:31 -0500
Subject: x86/uv: fix for no memory at paddr 0

Fix endcase where the memory at physical address 0 does not really
exist AND one of the sockets on blade 0 has no active cpus.

The memory that _appears_ to be at physical address 0 is actually
memory that located at a different address but has been remapped by
the chipset so that it appears to be at physical address 0.

When determining the UV pnode, the algorithm for determining the pnode
incorrectly used the relocated physical address instead of the actual
(global) address.

[ Impact: boot failure on partitioned systems ]

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090420132530.GA23156@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d6712334ce4b..2bda69352976 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -652,6 +652,7 @@ void __init uv_system_init(void)
 		if (uv_node_to_blade[nid] >= 0)
 			continue;
 		paddr = node_start_pfn(nid) << PAGE_SHIFT;
+		paddr = uv_soc_phys_ram_to_gpa(paddr);
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
-- 
cgit v1.2.3-55-g7522


From 06c38d5e36b12d040839ff224e805146c0368556 Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Thu, 9 Apr 2009 15:24:34 -0700
Subject: x86-64: fix FPU corruption with signals and preemption

In 64bit signal delivery path, clear_used_math() was happening before saving
the current active FPU state on to the user stack for signal handling. Between
clear_used_math() and the state store on to the user stack, potentially we
can get a page fault for the user address and can block. Infact, while testing
we were hitting the might_fault() in __clear_user() which can do a schedule().

At a later point in time, we will schedule back into this process and
resume the save state (using "xsave/fxsave" instruction) which can lead
to DNA fault. And as used_math was cleared before, we will reinit the FP state
in the DNA fault and continue. This reinit will result in loosing the
FPU state of the process.

Move clear_used_math() to a point after the FPU state has been stored
onto the user stack.

This issue is present from a long time (even before the xsave changes
and the x86 merge). But it can easily be exposed in 2.6.28.x and 2.6.29.x
series because of the __clear_user() in this path, which has an explicit
__cond_resched() leading to a context switch with CONFIG_PREEMPT_VOLUNTARY.

[ Impact: fix FPU state corruption ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: <stable@kernel.org>			[2.6.28.x, 2.6.29.x]
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/xsave.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0a5b04aa98f1..c5ee17e8c6d9 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -89,7 +89,7 @@ int save_i387_xstate(void __user *buf)
 
 	if (!used_math())
 		return 0;
-	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_USEDFPU) {
 		/*
 	 	 * Start with clearing the user buffer. This will present a
@@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf)
 			return -1;
 	}
 
+	clear_used_math(); /* trigger finit */
+
 	if (task_thread_info(tsk)->status & TS_XSAVE) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
-- 
cgit v1.2.3-55-g7522


From 2f537a9f8e82f55c241b002c8cfbf34303b45ada Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 21 Apr 2009 16:00:15 +0930
Subject: x86: fix boot crash in NMI watchdog with CONFIG_CPUMASK_OFFSTACK=y
 and flat APIC

fcef8576d8a64fc603e719c97d423f9f6d4e0e8b converted backtrace_mask to a
cpumask_var_t, and assumed check_nmi_watchdog was called before
nmi_watchdog_tick was ever called.  Steven's oops shows I was wrong.

This is something of a bandaid: I'm not sure we *should* be calling
nmi_watchdog_tick before check_nmi_watchdog.  Note that gcc eliminates
this test for the CONFIG_CPUMASK_OFFSTACK=n case.

[ Impact: fix boot crash in rare configs ]

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0904202113520.10097@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/nmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd62407152..2ba52f35a88c 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -414,7 +414,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 		touched = 1;
 	}
 
-	if (cpumask_test_cpu(cpu, backtrace_mask)) {
+	/* We can be called before check_nmi_watchdog, hence NULL check. */
+	if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
 		static DEFINE_SPINLOCK(lock);	/* Serialise the printks */
 
 		spin_lock(&lock);
-- 
cgit v1.2.3-55-g7522


From fcc5c4a2feea3886dc058498b28508b2731720d5 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 21 Apr 2009 16:03:41 +0930
Subject: x86: avoid theoretical spurious NMI backtraces with
 CONFIG_CPUMASK_OFFSTACK=y

In theory (though not shown in practice) alloc_cpumask_var() doesn't zero
memory, so CPUs might print an "NMI backtrace for cpu %d" once on boot.

(Bug introduced in fcef8576d8a64fc603e719c97d423f9f6d4e0e8b).

[ Impact: avoid theoretical syslog noise in rare configs ]

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <alpine.DEB.2.00.0904202113520.10097@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/nmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 2ba52f35a88c..ce4fbfa315a1 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -138,7 +138,7 @@ int __init check_nmi_watchdog(void)
 	if (!prev_nmi_count)
 		goto error;
 
-	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
+	alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-55-g7522


From 8e19608e8b5c001e4a66ce482edc474f05fb7355 Mon Sep 17 00:00:00 2001
From: Magnus Damm
Date: Tue, 21 Apr 2009 12:24:00 -0700
Subject: clocksource: pass clocksource to read() callback

Pass clocksource pointer to the read() callback for clocksources.  This
allows us to share the callback between multiple instances.

[hugh@veritas.com: fix powerpc build of clocksource pass clocksource mods]
[akpm@linux-foundation.org: cleanup]
Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-at91/at91rm9200_time.c         |  2 +-
 arch/arm/mach-at91/at91sam926x_time.c        |  2 +-
 arch/arm/mach-davinci/time.c                 |  2 +-
 arch/arm/mach-imx/time.c                     |  2 +-
 arch/arm/mach-ixp4xx/common.c                |  2 +-
 arch/arm/mach-msm/timer.c                    |  4 ++--
 arch/arm/mach-netx/time.c                    |  2 +-
 arch/arm/mach-ns9xxx/time-ns9360.c           |  2 +-
 arch/arm/mach-omap1/time.c                   |  2 +-
 arch/arm/mach-omap2/timer-gp.c               |  2 +-
 arch/arm/mach-pxa/time.c                     |  2 +-
 arch/arm/mach-realview/core.c                |  2 +-
 arch/arm/mach-versatile/core.c               |  2 +-
 arch/arm/plat-mxc/time.c                     |  2 +-
 arch/arm/plat-omap/common.c                  |  4 ++--
 arch/arm/plat-orion/time.c                   |  2 +-
 arch/avr32/kernel/time.c                     |  2 +-
 arch/blackfin/kernel/time-ts.c               | 12 ++++++------
 arch/ia64/kernel/cyclone.c                   |  2 +-
 arch/ia64/kernel/time.c                      |  4 ++--
 arch/ia64/sn/kernel/sn2/timer.c              |  2 +-
 arch/m68knommu/platform/68328/timers.c       |  2 +-
 arch/m68knommu/platform/coldfire/dma_timer.c |  2 +-
 arch/m68knommu/platform/coldfire/pit.c       |  2 +-
 arch/m68knommu/platform/coldfire/timers.c    |  2 +-
 arch/mips/kernel/cevt-txx9.c                 |  2 +-
 arch/mips/kernel/csrc-bcm1480.c              |  2 +-
 arch/mips/kernel/csrc-ioasic.c               |  6 +++---
 arch/mips/kernel/csrc-r4k.c                  |  2 +-
 arch/mips/kernel/csrc-sb1250.c               |  2 +-
 arch/mips/kernel/i8253.c                     |  2 +-
 arch/mips/nxp/pnx8550/common/time.c          |  2 +-
 arch/mips/sgi-ip27/ip27-timer.c              |  2 +-
 arch/powerpc/kernel/time.c                   |  8 ++++----
 arch/s390/kernel/time.c                      |  2 +-
 arch/sh/kernel/time_32.c                     |  2 +-
 arch/sh/kernel/timers/timer-tmu.c            |  2 +-
 arch/sparc/kernel/time_64.c                  |  7 ++++++-
 arch/um/kernel/time.c                        |  2 +-
 arch/x86/kernel/hpet.c                       |  6 +++---
 arch/x86/kernel/i8253.c                      |  2 +-
 arch/x86/kernel/kvmclock.c                   |  7 ++++++-
 arch/x86/kernel/tsc.c                        |  2 +-
 arch/x86/kernel/vmiclock_32.c                |  2 +-
 arch/x86/lguest/boot.c                       |  2 +-
 arch/x86/xen/time.c                          |  7 ++++++-
 drivers/char/hpet.c                          |  2 +-
 drivers/clocksource/acpi_pm.c                | 12 ++++++------
 drivers/clocksource/cyclone.c                |  2 +-
 drivers/clocksource/scx200_hrt.c             |  2 +-
 drivers/clocksource/tcb_clksrc.c             |  2 +-
 include/linux/clocksource.h                  |  6 +++---
 kernel/time/clocksource.c                    |  8 ++++----
 kernel/time/jiffies.c                        |  2 +-
 54 files changed, 94 insertions(+), 79 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c
index 1ff1bda0a894..309f3511aa20 100644
--- a/arch/arm/mach-at91/at91rm9200_time.c
+++ b/arch/arm/mach-at91/at91rm9200_time.c
@@ -85,7 +85,7 @@ static struct irqaction at91rm9200_timer_irq = {
 	.handler	= at91rm9200_timer_interrupt
 };
 
-static cycle_t read_clk32k(void)
+static cycle_t read_clk32k(struct clocksource *cs)
 {
 	return read_CRTR();
 }
diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c
index b63e1d5f1bad..4bd56aee4370 100644
--- a/arch/arm/mach-at91/at91sam926x_time.c
+++ b/arch/arm/mach-at91/at91sam926x_time.c
@@ -31,7 +31,7 @@ static u32 pit_cnt;		/* access only w/system irq blocked */
  * Clocksource:  just a monotonic counter of MCK/16 cycles.
  * We don't care whether or not PIT irqs are enabled.
  */
-static cycle_t read_pit_clk(void)
+static cycle_t read_pit_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 elapsed;
diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c
index f8bcd29d17a6..6c227d4ba998 100644
--- a/arch/arm/mach-davinci/time.c
+++ b/arch/arm/mach-davinci/time.c
@@ -238,7 +238,7 @@ static void __init timer_init(void)
 /*
  * clocksource
  */
-static cycle_t read_cycles(void)
+static cycle_t read_cycles(struct clocksource *cs)
 {
 	struct timer_s *t = &timers[TID_CLOCKSOURCE];
 
diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c
index aff0ebcfa847..5aef18b599e5 100644
--- a/arch/arm/mach-imx/time.c
+++ b/arch/arm/mach-imx/time.c
@@ -73,7 +73,7 @@ static void __init imx_timer_hardware_init(void)
 	IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_TEN;
 }
 
-cycle_t imx_get_cycles(void)
+cycle_t imx_get_cycles(struct clocksource *cs)
 {
 	return IMX_TCN(TIMER_BASE);
 }
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index f4656d2ac8a8..1e93dfee7543 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -401,7 +401,7 @@ void __init ixp4xx_sys_init(void)
 /*
  * clocksource
  */
-cycle_t ixp4xx_get_cycles(void)
+cycle_t ixp4xx_get_cycles(struct clocksource *cs)
 {
 	return *IXP4XX_OSTS;
 }
diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c
index 444d9c0f5ca6..4855b8ca5101 100644
--- a/arch/arm/mach-msm/timer.c
+++ b/arch/arm/mach-msm/timer.c
@@ -57,12 +57,12 @@ static irqreturn_t msm_timer_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static cycle_t msm_gpt_read(void)
+static cycle_t msm_gpt_read(struct clocksource *cs)
 {
 	return readl(MSM_GPT_BASE + TIMER_COUNT_VAL);
 }
 
-static cycle_t msm_dgt_read(void)
+static cycle_t msm_dgt_read(struct clocksource *cs)
 {
 	return readl(MSM_DGT_BASE + TIMER_COUNT_VAL) >> MSM_DGT_SHIFT;
 }
diff --git a/arch/arm/mach-netx/time.c b/arch/arm/mach-netx/time.c
index f201fddb594f..82801dbf0579 100644
--- a/arch/arm/mach-netx/time.c
+++ b/arch/arm/mach-netx/time.c
@@ -104,7 +104,7 @@ static struct irqaction netx_timer_irq = {
 	.handler	= netx_timer_interrupt,
 };
 
-cycle_t netx_get_cycles(void)
+cycle_t netx_get_cycles(struct clocksource *cs)
 {
 	return readl(NETX_GPIO_COUNTER_CURRENT(TIMER_CLOCKSOURCE));
 }
diff --git a/arch/arm/mach-ns9xxx/time-ns9360.c b/arch/arm/mach-ns9xxx/time-ns9360.c
index 41df69721769..77281260358a 100644
--- a/arch/arm/mach-ns9xxx/time-ns9360.c
+++ b/arch/arm/mach-ns9xxx/time-ns9360.c
@@ -25,7 +25,7 @@
 #define TIMER_CLOCKEVENT 1
 static u32 latch;
 
-static cycle_t ns9360_clocksource_read(void)
+static cycle_t ns9360_clocksource_read(struct clocksource *cs)
 {
 	return __raw_readl(SYS_TR(TIMER_CLOCKSOURCE));
 }
diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c
index 495a32c287b4..4d56408d3cff 100644
--- a/arch/arm/mach-omap1/time.c
+++ b/arch/arm/mach-omap1/time.c
@@ -198,7 +198,7 @@ static struct irqaction omap_mpu_timer2_irq = {
 	.handler	= omap_mpu_timer2_interrupt,
 };
 
-static cycle_t mpu_read(void)
+static cycle_t mpu_read(struct clocksource *cs)
 {
 	return ~omap_mpu_timer_read(1);
 }
diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c
index 9fc13a2cc3f4..1cb2c0909c2b 100644
--- a/arch/arm/mach-omap2/timer-gp.c
+++ b/arch/arm/mach-omap2/timer-gp.c
@@ -138,7 +138,7 @@ static inline void __init omap2_gp_clocksource_init(void) {}
  * clocksource
  */
 static struct omap_dm_timer *gpt_clocksource;
-static cycle_t clocksource_read_cycles(void)
+static cycle_t clocksource_read_cycles(struct clocksource *cs)
 {
 	return (cycle_t)omap_dm_timer_read_counter(gpt_clocksource);
 }
diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c
index 8eb3830fbb0b..750c448db672 100644
--- a/arch/arm/mach-pxa/time.c
+++ b/arch/arm/mach-pxa/time.c
@@ -125,7 +125,7 @@ static struct clock_event_device ckevt_pxa_osmr0 = {
 	.set_mode	= pxa_osmr0_set_mode,
 };
 
-static cycle_t pxa_read_oscr(void)
+static cycle_t pxa_read_oscr(struct clocksource *cs)
 {
 	return OSCR;
 }
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index 9ab947c14f26..942e1a7eb9b2 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -715,7 +715,7 @@ static struct irqaction realview_timer_irq = {
 	.handler	= realview_timer_interrupt,
 };
 
-static cycle_t realview_get_cycles(void)
+static cycle_t realview_get_cycles(struct clocksource *cs)
 {
 	return ~readl(timer3_va_base + TIMER_VALUE);
 }
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 565776680d8c..1f929c391af7 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -948,7 +948,7 @@ static struct irqaction versatile_timer_irq = {
 	.handler	= versatile_timer_interrupt,
 };
 
-static cycle_t versatile_get_cycles(void)
+static cycle_t versatile_get_cycles(struct clocksource *cs)
 {
 	return ~readl(TIMER3_VA_BASE + TIMER_VALUE);
 }
diff --git a/arch/arm/plat-mxc/time.c b/arch/arm/plat-mxc/time.c
index ef1b3cd85bd3..dab3357196fb 100644
--- a/arch/arm/plat-mxc/time.c
+++ b/arch/arm/plat-mxc/time.c
@@ -36,7 +36,7 @@ static enum clock_event_mode clockevent_mode = CLOCK_EVT_MODE_UNUSED;
 
 /* clock source */
 
-static cycle_t mxc_get_cycles(void)
+static cycle_t mxc_get_cycles(struct clocksource *cs)
 {
 	return __raw_readl(TIMER_BASE + MXC_TCN);
 }
diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c
index d1797147732f..433021f3d7cc 100644
--- a/arch/arm/plat-omap/common.c
+++ b/arch/arm/plat-omap/common.c
@@ -185,7 +185,7 @@ console_initcall(omap_add_serial_console);
 
 #include <linux/clocksource.h>
 
-static cycle_t omap_32k_read(void)
+static cycle_t omap_32k_read(struct clocksource *cs)
 {
 	return omap_readl(TIMER_32K_SYNCHRONIZED);
 }
@@ -207,7 +207,7 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long ret;
 
-	ret = (unsigned long long)omap_32k_read();
+	ret = (unsigned long long)omap_32k_read(&clocksource_32k);
 	ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift;
 	return ret;
 }
diff --git a/arch/arm/plat-orion/time.c b/arch/arm/plat-orion/time.c
index 6fa2923e6dca..2faf9dba4ef7 100644
--- a/arch/arm/plat-orion/time.c
+++ b/arch/arm/plat-orion/time.c
@@ -41,7 +41,7 @@ static u32 ticks_per_jiffy;
 /*
  * Clocksource handling.
  */
-static cycle_t orion_clksrc_read(void)
+static cycle_t orion_clksrc_read(struct clocksource *cs)
 {
 	return 0xffffffff - readl(TIMER0_VAL);
 }
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 0ff46bf873b0..f27aa3b259fa 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -18,7 +18,7 @@
 #include <mach/pm.h>
 
 
-static cycle_t read_cycle_count(void)
+static cycle_t read_cycle_count(struct clocksource *cs)
 {
 	return (cycle_t)sysreg_read(COUNT);
 }
diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c
index 0ed2badfd746..27646121280a 100644
--- a/arch/blackfin/kernel/time-ts.c
+++ b/arch/blackfin/kernel/time-ts.c
@@ -58,16 +58,11 @@ static inline unsigned long long cycles_2_ns(cycle_t cyc)
 	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
 }
 
-static cycle_t read_cycles(void)
+static cycle_t read_cycles(struct clocksource *cs)
 {
 	return __bfin_cycles_off + (get_cycles() << __bfin_cycles_mod);
 }
 
-unsigned long long sched_clock(void)
-{
-	return cycles_2_ns(read_cycles());
-}
-
 static struct clocksource clocksource_bfin = {
 	.name		= "bfin_cycles",
 	.rating		= 350,
@@ -77,6 +72,11 @@ static struct clocksource clocksource_bfin = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
+unsigned long long sched_clock(void)
+{
+	return cycles_2_ns(read_cycles(&clocksource_bfin));
+}
+
 static int __init bfin_clocksource_init(void)
 {
 	set_cyc2ns_scale(get_cclk() / 1000);
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 790ef0d87e12..71e35864d2e2 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -21,7 +21,7 @@ void __init cyclone_setup(void)
 
 static void __iomem *cyclone_mc;
 
-static cycle_t read_cyclone(void)
+static cycle_t read_cyclone(struct clocksource *cs)
 {
 	return (cycle_t)readq((void __iomem *)cyclone_mc);
 }
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 641c8b61c4f1..604c1a35db33 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -33,7 +33,7 @@
 
 #include "fsyscall_gtod_data.h"
 
-static cycle_t itc_get_cycles(void);
+static cycle_t itc_get_cycles(struct clocksource *cs);
 
 struct fsyscall_gtod_data_t fsyscall_gtod_data = {
 	.lock = SEQLOCK_UNLOCKED,
@@ -383,7 +383,7 @@ ia64_init_itm (void)
 	}
 }
 
-static cycle_t itc_get_cycles(void)
+static cycle_t itc_get_cycles(struct clocksource *cs)
 {
 	u64 lcycle, now, ret;
 
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index cf67fc562054..21d6f09e3447 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -23,7 +23,7 @@
 
 extern unsigned long sn_rtc_cycles_per_second;
 
-static cycle_t read_sn2(void)
+static cycle_t read_sn2(struct clocksource *cs)
 {
 	return (cycle_t)readq(RTC_COUNTER_ADDR);
 }
diff --git a/arch/m68knommu/platform/68328/timers.c b/arch/m68knommu/platform/68328/timers.c
index 6bafefa546e5..309f725995bf 100644
--- a/arch/m68knommu/platform/68328/timers.c
+++ b/arch/m68knommu/platform/68328/timers.c
@@ -75,7 +75,7 @@ static struct irqaction m68328_timer_irq = {
 
 /***************************************************************************/
 
-static cycle_t m68328_read_clk(void)
+static cycle_t m68328_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/m68knommu/platform/coldfire/dma_timer.c b/arch/m68knommu/platform/coldfire/dma_timer.c
index 772578b1084f..a5f562823d7a 100644
--- a/arch/m68knommu/platform/coldfire/dma_timer.c
+++ b/arch/m68knommu/platform/coldfire/dma_timer.c
@@ -34,7 +34,7 @@
 #define DMA_DTMR_CLK_DIV_16	(2 << 1)
 #define DMA_DTMR_ENABLE		(1 << 0)
 
-static cycle_t cf_dt_get_cycles(void)
+static cycle_t cf_dt_get_cycles(struct clocksource *cs)
 {
 	return __raw_readl(DTCN0);
 }
diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c
index 2a12e7fa9748..61b96211f8ff 100644
--- a/arch/m68knommu/platform/coldfire/pit.c
+++ b/arch/m68knommu/platform/coldfire/pit.c
@@ -125,7 +125,7 @@ static struct irqaction pit_irq = {
 
 /***************************************************************************/
 
-static cycle_t pit_read_clk(void)
+static cycle_t pit_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/m68knommu/platform/coldfire/timers.c b/arch/m68knommu/platform/coldfire/timers.c
index 454f25493491..1ba8a3731653 100644
--- a/arch/m68knommu/platform/coldfire/timers.c
+++ b/arch/m68knommu/platform/coldfire/timers.c
@@ -78,7 +78,7 @@ static struct irqaction mcftmr_timer_irq = {
 
 /***************************************************************************/
 
-static cycle_t mcftmr_read_clk(void)
+static cycle_t mcftmr_read_clk(struct clocksource *cs)
 {
 	unsigned long flags;
 	u32 cycles;
diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c
index eccf7d6096bd..2e911e3da8d3 100644
--- a/arch/mips/kernel/cevt-txx9.c
+++ b/arch/mips/kernel/cevt-txx9.c
@@ -22,7 +22,7 @@
 
 static struct txx9_tmr_reg __iomem *txx9_cs_tmrptr;
 
-static cycle_t txx9_cs_read(void)
+static cycle_t txx9_cs_read(struct clocksource *cs)
 {
 	return __raw_readl(&txx9_cs_tmrptr->trr);
 }
diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c
index 868745e7184b..51489f8a825e 100644
--- a/arch/mips/kernel/csrc-bcm1480.c
+++ b/arch/mips/kernel/csrc-bcm1480.c
@@ -28,7 +28,7 @@
 
 #include <asm/sibyte/sb1250.h>
 
-static cycle_t bcm1480_hpt_read(void)
+static cycle_t bcm1480_hpt_read(struct clocksource *cs)
 {
 	return (cycle_t) __raw_readq(IOADDR(A_SCD_ZBBUS_CYCLE_COUNT));
 }
diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c
index 1d5f63cf8997..b551f48d3a07 100644
--- a/arch/mips/kernel/csrc-ioasic.c
+++ b/arch/mips/kernel/csrc-ioasic.c
@@ -25,7 +25,7 @@
 #include <asm/dec/ioasic.h>
 #include <asm/dec/ioasic_addrs.h>
 
-static cycle_t dec_ioasic_hpt_read(void)
+static cycle_t dec_ioasic_hpt_read(struct clocksource *cs)
 {
 	return ioasic_read(IO_REG_FCTR);
 }
@@ -47,13 +47,13 @@ void __init dec_ioasic_clocksource_init(void)
 	while (!ds1287_timer_state())
 		;
 
-	start = dec_ioasic_hpt_read();
+	start = dec_ioasic_hpt_read(&clocksource_dec);
 
 	while (i--)
 		while (!ds1287_timer_state())
 			;
 
-	end = dec_ioasic_hpt_read();
+	end = dec_ioasic_hpt_read(&clocksource_dec);
 
 	freq = (end - start) * 10;
 	printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq);
diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c
index f1a2893931ed..e95a3cd48eea 100644
--- a/arch/mips/kernel/csrc-r4k.c
+++ b/arch/mips/kernel/csrc-r4k.c
@@ -10,7 +10,7 @@
 
 #include <asm/time.h>
 
-static cycle_t c0_hpt_read(void)
+static cycle_t c0_hpt_read(struct clocksource *cs)
 {
 	return read_c0_count();
 }
diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c
index 92212bbb8e45..d14d3d1907fa 100644
--- a/arch/mips/kernel/csrc-sb1250.c
+++ b/arch/mips/kernel/csrc-sb1250.c
@@ -33,7 +33,7 @@
  * The HPT is free running from SB1250_HPT_VALUE down to 0 then starts over
  * again.
  */
-static cycle_t sb1250_hpt_read(void)
+static cycle_t sb1250_hpt_read(struct clocksource *cs)
 {
 	unsigned int count;
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index 689719e34f08..ed20e7fe65e3 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -128,7 +128,7 @@ void __init setup_pit_timer(void)
  * to just read by itself. So use jiffies to emulate a free
  * running counter:
  */
-static cycle_t pit_read(void)
+static cycle_t pit_read(struct clocksource *cs)
 {
 	unsigned long flags;
 	int count;
diff --git a/arch/mips/nxp/pnx8550/common/time.c b/arch/mips/nxp/pnx8550/common/time.c
index cf293b279098..8df43e9e4d90 100644
--- a/arch/mips/nxp/pnx8550/common/time.c
+++ b/arch/mips/nxp/pnx8550/common/time.c
@@ -35,7 +35,7 @@
 
 static unsigned long cpj;
 
-static cycle_t hpt_read(void)
+static cycle_t hpt_read(struct clocksource *cs)
 {
 	return read_c0_count2();
 }
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index f024057a35f8..f10a7cd64f7e 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -159,7 +159,7 @@ static void __init hub_rt_clock_event_global_init(void)
 	setup_irq(irq, &hub_rt_irqaction);
 }
 
-static cycle_t hub_rt_read(void)
+static cycle_t hub_rt_read(struct clocksource *cs)
 {
 	return REMOTE_HUB_L(cputonasid(0), PI_RT_COUNT);
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 926ea864e34f..48571ac56fb7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -77,7 +77,7 @@
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
 
-static cycle_t rtc_read(void);
+static cycle_t rtc_read(struct clocksource *);
 static struct clocksource clocksource_rtc = {
 	.name         = "rtc",
 	.rating       = 400,
@@ -88,7 +88,7 @@ static struct clocksource clocksource_rtc = {
 	.read         = rtc_read,
 };
 
-static cycle_t timebase_read(void);
+static cycle_t timebase_read(struct clocksource *);
 static struct clocksource clocksource_timebase = {
 	.name         = "timebase",
 	.rating       = 400,
@@ -766,12 +766,12 @@ unsigned long read_persistent_clock(void)
 }
 
 /* clocksource code */
-static cycle_t rtc_read(void)
+static cycle_t rtc_read(struct clocksource *cs)
 {
 	return (cycle_t)get_rtc();
 }
 
-static cycle_t timebase_read(void)
+static cycle_t timebase_read(struct clocksource *cs)
 {
 	return (cycle_t)get_tb();
 }
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 6ded50dfa75a..ef596d020573 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -201,7 +201,7 @@ unsigned long read_persistent_clock(void)
 	return ts.tv_sec;
 }
 
-static cycle_t read_tod_clock(void)
+static cycle_t read_tod_clock(struct clocksource *cs)
 {
 	return get_clock();
 }
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c34e1e0f9b02..1700d2465f6c 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -208,7 +208,7 @@ unsigned long long sched_clock(void)
 	if (!clocksource_sh.rating)
 		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 
-	cycles = clocksource_sh.read();
+	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
 }
 #endif
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index c5d3396f5960..fe8d8930ccb6 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -81,7 +81,7 @@ static int tmu_timer_stop(void)
  */
 static int tmus_are_scaled;
 
-static cycle_t tmu_timer_read(void)
+static cycle_t tmu_timer_read(struct clocksource *cs)
 {
 	return ((cycle_t)(~_tmu_read(TMU1)))<<tmus_are_scaled;
 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index db310aa00183..5c12e79b4bdf 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -814,6 +814,11 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
+static cycle_t clocksource_tick_read(struct clocksource *cs)
+{
+	return tick_ops->get_tick();
+}
+
 void __init time_init(void)
 {
 	unsigned long freq = sparc64_init_timers();
@@ -827,7 +832,7 @@ void __init time_init(void)
 	clocksource_tick.mult =
 		clocksource_hz2mult(freq,
 				    clocksource_tick.shift);
-	clocksource_tick.read = tick_ops->get_tick;
+	clocksource_tick.read = clocksource_tick_read;
 
 	printk("clocksource: mult[%x] shift[%d]\n",
 	       clocksource_tick.mult, clocksource_tick.shift);
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index b13a87a3ec95..c8b9c469fcd7 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -65,7 +65,7 @@ static irqreturn_t um_timer(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static cycle_t itimer_read(void)
+static cycle_t itimer_read(struct clocksource *cs)
 {
 	return os_nsecs() / 1000;
 }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 648b3a2a3a44..3f0019e0a229 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -722,7 +722,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
 /*
  * Clock source related code
  */
-static cycle_t read_hpet(void)
+static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
@@ -756,7 +756,7 @@ static int hpet_clocksource_register(void)
 	hpet_restart_counter();
 
 	/* Verify whether hpet counter works */
-	t1 = read_hpet();
+	t1 = hpet_readl(HPET_COUNTER);
 	rdtscll(start);
 
 	/*
@@ -770,7 +770,7 @@ static int hpet_clocksource_register(void)
 		rdtscll(now);
 	} while ((now - start) < 200000UL);
 
-	if (t1 == read_hpet()) {
+	if (t1 == hpet_readl(HPET_COUNTER)) {
 		printk(KERN_WARNING
 		       "HPET counter not counting. HPET disabled\n");
 		return -ENODEV;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 3475440baa54..c2e0bb0890d4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -129,7 +129,7 @@ void __init setup_pit_timer(void)
  * to just read by itself. So use jiffies to emulate a free
  * running counter:
  */
-static cycle_t pit_read(void)
+static cycle_t pit_read(struct clocksource *cs)
 {
 	static int old_count;
 	static u32 old_jifs;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 137f2e8132df..223af43f1526 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -77,6 +77,11 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
+static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+{
+	return kvm_clock_read();
+}
+
 /*
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -107,7 +112,7 @@ static void kvm_get_preset_lpj(void)
 
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
-	.read = kvm_clock_read,
+	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
 	.mult = 1 << KVM_SCALE,
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a567ebe6361..d57de05dc430 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -699,7 +699,7 @@ static struct clocksource clocksource_tsc;
  * code, which is necessary to support wrapping clocksources like pm
  * timer.
  */
-static cycle_t read_tsc(void)
+static cycle_t read_tsc(struct clocksource *cs)
 {
 	cycle_t ret = (cycle_t)get_cycles();
 
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index d303369a7bad..2b3eb82efeeb 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -283,7 +283,7 @@ void __devinit vmi_time_ap_init(void)
 /** vmi clocksource */
 static struct clocksource clocksource_vmi;
 
-static cycle_t read_real_cycles(void)
+static cycle_t read_real_cycles(struct clocksource *cs)
 {
 	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
 	return max(ret, clocksource_vmi.cycle_last);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a2085368a3dc..ca7ec44bafc3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -663,7 +663,7 @@ static unsigned long lguest_tsc_khz(void)
 
 /* If we can't use the TSC, the kernel falls back to our lower-priority
  * "lguest_clock", where we read the time value given to us by the Host. */
-static cycle_t lguest_clock_read(void)
+static cycle_t lguest_clock_read(struct clocksource *cs)
 {
 	unsigned long sec, nsec;
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 14f240623497..0a5aa44299a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -213,6 +213,11 @@ cycle_t xen_clocksource_read(void)
 	return ret;
 }
 
+static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
+{
+	return xen_clocksource_read();
+}
+
 static void xen_read_wallclock(struct timespec *ts)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
@@ -241,7 +246,7 @@ int xen_set_wallclock(unsigned long now)
 static struct clocksource xen_clocksource __read_mostly = {
 	.name = "xen",
 	.rating = 400,
-	.read = xen_clocksource_read,
+	.read = xen_clocksource_get_cycles,
 	.mask = ~0,
 	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
 	.shift = XEN_SHIFT,
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 50dfa3bc71ce..340ba4f9dc54 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -72,7 +72,7 @@ static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 #ifdef CONFIG_IA64
 static void __iomem *hpet_mctr;
 
-static cycle_t read_hpet(void)
+static cycle_t read_hpet(struct clocksource *cs)
 {
 	return (cycle_t)read_counter((void __iomem *)hpet_mctr);
 }
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index ee19b6e8fcb4..40bd8c61c7d7 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -57,7 +57,7 @@ u32 acpi_pm_read_verified(void)
 	return v2;
 }
 
-static cycle_t acpi_pm_read(void)
+static cycle_t acpi_pm_read(struct clocksource *cs)
 {
 	return (cycle_t)read_pmtmr();
 }
@@ -83,7 +83,7 @@ static int __init acpi_pm_good_setup(char *__str)
 }
 __setup("acpi_pm_good", acpi_pm_good_setup);
 
-static cycle_t acpi_pm_read_slow(void)
+static cycle_t acpi_pm_read_slow(struct clocksource *cs)
 {
 	return (cycle_t)acpi_pm_read_verified();
 }
@@ -156,9 +156,9 @@ static int verify_pmtmr_rate(void)
 	unsigned long count, delta;
 
 	mach_prepare_counter();
-	value1 = clocksource_acpi_pm.read();
+	value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 	mach_countup(&count);
-	value2 = clocksource_acpi_pm.read();
+	value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 	delta = (value2 - value1) & ACPI_PM_MASK;
 
 	/* Check that the PMTMR delta is within 5% of what we expect */
@@ -195,9 +195,9 @@ static int __init init_acpi_pm_clocksource(void)
 	/* "verify" this timing source: */
 	for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
 		udelay(100 * j);
-		value1 = clocksource_acpi_pm.read();
+		value1 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 		for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {
-			value2 = clocksource_acpi_pm.read();
+			value2 = clocksource_acpi_pm.read(&clocksource_acpi_pm);
 			if (value2 == value1)
 				continue;
 			if (value2 > value1)
diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c
index 8615059a8729..64e528e8bfa6 100644
--- a/drivers/clocksource/cyclone.c
+++ b/drivers/clocksource/cyclone.c
@@ -19,7 +19,7 @@
 int use_cyclone = 0;
 static void __iomem *cyclone_ptr;
 
-static cycle_t read_cyclone(void)
+static cycle_t read_cyclone(struct clocksource *cs)
 {
 	return (cycle_t)readl(cyclone_ptr);
 }
diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c
index b92da677aa5d..27f4d9637b62 100644
--- a/drivers/clocksource/scx200_hrt.c
+++ b/drivers/clocksource/scx200_hrt.c
@@ -43,7 +43,7 @@ MODULE_PARM_DESC(ppm, "+-adjust to actual XO freq (ppm)");
 /* The base timer frequency, * 27 if selected */
 #define HRT_FREQ   1000000
 
-static cycle_t read_hrt(void)
+static cycle_t read_hrt(struct clocksource *cs)
 {
 	/* Read the timer value */
 	return (cycle_t) inl(scx200_cb_base + SCx200_TIMER_OFFSET);
diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
index 254f1064d973..01b886e68822 100644
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -39,7 +39,7 @@
 
 static void __iomem *tcaddr;
 
-static cycle_t tc_get_cycles(void)
+static cycle_t tc_get_cycles(struct clocksource *cs)
 {
 	unsigned long	flags;
 	u32		lower, upper;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 573819ef4cc0..0d96cde9ee5d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -143,7 +143,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  *			400-499: Perfect
  *				The ideal clocksource. A must-use where
  *				available.
- * @read:		returns a cycle value
+ * @read:		returns a cycle value, passes clocksource as argument
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
  * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
@@ -162,7 +162,7 @@ struct clocksource {
 	char *name;
 	struct list_head list;
 	int rating;
-	cycle_t (*read)(void);
+	cycle_t (*read)(struct clocksource *cs);
 	cycle_t mask;
 	u32 mult;
 	u32 mult_orig;
@@ -271,7 +271,7 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
  */
 static inline cycle_t clocksource_read(struct clocksource *cs)
 {
-	return cs->read();
+	return cs->read(cs);
 }
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c46c931a7fe7..ecfd7b5187e0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
 
 	resumed = test_and_clear_bit(0, &watchdog_resumed);
 
-	wdnow = watchdog->read();
+	wdnow = watchdog->read(watchdog);
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
 	watchdog_last = wdnow;
 
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
-		csnow = cs->read();
+		csnow = cs->read(cs);
 
 		if (unlikely(resumed)) {
 			cs->wd_last = csnow;
@@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 
 		list_add(&cs->wd_list, &watchdog_list);
 		if (!started && watchdog) {
-			watchdog_last = watchdog->read();
+			watchdog_last = watchdog->read(watchdog);
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 			add_timer_on(&watchdog_timer,
 				     cpumask_first(cpu_online_mask));
@@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
 			/* Start if list is not empty */
 			if (!list_empty(&watchdog_list)) {
-				watchdog_last = watchdog->read();
+				watchdog_last = watchdog->read(watchdog);
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
 				add_timer_on(&watchdog_timer,
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f197560f3b..c3f6c30816e3 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
  */
 #define JIFFIES_SHIFT	8
 
-static cycle_t jiffies_read(void)
+static cycle_t jiffies_read(struct clocksource *cs)
 {
 	return (cycle_t) jiffies;
 }
-- 
cgit v1.2.3-55-g7522


From 7a6f9cbb37120c745fc187083fb5c3de4dca4f97 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Tue, 21 Apr 2009 20:00:37 +0200
Subject: x86: hpet: fix periodic mode programming on AMD 81xx

(See http://bugzilla.kernel.org/show_bug.cgi?id=12961)

It partially reverts commit c23e253e67c9d8a91a0ffa33c1f571a17f0a2403
(x86: hpet: stop HPET_COUNTER when programming periodic mode)

HPET on AMD 81xx chipset needs a second write (with HPET_TN_SETVAL
cleared) to T0_CMP register to set the period in periodic mode.

With this patch HPET_COUNTER is still stopped but not reset when HPET
is programmed in periodic mode. This should help to avoid races when
HPET is programmed in periodic mode and fixes a boot time hang that
I've observed on a machine when using 1000HZ.

[ Impact: fix boot time hang on machines with AMD 81xx chipset ]

Reported-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Tested-by: Jeff Mahoney <jeffm@suse.com>
LKML-Reference: <20090421180037.GA2763@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hpet.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3f0019e0a229..81408b93f887 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -236,6 +236,10 @@ static void hpet_stop_counter(void)
 	unsigned long cfg = hpet_readl(HPET_CFG);
 	cfg &= ~HPET_CFG_ENABLE;
 	hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_reset_counter(void)
+{
 	hpet_writel(0, HPET_COUNTER);
 	hpet_writel(0, HPET_COUNTER + 4);
 }
@@ -250,6 +254,7 @@ static void hpet_start_counter(void)
 static void hpet_restart_counter(void)
 {
 	hpet_stop_counter();
+	hpet_reset_counter();
 	hpet_start_counter();
 }
 
@@ -309,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
-	unsigned long cfg;
+	unsigned long cfg, cmp, now;
 	uint64_t delta;
 
 	switch (mode) {
@@ -317,12 +322,23 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		hpet_stop_counter();
 		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
 		delta >>= evt->shift;
+		now = hpet_readl(HPET_COUNTER);
+		cmp = now + (unsigned long) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		/* Make sure we use edge triggered interrupts */
 		cfg &= ~HPET_TN_LEVEL;
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
 		hpet_writel(cfg, HPET_Tn_CFG(timer));
+		hpet_writel(cmp, HPET_Tn_CMP(timer));
+		udelay(1);
+		/*
+		 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+		 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+		 * bit is automatically cleared after the first write.
+		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+		 * Publication # 24674)
+		 */
 		hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
 		hpet_start_counter();
 		hpet_print_config();
-- 
cgit v1.2.3-55-g7522


From c5428e950ad42640f00092949fd17e356dfdeafa Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Wed, 22 Apr 2009 23:21:56 +0800
Subject: uv_time: add parameter to uv_read_rtc()

uv_read_rtc() is referenced by read member of struct clocksource clocksource_uv.
In include/linux/clocksource.h, read of struct clocksource is declared as:
cycle_t (*read)(struct clocksource *cs)

This got introduced recently in:

   8e19608: clocksource: pass clocksource to read() callback

But arch/x86/kernel/uv_time.c was not properly converted by that pach.

This patch adds a dummy parameter (struct clocksource type) to uv_read_rtc() to
fix the incompatible reference in clocksource_uv, and add a NULL parameter in
all places where uv_read_rtc() gets called.

[ Impact: cleanup, address compiler warning ]

Signed-off-by: Coly Li <coly.li@suse.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Magnus Damm <damm@igel.co.jp>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hugh@veritas.com>
LKML-Reference: <49EF3614.1050806@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 arch/x86/kernel/uv_time.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2ffb6c53326e..583f11d5c480 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -29,7 +29,7 @@
 
 #define RTC_NAME		"sgi_rtc"
 
-static cycle_t uv_read_rtc(void);
+static cycle_t uv_read_rtc(struct clocksource *cs);
 static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
 static void uv_rtc_timer_setup(enum clock_event_mode,
 				struct clock_event_device *);
@@ -123,7 +123,7 @@ static int uv_setup_intr(int cpu, u64 expires)
 	/* Initialize comparator value */
 	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
 
-	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+	return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
 }
 
 /*
@@ -256,7 +256,7 @@ static int uv_rtc_unset_timer(int cpu)
 
 	spin_lock_irqsave(&head->lock, flags);
 
-	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+	if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
 		rc = 1;
 
 	*t = ULLONG_MAX;
@@ -278,7 +278,7 @@ static int uv_rtc_unset_timer(int cpu)
 /*
  * Read the RTC.
  */
-static cycle_t uv_read_rtc(void)
+static cycle_t uv_read_rtc(struct clocksource *cs)
 {
 	return (cycle_t)uv_read_local_mmr(UVH_RTC);
 }
@@ -291,7 +291,7 @@ static int uv_rtc_next_event(unsigned long delta,
 {
 	int ced_cpu = cpumask_first(ced->cpumask);
 
-	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From 6298c512bc1007c3ff5c9ce20e6996781651cc45 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 9 Apr 2009 12:28:22 +0200
Subject: x86, mce: make polling timer interval per CPU

The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.   
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.

v2: Fix check_interval == 0 case (Hidetoshi Seto)

[ Impact: minor bug fix ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 863f89568b1a..82614f1b923a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -452,13 +452,14 @@ void mce_log_therm_throt_event(__u64 status)
  */
 
 static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 static void mcheck_timer(unsigned long);
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
 static void mcheck_timer(unsigned long data)
 {
 	struct timer_list *t = &per_cpu(mce_timer, data);
+	int *n;
 
 	WARN_ON(smp_processor_id() != data);
 
@@ -470,14 +471,14 @@ static void mcheck_timer(unsigned long data)
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
+	n = &__get_cpu_var(next_interval);
 	if (mce_notify_user()) {
-		next_interval = max(next_interval/2, HZ/100);
+		*n = max(*n/2, HZ/100);
 	} else {
-		next_interval = min(next_interval * 2,
-				(int)round_jiffies_relative(check_interval*HZ));
+		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	t->expires = jiffies + next_interval;
+	t->expires = jiffies + *n;
 	add_timer(t);
 }
 
@@ -632,14 +633,13 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 static void mce_init_timer(void)
 {
 	struct timer_list *t = &__get_cpu_var(mce_timer);
+	int *n = &__get_cpu_var(next_interval);
 
-	/* data race harmless because everyone sets to the same value */
-	if (!next_interval)
-		next_interval = check_interval * HZ;
-	if (!next_interval)
+	*n = check_interval * HZ;
+	if (!*n)
 		return;
 	setup_timer(t, mcheck_timer, smp_processor_id());
-	t->expires = round_jiffies(jiffies + next_interval);
+	t->expires = round_jiffies(jiffies + *n);
 	add_timer(t);
 }
 
@@ -907,7 +907,6 @@ static void mce_cpu_restart(void *data)
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	next_interval = check_interval * HZ;
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
@@ -1110,7 +1109,8 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		t->expires = round_jiffies(jiffies + next_interval);
+		t->expires = round_jiffies(jiffies +
+						__get_cpu_var(next_interval));
 		add_timer_on(t, cpu);
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
 		break;
-- 
cgit v1.2.3-55-g7522


From 5679af4c1625a1534a4321e1ecc3c48a1cf65eb8 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 7 Apr 2009 17:06:55 +0200
Subject: x86, mce: fix boot logging logic

The earlier patch to change the poller to a separate function subtly
broke the boot logging logic. This could lead to machine checks
getting logged at boot even when disabled or defaulting to off
on some systems. Fix that.

[ Impact: bug fix - avoid spurious MCE in log ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/mce.h          | 1 +
 arch/x86/kernel/cpu/mcheck/mce_64.c | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 563933e06a35..4f8c199584e7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -137,6 +137,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 enum mcp_flags {
 	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
 	MCP_UC = (1 << 1),		/* log uncorrected errors */
+	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
 };
 extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 82614f1b923a..6fb0b359d2a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -239,9 +239,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-
-		mce_log(&m);
-		add_taint(TAINT_MACHINE_CHECK);
+		if (!(flags & MCP_DONTLOG)) {
+			mce_log(&m);
+			add_taint(TAINT_MACHINE_CHECK);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -585,7 +586,7 @@ static void mce_init(void *dummy)
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC, &all_banks);
+	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
-- 
cgit v1.2.3-55-g7522


From d2c8604121648b744ebb127991f1c5876931885e Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Thu, 23 Apr 2009 19:19:42 -0400
Subject: x86, hpet: Stop soliciting hpet=force users on ICH4M

The HPET in the ICH4M is not documented in the data sheet
because it was not officially validated.

While it is fine for hackers to continue to use "hpet=force"
to enable the hardware that they have, it is not prudent to
solicit additional "hpet=force" users on this hardware.

[ Impact: remove hpet=force syslog message on old-ICH systems ]

Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <alpine.LFD.2.00.0904231918510.15843@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/quirks.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e4f5d5..7563b31b4f03 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -261,8 +261,6 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
 {
 	if (hpet_force_user)
 		old_ich_force_enable_hpet(dev);
-	else
-		hpet_print_force_info();
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
-- 
cgit v1.2.3-55-g7522


From f9a196b8dceba3c1e5fe885b81e45043ad7c60fc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 May 2009 20:59:25 +0200
Subject: x86: initialize io_bitmap_base on 32bit

commit db949bba3c7cf2e664ac12e237c6d4c914f0c69d (x86-32: use non-lazy
io bitmap context switching) broke ioperm for 32bit because it removed
the lazy initialization of io_bitmap_base and did not set it to the
real bitmap offset.

[ Impact: fix non-working sys_ioperm() on 32-bit kernels ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/common.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..c1caefc82e62 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1203,6 +1203,8 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
 #ifdef CONFIG_DOUBLEFAULT
 	/* Set up doublefault TSS pointer in the GDT */
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-- 
cgit v1.2.3-55-g7522


From 6da7342ff1c5274c51ada084974668d10f769c16 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 4 May 2009 11:44:38 +0200
Subject: amd-iommu: fix iommu flag masks

The feature bits should be set via bitmasks, not via feature IDs.

[ Impact: fix feature enabling in newer IOMMU versions ]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20090504102028.GA30307@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_iommu_init.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42c33cebf00f..8c0be0902dac 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -49,10 +49,10 @@
 #define IVHD_DEV_EXT_SELECT             0x46
 #define IVHD_DEV_EXT_SELECT_RANGE       0x47
 
-#define IVHD_FLAG_HT_TUN_EN             0x00
-#define IVHD_FLAG_PASSPW_EN             0x01
-#define IVHD_FLAG_RESPASSPW_EN          0x02
-#define IVHD_FLAG_ISOC_EN               0x03
+#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
+#define IVHD_FLAG_PASSPW_EN_MASK        0x02
+#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
+#define IVHD_FLAG_ISOC_EN_MASK          0x08
 
 #define IVMD_FLAG_EXCL_RANGE            0x08
 #define IVMD_FLAG_UNITY_MAP             0x01
@@ -569,19 +569,19 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	 * First set the recommended feature enable bits from ACPI
 	 * into the IOMMU control registers
 	 */
-	h->flags & IVHD_FLAG_HT_TUN_EN ?
+	h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
 		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
 
-	h->flags & IVHD_FLAG_PASSPW_EN ?
+	h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
 
-	h->flags & IVHD_FLAG_RESPASSPW_EN ?
+	h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
 		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
 
-	h->flags & IVHD_FLAG_ISOC_EN ?
+	h->flags & IVHD_FLAG_ISOC_EN_MASK ?
 		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
 		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
 
-- 
cgit v1.2.3-55-g7522


From 35d11680a9d82c93eb92f08f9702b72877427b4a Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Mon, 4 May 2009 20:28:59 +0200
Subject: x86: show number of core_siblings instead of thread_siblings in
 /proc/cpuinfo

Commit 7ad728f98162cb1af06a85b2a5fc422dddd4fb78
(cpumask: x86: convert cpu_sibling_map/cpu_core_map to cpumask_var_t)
changed the output of /proc/cpuinfo for siblings:

Example on an AMD Phenom:

  physical id   : 0
  siblings : 1
  core id	   : 3
  cpu cores  : 4

Before that commit it was:

  physical id	: 0
  siblings : 4
  core id	   : 3
  cpu cores  : 4

Instead of cpu_core_mask it now uses cpu_sibling_mask to count siblings.
This is due to the following hunk of above commit:

|  --- a/arch/x86/kernel/cpu/proc.c
|  +++ b/arch/x86/kernel/cpu/proc.c
|  @@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinf
|          if (c->x86_max_cores * smp_num_siblings > 1) {
|                  seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
|                  seq_printf(m, "siblings\t: %d\n",
|  -                          cpus_weight(per_cpu(cpu_core_map, cpu)));
|  +                          cpumask_weight(cpu_sibling_mask(cpu)));
|                  seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
|                  seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
|                  seq_printf(m, "apicid\t\t: %d\n", c->apicid);

This was a mistake, because the impact line shows that this side-effect
was not anticipated:

   Impact: reduce per-cpu size for CONFIG_CPUMASK_OFFSTACK=y

So revert the respective hunk to restore the old behavior.

[ Impact: fix sibling-info regression in /proc/cpuinfo ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090504182859.GA29045@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index f93047fed791..d5e30397246b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 	if (c->x86_max_cores * smp_num_siblings > 1) {
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n",
-			   cpumask_weight(cpu_sibling_mask(cpu)));
+			   cpumask_weight(cpu_core_mask(cpu)));
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 		seq_printf(m, "apicid\t\t: %d\n", c->apicid);
-- 
cgit v1.2.3-55-g7522


From 61438766514a2d7f191ce1b3cf6812eabbef4ef7 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 6 May 2009 13:02:19 +0100
Subject: x86: fix boot hang in early_reserve_e820()

If the first non-reserved (sub-)range doesn't fit the size requested,
an endless loop will be entered. If a range returned from
find_e820_area_size() turns out insufficient in size, the range must
be skipped before calling the function again.

[ Impact: fixes boot hang on some platforms ]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/e820.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ef2c3563357d..006281302925 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1074,12 +1074,13 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
 	u64 addr;
 	u64 start;
 
-	start = startt;
-	while (size < sizet && (start + 1))
+	for (start = startt; ; start += size) {
 		start = find_e820_area_size(start, &size, align);
-
-	if (size < sizet)
-		return 0;
+		if (!(start + 1))
+			return 0;
+		if (size >= sizet)
+			break;
+	}
 
 #ifdef CONFIG_X86_32
 	if (start >= MAXMEM)
-- 
cgit v1.2.3-55-g7522


From 6407df5ca54a511054200a1eb23f78f723ca1de4 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Fri, 8 May 2009 10:51:41 +0800
Subject: x86, kexec: fix crashdump panic with CONFIG_KEXEC_JUMP

Tim Starling reported that crashdump will panic with kernel compiled
with CONFIG_KEXEC_JUMP due to null pointer deference in
machine_kexec_32.c: machine_kexec(), when deferencing
kexec_image. Refering to:

http://bugzilla.kernel.org/show_bug.cgi?id=13265

This patch fixes the BUG via replacing global variable reference:
kexec_image in machine_kexec() with local variable reference: image,
which is more appropriate, and will not be null.

Same BUG is in machine_kexec_64.c too, so fixed too in the same way.

[ Impact: fix crash on kexec ]

Reported-by: Tim Starling <tstarling@wikimedia.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1241751101.6259.85.camel@yhuang-dev.sh.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/machine_kexec_32.c | 4 ++--
 arch/x86/kernel/machine_kexec_64.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index e7368c1da01d..c1c429d00130 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -194,7 +194,7 @@ void machine_kexec(struct kimage *image)
 				       unsigned int preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		save_processor_state();
 #endif
 
@@ -253,7 +253,7 @@ void machine_kexec(struct kimage *image)
 					   image->preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		restore_processor_state();
 #endif
 
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 89cea4d44679..84c3bf209e98 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -274,7 +274,7 @@ void machine_kexec(struct kimage *image)
 	int save_ftrace_enabled;
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		save_processor_state();
 #endif
 
@@ -333,7 +333,7 @@ void machine_kexec(struct kimage *image)
 				       image->preserve_context);
 
 #ifdef CONFIG_KEXEC_JUMP
-	if (kexec_image->preserve_context)
+	if (image->preserve_context)
 		restore_processor_state();
 #endif
 
-- 
cgit v1.2.3-55-g7522


From e5299926d7459d9fa7c7f856983147817aedb10e Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto
Date: Fri, 8 May 2009 17:28:40 +0900
Subject: x86: MCE: make cmci_discover_lock irq-safe

Lockdep reports the warning below when Li tries to offline one cpu:

[  110.835487] =================================
[  110.835616] [ INFO: inconsistent lock state ]
[  110.835688] 2.6.30-rc4-00336-g8c9ed89 #52
[  110.835757] ---------------------------------
[  110.835828] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
[  110.835908] swapper/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
[  110.835982]  (cmci_discover_lock){?.+...}, at: [<ffffffff80236dc0>] cmci_clear+0x30/0x9b

cmci_clear() can be called via smp_call_function_single().

It is better to disable interrupt while holding cmci_discover_lock,
to turn it into an irq-safe lock - we can deadlock otherwise.

[ Impact: fix possible deadlock in the MCE code ]

Reported-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A03ED38.8000700@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Shaohua Li<shaohua.li@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df89d69..cef3ee30744b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -151,10 +151,11 @@ static void print_update(char *type, int *hdr, int num)
 static void cmci_discover(int banks, int boot)
 {
 	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	unsigned long flags;
 	int hdr = 0;
 	int i;
 
-	spin_lock(&cmci_discover_lock);
+	spin_lock_irqsave(&cmci_discover_lock, flags);
 	for (i = 0; i < banks; i++) {
 		u64 val;
 
@@ -184,7 +185,7 @@ static void cmci_discover(int banks, int boot)
 			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
 		}
 	}
-	spin_unlock(&cmci_discover_lock);
+	spin_unlock_irqrestore(&cmci_discover_lock, flags);
 	if (hdr)
 		printk(KERN_CONT "\n");
 }
@@ -211,13 +212,14 @@ void cmci_recheck(void)
  */
 void cmci_clear(void)
 {
+	unsigned long flags;
 	int i;
 	int banks;
 	u64 val;
 
 	if (!cmci_supported(&banks))
 		return;
-	spin_lock(&cmci_discover_lock);
+	spin_lock_irqsave(&cmci_discover_lock, flags);
 	for (i = 0; i < banks; i++) {
 		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
 			continue;
@@ -227,7 +229,7 @@ void cmci_clear(void)
 		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
 		__clear_bit(i, __get_cpu_var(mce_banks_owned));
 	}
-	spin_unlock(&cmci_discover_lock);
+	spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From b74d446f1f337e3fe906169a3266cb65ffa4179e Mon Sep 17 00:00:00 2001
From: Sam Ravnborg
Date: Sat, 9 May 2009 15:35:10 +0600
Subject: x86: Fix false positive section mismatch warnings in the apic code

[ Impact: reduce kernel image size a bit, annotate away warnings ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
[ modified and tested it ]
Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
LKML-Reference: <b9df5fa10905090235s4bfd26a8o979f93809c9727ad@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/es7000_32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f245..302947775575 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr)
 }
 
 #ifdef CONFIG_ACPI
-static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
+static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
 {
 	struct acpi_table_header *header = NULL;
 	struct es7000_oem_table *table;
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
 	return 0;
 }
 
-static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
 {
 	if (!oem_addr)
 		return;
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void)
 static int es7000_acpi_ret;
 
 /* Hook from generic ACPI tables.c */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	unsigned long oem_addr = 0;
 	int check_dsdt;
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
 
-struct apic apic_es7000 = {
+struct apic __refdata apic_es7000 = {
 
 	.name				= "es7000",
 	.probe				= probe_es7000,
-- 
cgit v1.2.3-55-g7522


From 917a0153621572e88aeb2d5df025ad2e81027287 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Wed, 6 May 2009 21:36:16 -0700
Subject: x86: mtrr: Fix high_width computation when phys-addr is >= 44bit

found one system where cpu address line is 44bits, mtrr printout
is not right:

 [    0.000000] MTRR variable ranges enabled:
 [    0.000000]   0 base 0   00000000 mask FF0 00000000 write-back
 [    0.000000]   1 base 10  00000000 mask FFF 80000000 write-back
 [    0.000000]   2 base 0   80000000 mask FFF 80000000 uncachable
 [    0.000000]   3 base 0   7F800000 mask FFF FF800000 uncachable

Li Zefan and Frederic pointed out the high_width could be -4 some how.

It turns out when phys_addr is 44bit, size_or_mask will be
ffffffff,00000000 so ffs(size_or_mask) will be 0.

Try to check low 32 bit, to get correct high_width.

Signed-off-by: Yinghai Lu <yinghai@kerne.org>
Also-analyzed-by: Frederic Weisbecker <fweisbec@gmail.com>
Also-analyzed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A026540.8060504@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff3..d21d4fb161f7 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void)
 	}
 	printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
 	       mtrr_state.enabled & 2 ? "en" : "dis");
-	high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+	if (size_or_mask & 0xffffffffUL)
+		high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
+	else
+		high_width = ffs(size_or_mask>>32) + 32 - 1;
+	high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
 	for (i = 0; i < num_var_ranges; ++i) {
 		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
 			printk(KERN_DEBUG "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
-- 
cgit v1.2.3-55-g7522


From aa512a27e9e8ed32f31b15eec67ab1ceca33839b Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 13 May 2009 13:52:19 -0400
Subject: x86/function-graph: fix constraint for recording old return value

After upgrading from gcc 4.2.2 to 4.4.0, the function graph tracer broke.
Investigating, I found that in the asm that replaces the return value,
gcc was using the same register for the old value as it was for the
new value.

	mov	(addr), old
	mov	new, (addr)

But if old and new are the same register, we clobber new with old!
I first thought this was a bug in gcc 4.4.0 and reported it:

  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40132

Andrew Pinski responded (quickly), saying that it was correct gcc behavior
and the code needed to denote old as an "early clobber".

Instead of "=r"(old), we need "=&r"(old).

[Impact: keep function graph tracer from breaking with gcc 4.4.0 ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 18dfa30795c9..b79c5533c421 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		_ASM_EXTABLE(1b, 4b)
 		_ASM_EXTABLE(2b, 4b)
 
-		: [old] "=r" (old), [faulted] "=r" (faulted)
+		: [old] "=&r" (old), [faulted] "=r" (faulted)
 		: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
 		: "memory"
 	);
-- 
cgit v1.2.3-55-g7522


From 33ab1979bc9f719213bc3f392c8fd9d012e4f4e9 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Wed, 11 Feb 2009 18:46:32 -0600
Subject: kgdb,i386: use address that SP register points to in the exception
 frame

The treatment of the SP register is different on x86_64 and i386.
This is a regression fix that lived outside the mainline kernel from
2.6.27 to now.  The regression was a result of the original merge
consolidation of the i386 and x86_64 archs to x86.

The incorrectly reported SP on i386 prevented stack tracebacks from
working correctly in gdb.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index eedfaebe1063..b1f4dffb919e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -88,6 +88,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs[GDB_SS]	= __KERNEL_DS;
 	gdb_regs[GDB_FS]	= 0xFFFF;
 	gdb_regs[GDB_GS]	= 0xFFFF;
+	gdb_regs[GDB_SP]	= (int)&regs->sp;
 #else
 	gdb_regs[GDB_R8]	= regs->r8;
 	gdb_regs[GDB_R9]	= regs->r9;
@@ -100,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs32[GDB_PS]	= regs->flags;
 	gdb_regs32[GDB_CS]	= regs->cs;
 	gdb_regs32[GDB_SS]	= regs->ss;
-#endif
 	gdb_regs[GDB_SP]	= regs->sp;
+#endif
 }
 
 /**
-- 
cgit v1.2.3-55-g7522


From b4ecc126991b30fe5f9a59dfacda046aeac124b2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Wed, 13 May 2009 17:16:55 -0700
Subject: x86: Fix performance regression caused by paravirt_ops on native
 kernels

Xiaohui Xin and some other folks at Intel have been looking into what's
behind the performance hit of paravirt_ops when running native.

It appears that the hit is entirely due to the paravirtualized
spinlocks introduced by:

 | commit 8efcbab674de2bee45a2e4cdf97de16b8e609ac8
 | Date:   Mon Jul 7 12:07:51 2008 -0700
 |
 |     paravirt: introduce a "lock-byte" spinlock implementation

The extra call/return in the spinlock path is somehow
causing an increase in the cycles/instruction of somewhere around 2-7%
(seems to vary quite a lot from test to test).  The working theory is
that the CPU's pipeline is getting upset about the
call->call->locked-op->return->return, and seems to be failing to
speculate (though I haven't seen anything definitive about the precise
reasons).  This doesn't entirely make sense, because the performance
hit is also visible on unlock and other operations which don't involve
locked instructions.  But spinlock operations clearly swamp all the
other pvops operations, even though I can't imagine that they're
nearly as common (there's only a .05% increase in instructions
executed).

If I disable just the pv-spinlock calls, my tests show that pvops is
identical to non-pvops performance on native (my measurements show that
it is actually about .1% faster, but Xiaohui shows a .05% slowdown).

Summary of results, averaging 10 runs of the "mmperf" test, using a
no-pvops build as baseline:

		nopv		Pv-nospin	Pv-spin
CPU cycles	100.00%		99.89%		102.18%
instructions	100.00%		100.10%		100.15%
CPI		100.00%		99.79%		102.03%
cache ref	100.00%		100.84%		100.28%
cache miss	100.00%		90.47%		88.56%
cache miss rate	100.00%		89.72%		88.31%
branches	100.00%		99.93%		100.04%
branch miss	100.00%		103.66%		107.72%
branch miss rt	100.00%		103.73%		107.67%
wallclock	100.00%		99.90%		102.20%

The clear effect here is that the 2% increase in CPI is
directly reflected in the final wallclock time.

(The other interesting effect is that the more ops are
out of line calls via pvops, the lower the cache access
and miss rates.  Not too surprising, but it suggests that
the non-pvops kernel is over-inlined.  On the flipside,
the branch misses go up correspondingly...)

So, what's the fix?

Paravirt patching turns all the pvops calls into direct calls, so
_spin_lock etc do end up having direct calls.  For example, the compiler
generated code for paravirtualized _spin_lock is:

<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
<_spin_lock+15>:	callq  *0xffffffff805a5b30
<_spin_lock+22>:	retq

The indirect call will get patched to:
<_spin_lock+0>:		mov    %gs:0xb4c8,%rax
<_spin_lock+9>:		incl   0xffffffffffffe044(%rax)
<_spin_lock+15>:	callq <__ticket_spin_lock>
<_spin_lock+20>:	nop; nop		/* or whatever 2-byte nop */
<_spin_lock+22>:	retq

One possibility is to inline _spin_lock, etc, when building an
optimised kernel (ie, when there's no spinlock/preempt
instrumentation/debugging enabled).  That will remove the outer
call/return pair, returning the instruction stream to a single
call/return, which will presumably execute the same as the non-pvops
case.  The downsides arel 1) it will replicate the
preempt_disable/enable code at eack lock/unlock callsite; this code is
fairly small, but not nothing; and 2) the spinlock definitions are
already a very heavily tangled mass of #ifdefs and other preprocessor
magic, and making any changes will be non-trivial.

The other obvious answer is to disable pv-spinlocks.  Making them a
separate config option is fairly easy, and it would be trivial to
enable them only when Xen is enabled (as the only non-default user).
But it doesn't really address the common case of a distro build which
is going to have Xen support enabled, and leaves the open question of
whether the native performance cost of pv-spinlocks is worth the
performance improvement on a loaded Xen system (10% saving of overall
system CPU when guests block rather than spin).  Still it is a
reasonable short-term workaround.

[ Impact: fix pvops performance regression when running native ]

Analysed-by: "Xin Xiaohui" <xiaohui.xin@intel.com>
Analysed-by: "Li Xin" <xin.li@intel.com>
Analysed-by: "Nakajima Jun" <jun.nakajima@intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Xen-devel <xen-devel@lists.xensource.com>
LKML-Reference: <4A0B62F7.5030802@goop.org>
[ fixed the help text ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 13 +++++++++++++
 arch/x86/include/asm/paravirt.h |  2 +-
 arch/x86/include/asm/spinlock.h |  4 ++--
 arch/x86/kernel/Makefile        |  3 ++-
 arch/x86/kernel/paravirt.c      |  2 ++
 arch/x86/xen/Makefile           |  5 +++--
 arch/x86/xen/xen-ops.h          | 19 +++++++++++++++----
 7 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee14..a6efe0a2e9ae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -498,6 +498,19 @@ config PARAVIRT
 	  over full virtualization.  However, when run without a hypervisor
 	  the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_SPINLOCKS
+	bool "Paravirtualization layer for spinlocks"
+	depends on PARAVIRT && SMP && EXPERIMENTAL
+	---help---
+	  Paravirtualized spinlocks allow a pvops backend to replace the
+	  spinlock implementation with something virtualization-friendly
+	  (for example, block the virtual CPU rather than spinning).
+
+	  Unfortunately the downside is an up to 5% performance hit on
+	  native kernels, with various workloads.
+
+	  If you are unsure how to answer this question, answer N.
+
 config PARAVIRT_CLOCK
 	bool
 	default n
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 378e3691c08c..a53da004e08e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1443,7 +1443,7 @@ u64 _paravirt_ident_64(u64);
 
 #define paravirt_nop	((void *)_paravirt_nop)
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
 static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
 {
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index e5e6caffec87..b7e5db876399 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -172,7 +172,7 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
 	return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
 }
 
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
 
 static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
 {
@@ -206,7 +206,7 @@ static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
 	__raw_spin_lock(lock);
 }
 
-#endif
+#endif	/* CONFIG_PARAVIRT_SPINLOCKS */
 
 static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
 {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 145cce75cda7..88d1bfc847d3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -89,7 +89,8 @@ obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8e45f4464880..9faf43bea336 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type)
 		.pv_irq_ops = pv_irq_ops,
 		.pv_apic_ops = pv_apic_ops,
 		.pv_mmu_ops = pv_mmu_ops,
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
 		.pv_lock_ops = pv_lock_ops,
+#endif
 	};
 	return *((void **)&tmpl + type);
 }
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3b767d03fd6a..172438f86a02 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -9,5 +9,6 @@ obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
 			grant-table.o suspend.o
 
-obj-$(CONFIG_SMP)		+= smp.o spinlock.o
-obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
+obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 20139464943c..ca6596b05d53 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -62,15 +62,26 @@ void xen_setup_vcpu_info_placement(void);
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
 
-void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
-void xen_uninit_lock_cpu(int cpu);
-
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
 #endif
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+#else
+static inline void xen_init_spinlocks(void)
+{
+}
+static inline void xen_init_lock_cpu(int cpu)
+{
+}
+static inline void xen_uninit_lock_cpu(int cpu)
+{
+}
+#endif
 
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
-- 
cgit v1.2.3-55-g7522


From 88dff4936c0a5fa53080cca68dc963a8a2a674b0 Mon Sep 17 00:00:00 2001
From: Zhang Rui
Date: Fri, 22 May 2009 11:35:50 +0800
Subject: x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot

x86: DMI match for the Sony VGN-Z540N as it needs BIOS reboot,
see:

  http://bugzilla.kernel.org/show_bug.cgi?id=12901

[ Impact: fix hung reboot on certain systems ]

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <1242963350.32574.53.camel@rzhang-dt>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1340dad417f4..667188e0b5a0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -232,6 +232,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
 		},
 	},
+	{	/* Handle problems with rebooting on Sony VGN-Z540N */
+		.callback = set_bios_reboot,
+		.ident = "Sony VGN-Z540N",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v1.2.3-55-g7522


From 0c752a93353d9b17dbe148312d732fbe06d235e1 Mon Sep 17 00:00:00 2001
From: Suresh Siddha
Date: Fri, 22 May 2009 12:17:45 -0700
Subject: x86: introduce noxsave boot parameter

Introduce "noxsave" boot parameter which will disable the cpu's xsave/xrstor
capabilities. Useful for debugging and working around xsave related issues.

[ Impact: make it possible to debug problems in the field ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/kernel-parameters.txt | 4 ++++
 arch/x86/kernel/cpu/common.c        | 7 +++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75..fd5cac013037 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1535,6 +1535,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			register save and restore. The kernel will only save
 			legacy floating-point registers on task switch.
 
+	noxsave		[BUGS=X86] Disables x86 extended register state save
+			and restore using xsave. The kernel will fallback to
+			enabling legacy floating-point and sse state.
+
 	nohlt		[BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
 			wfi(ARM) instruction doesn't work correctly and not to
 			use it. This is also useful when using JTAG debugger.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e62..77848d9fca68 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -114,6 +114,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
+static int __init x86_xsave_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-- 
cgit v1.2.3-55-g7522


From 71c9d8b68b299bef614afc7907393564a9f1476f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 25 May 2009 12:01:59 +0900
Subject: x86: Remove remap percpu allocator for the time being

Remap percpu allocator has subtle bug when combined with page
attribute changing.  Remap percpu allocator aliases PMD pages for the
first chunk and as pageattr doesn't know about the alias it ends up
updating page attributes of the original mapping thus leaving the
alises in inconsistent state which might lead to subtle data
corruption.  Please read the following threads for more information:

  http://thread.gmane.org/gmane.linux.kernel/835783

The following is the proposed fix which teaches pageattr about percpu
aliases.

  http://thread.gmane.org/gmane.linux.kernel/837157

However, the above changes are deemed too pervasive for upstream
inclusion for 2.6.30 release, so this patch essentially disables
the remap allocator for the time being.

Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <4A1A0A27.4050301@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4cf1872..8f0e13be36b3 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
 	/*
 	 * If large page isn't supported, there's no benefit in doing
 	 * this.  Also, on non-NUMA, embedding is better.
+	 *
+	 * NOTE: disabled for now.
 	 */
-	if (!cpu_has_pse || !pcpu_need_numa())
+	if (true || !cpu_has_pse || !pcpu_need_numa())
 		return -EINVAL;
 
 	/*
-- 
cgit v1.2.3-55-g7522


From 4319503779060120fa5de9b8fde056603bb6e0fd Mon Sep 17 00:00:00 2001
From: Jarod Wilson
Date: Fri, 6 Mar 2009 20:24:57 +0000
Subject: [CPUFREQ] add atom family to p4-clockmod

Some atom procs don't do freq scaling (such as the atom 330 on my own
littlefalls2 board). By adding the atom family here, we at least get
the benefit of passive cooling in a thermal emergency. Not sure how
to see that its actually helping any, but the driver does bind and
claim its functioning on my atom 330.

Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 6ac55bd341ae..869615193720 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 		case 0x0E: /* Core */
 		case 0x0F: /* Core Duo */
 		case 0x16: /* Celeron Core */
+		case 0x1C: /* Atom */
 			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
 			return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
 		case 0x0D: /* Pentium M (Dothan) */
-- 
cgit v1.2.3-55-g7522


From d38e73e8dad454a5916f446b0d3523c1161ae95a Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Thu, 23 Apr 2009 13:36:12 -0400
Subject: [CPUFREQ] powernow-k7 build fix when ACPI=n

arch/x86/kernel/cpu/cpufreq/powernow-k7.c:172: warning: 'invalidate_entry' defined but not used

Reported-by: Toralf Förster <toralf.foerster@gmx.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 3c28ccd49742..a8363e5be4ef 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -168,10 +168,12 @@ static int check_powernow(void)
 	return 1;
 }
 
+#ifdef CONFIG_X86_POWERNOW_K7_ACPI
 static void invalidate_entry(unsigned int entry)
 {
 	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
 }
+#endif
 
 static int get_ranges(unsigned char *pst)
 {
-- 
cgit v1.2.3-55-g7522


From df1829770db415dc5a5ed5ada3bd70176c6f0a01 Mon Sep 17 00:00:00 2001
From: Thomas Renninger
Date: Wed, 22 Apr 2009 13:48:32 +0200
Subject: [CPUFREQ] powernow-k8 cleanup msg if BIOS does not export ACPI _PSS
 cpufreq data

- Make the message shorter and easier to grep for
- Use printk_once instead of WARN_ONCE (functionality of these was mixed)

Signed-off-by: Thomas Renninger <trenn@suse.de>
Cc: Langsdorf, Mark <mark.langsdorf@amd.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4709ead2db52..feef10c085a1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1215,13 +1215,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
 	return cpufreq_frequency_table_verify(pol, data->powernow_table);
 }
 
+static const char ACPI_PSS_BIOS_BUG_MSG[] =
+	KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+	KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
+
 /* per CPU init entry point to the driver */
 static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 {
 	struct powernow_k8_data *data;
 	cpumask_t oldmask;
 	int rc;
-	static int print_once;
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1244,19 +1247,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		 * an UP version, and is deprecated by AMD.
 		 */
 		if (num_online_cpus() != 1) {
-			/*
-			 * Replace this one with print_once as soon as such a
-			 * thing gets introduced
-			 */
-			if (!print_once) {
-				WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
-					"does not provide ACPI _PSS objects "
-					"in a way that Linux understands. "
-					"Please report this to the Linux ACPI"
-					" maintainers and complain to your "
-					"BIOS vendor.\n");
-				print_once++;
-			}
+			printk_once(ACPI_PSS_BIOS_BUG_MSG);
 			goto err_out;
 		}
 		if (pol->cpu != 0) {
-- 
cgit v1.2.3-55-g7522


From ca446d06351992e4f1a7c1e5e99870ab4ec5188f Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Wed, 22 Apr 2009 13:48:33 +0200
Subject: [CPUFREQ] powernow-k8: determine exact CPU frequency for HW Pstates

Slightly modified by trenn@suse.de -> only do this on fam 10h and fam 11h.

Currently powernow-k8 determines CPU frequency from ACPI PSS objects, but
according to AMD family 11h BKDG this frequency is just a rounded value:

  "CoreFreq (MHz) = The CPU COF specified by MSRC001_00[6B:64][CpuFid]
  rounded to the nearest 100 Mhz."

As a consequnce powernow-k8 reports wrong CPU frequency on some systems,
e.g. on Turion X2 Ultra:

  powernow-k8: Found 1 AMD Turion(tm)X2 Ultra DualCore Mobile ZM-82
               processors (2 cpu cores) (version 2.20.00)
  powernow-k8:    0 : pstate 0 (2200 MHz)
  powernow-k8:    1 : pstate 1 (1100 MHz)
  powernow-k8:    2 : pstate 2 (600 MHz)

But this is wrong as frequency for Pstate2 is 550 MHz. x86info reports it
correctly:

  #x86info -a |grep Pstate
  ...
  Pstate-0: fid=e, did=0, vid=24 (2200MHz)
  Pstate-1: fid=e, did=1, vid=30 (1100MHz)
  Pstate-2: fid=e, did=2, vid=3c (550MHz) (current)

Solution is to determine the frequency directly from Pstate MSRs instead
of using rounded values from ACPI table.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index feef10c085a1..f6b32d112357 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data)
 				data->batps);
 }
 
+static u32 freq_from_fid_did(u32 fid, u32 did)
+{
+	u32 mhz = 0;
+
+	if (boot_cpu_data.x86 == 0x10)
+		mhz = (100 * (fid + 0x10)) >> did;
+	else if (boot_cpu_data.x86 == 0x11)
+		mhz = (100 * (fid + 8)) >> did;
+	else
+		BUG();
+
+	return mhz * 1000;
+}
+
 static int fill_powernow_table(struct powernow_k8_data *data,
 		struct pst_s *pst, u8 maxvid)
 {
@@ -923,8 +937,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 
 		powernow_table[i].index = index;
 
-		powernow_table[i].frequency =
-			data->acpi_data.states[i].core_frequency * 1000;
+		/* Frequency may be rounded for these */
+		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+			powernow_table[i].frequency =
+				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
+		} else
+			powernow_table[i].frequency =
+				data->acpi_data.states[i].core_frequency * 1000;
 	}
 	return 0;
 }
-- 
cgit v1.2.3-55-g7522


From 61c8c67e3ad67ea1d1360f2e88688bd942834756 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Tue, 26 May 2009 14:58:39 -0700
Subject: acpi-cpufreq: fix printk typo and indentation

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 208ecf6643df..54b6de2cd947 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -693,8 +693,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
 	    policy->cpuinfo.transition_latency > 20 * 1000) {
 		policy->cpuinfo.transition_latency = 20 * 1000;
-			printk_once(KERN_INFO "Capping off P-state tranision"
-				    " latency at 20 uS\n");
+		printk_once(KERN_INFO
+			    "P-state transition latency capped at 20 uS\n");
 	}
 
 	/* table init */
-- 
cgit v1.2.3-55-g7522